@kernlang/agon-dedup 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +88 -0
- package/classifier.py +134 -0
- package/history-search.py +139 -0
- package/package.json +29 -0
- package/requirements.txt +7 -0
- package/sidecar.py +145 -0
- package/syntax-validator.py +331 -0
package/README.md
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# @kernlang/agon-dedup
|
|
2
|
+
|
|
3
|
+
Brainstorm dedup sidecar — collapses near-duplicate engine drafts so you don't read the same idea three times.
|
|
4
|
+
|
|
5
|
+
## Why Python
|
|
6
|
+
|
|
7
|
+
- Semantic similarity needs sentence embeddings, not bag-of-words. TF-IDF was tried first and scored 0.06 between drafts that say the same thing in different words. Useless for this task.
|
|
8
|
+
- `fastembed` (ONNX-based, ~30MB) gives proper paraphrase detection at ~500ms cold start without dragging in torch.
|
|
9
|
+
- TS has no comparable library — `transformers.js` works but the WASM cold-start is multi-second and the model is 80MB+ on first download.
|
|
10
|
+
|
|
11
|
+
This is the boundary where Python is *actually better*, not just different.
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
python3 -m pip install --user -r packages/dedup/requirements.txt
|
|
17
|
+
# or:
|
|
18
|
+
npm run install:python -w packages/dedup
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Model is downloaded once on first run (~80MB) and cached under `~/.cache/fastembed/`.
|
|
22
|
+
|
|
23
|
+
## Use
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
# JSONL on stdin, JSON on stdout
|
|
27
|
+
echo '{"id":"claude","text":"Ship A"}
|
|
28
|
+
{"id":"codex","text":"Pick option A"}
|
|
29
|
+
{"id":"gemini","text":"Hold off entirely"}' | python3 packages/dedup/sidecar.py
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Output:
|
|
33
|
+
|
|
34
|
+
```json
|
|
35
|
+
{
|
|
36
|
+
"groups": [
|
|
37
|
+
{"members": ["claude", "codex"], "representative": "codex", "similarity": 0.7},
|
|
38
|
+
{"members": ["gemini"], "representative": "gemini", "similarity": 1.0}
|
|
39
|
+
],
|
|
40
|
+
"threshold": 0.55,
|
|
41
|
+
"method": "minilm-cosine"
|
|
42
|
+
}
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Test
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
npm run test:sidecar -w packages/dedup
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
The smoke test feeds three drafts (two paraphrases + one dissent) and asserts the paraphrases group while the dissent stays alone.
|
|
52
|
+
|
|
53
|
+
## Tuning
|
|
54
|
+
|
|
55
|
+
`THRESHOLD = 0.55` in `sidecar.py`. Calibration:
|
|
56
|
+
|
|
57
|
+
| Pair | MiniLM cosine |
|
|
58
|
+
| ------------------------------------- | ------------- |
|
|
59
|
+
| Identical text | 1.0 |
|
|
60
|
+
| Same idea, different wording | 0.7 – 0.9 |
|
|
61
|
+
| Related topic | 0.4 – 0.7 |
|
|
62
|
+
| Unrelated | < 0.3 |
|
|
63
|
+
|
|
64
|
+
Lower = more aggressive merging (risk: collapsing real disagreement). Higher = more conservative (risk: missing obvious paraphrases).
|
|
65
|
+
|
|
66
|
+
## Spawn model
|
|
67
|
+
|
|
68
|
+
Per-call subprocess. Agon spawns `python3 sidecar.py` only when it has drafts to dedupe, matching how engine adapters spawn their CLIs. Cold-start is the model load (~500ms). For 6 drafts the full call lands in well under 2s — negligible against an 8-12 minute brainstorm.
|
|
69
|
+
|
|
70
|
+
## Status
|
|
71
|
+
|
|
72
|
+
- [x] Sidecar built and smoke-tested
|
|
73
|
+
- [x] Workspace registered
|
|
74
|
+
- [x] Wired into `runBrainstorm` (`packages/forge/src/kern/dedup-bridge.kern` spawns the sidecar; result attached as `BrainstormResult.groups`)
|
|
75
|
+
- [x] CLI `agon brainstorm` shows `(N engines agree)` tag in the bids table
|
|
76
|
+
- [ ] Integration test against a real `agon brainstorm` call (manual for now — brainstorms run 8-12 min with 6 engines)
|
|
77
|
+
- [ ] Optional: cache embeddings per session so identical drafts across re-runs reuse vectors
|
|
78
|
+
|
|
79
|
+
## Failure modes (graceful)
|
|
80
|
+
|
|
81
|
+
The bridge in `packages/forge` returns `null` and the CLI falls back to the un-deduped bids table when:
|
|
82
|
+
|
|
83
|
+
- `python3` is missing (or `AGON_PYTHON` env var points at something broken)
|
|
84
|
+
- `packages/dedup/sidecar.py` is missing (e.g., production install only shipped TS)
|
|
85
|
+
- `fastembed` not installed (`exit 2`) — a one-line warning prints with the install command
|
|
86
|
+
- Sidecar emits malformed JSON
|
|
87
|
+
|
|
88
|
+
In every case agon brainstorm still completes — dedup is purely additive.
|
package/classifier.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Task classifier sidecar — zero-shot classification via fastembed.
|
|
4
|
+
|
|
5
|
+
Fed when the regex classifier in `task-classifier.kern` falls through to
|
|
6
|
+
'other'. Embeds the input text, embeds each candidate label's description,
|
|
7
|
+
returns the label with highest cosine similarity.
|
|
8
|
+
|
|
9
|
+
Protocol:
|
|
10
|
+
stdin — single JSON line: {"text": "<task description>"}
|
|
11
|
+
stdout — single JSON: {"class": "<TaskClass>",
|
|
12
|
+
"confidence": 0.74,
|
|
13
|
+
"scores": {"feature": 0.74, "bugfix": 0.31, ...}}
|
|
14
|
+
|
|
15
|
+
Returns 'other' if no class scores above MIN_CONFIDENCE, or if the top class does not beat the runner-up by at least MARGIN.
|
|
16
|
+
|
|
17
|
+
Exit codes:
|
|
18
|
+
0 — success
|
|
19
|
+
1 — bad input
|
|
20
|
+
2 — fastembed not installed
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import json
|
|
26
|
+
import sys
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Lowest cosine similarity the best label may have and still be committed;
# anything below it is reported as 'other'.
MIN_CONFIDENCE = 0.10
MARGIN = 0.05  # top class must beat #2 by this much to commit; otherwise 'other'
# Model identifier handed to fastembed's TextEmbedding.
MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Few-shot examples per label. We embed concrete task phrasings rather than
# abstract definitions because all-MiniLM scores concrete-vs-concrete cosine
# much higher than concrete-vs-abstract. Empirically this lifts true-positive
# rates on Agon's actual prompt distribution from ~30% to ~85%.
# Each value is one string of several example phrasings; the whole string is
# embedded as a single text per label.
LABELS: dict[str, str] = {
    "docs": ("Update the README. Document the API surface. Add comments "
             "explaining the algorithm. Write a changelog entry. "
             "Rationale for the migration. Update the docs."),
    "test": ("Add a unit test. Cover the edge case with assertions. "
             "Write integration tests for the auth flow. Increase test "
             "coverage. Add fixtures. Snapshot test."),
    "bugfix": ("Fix the off-by-one error. Why is the cache evicting? "
               "Resolve the crash on startup. Patch the regression. "
               "Off-by-one. Race condition. Memory leak. Stuck process. "
               "Broken behavior. Unexpected output."),
    "refactor": ("Rename across the codebase. Extract the helper. Simplify "
                 "this method. Reorganize the file structure. Clean up dead "
                 "code. Move modules. Restructure without changing behavior."),
    "algorithm": ("Implement Glicko-2 ratings. Compute the rolling median. "
                  "Optimize the sort. Score the engines. Calculate confidence "
                  "intervals. Compute distances. Numerical computation. "
                  "Data structure. Sorting algorithm."),
    "feature": ("Add support for streaming. Build a streaming JSON parser. "
                "Create a new dashboard. Implement a new API endpoint. "
                "Build a CLI command. Add a new capability. Ship a new "
                "feature. Extend the engine adapter."),
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _read_input() -> str:
    """Read the task description to classify from stdin.

    Expects a single JSON object of the form ``{"text": "<task description>"}``.

    Returns:
        The ``text`` value with surrounding whitespace removed.

    Raises:
        SystemExit: with code 1, after printing a diagnostic to stderr, when
            stdin is empty, is not valid JSON, is not an object containing
            ``text``, or when ``text`` is not a non-blank string.
    """
    raw = sys.stdin.read().strip()
    if not raw:
        print("classifier-sidecar: empty stdin", file=sys.stderr)
        sys.exit(1)
    try:
        obj = json.loads(raw)
    except json.JSONDecodeError as err:
        print(f"classifier-sidecar: invalid JSON: {err}", file=sys.stderr)
        sys.exit(1)
    if not isinstance(obj, dict) or "text" not in obj:
        print("classifier-sidecar: missing 'text' field", file=sys.stderr)
        sys.exit(1)
    value = obj["text"]
    # Reject non-strings instead of coercing: str(None) is "None", which
    # would be classified as if the caller had really asked about "None".
    if not isinstance(value, str):
        print("classifier-sidecar: 'text' must be a string "
              f"(got {type(value).__name__})",
              file=sys.stderr)
        sys.exit(1)
    text = value.strip()
    if not text:
        print("classifier-sidecar: 'text' is empty", file=sys.stderr)
        sys.exit(1)
    return text
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _classify(text: str) -> dict:
    """Score ``text`` against every label in LABELS and pick a class.

    The input and each label's example string are embedded together,
    L2-normalised, and compared by dot product (cosine similarity).

    Returns:
        A dict with ``class`` (the best label, or ``"other"`` when the best
        score is below MIN_CONFIDENCE or leads the runner-up by less than
        MARGIN), ``confidence`` (best score), ``margin`` (lead over the
        runner-up) and ``scores`` (per-label similarity); all numbers are
        rounded to three decimals.

    Raises:
        SystemExit: with code 2 when fastembed or numpy cannot be imported.
    """
    try:
        from fastembed import TextEmbedding
        import numpy as np
    except ImportError:
        print("classifier-sidecar: fastembed not installed — install via "
              "`pip install -r packages/dedup/requirements.txt`",
              file=sys.stderr)
        sys.exit(2)

    model = TextEmbedding(MODEL)
    names = list(LABELS.keys())
    examples = [LABELS[name] for name in names]
    # Row 0 is the input text; rows 1.. follow the order of `names`.
    vectors = np.array(list(model.embed([text] + examples)))

    lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Divide by 1 instead of 0 so an all-zero vector stays all-zero.
    unit = vectors / np.where(lengths == 0, 1, lengths)
    sims = unit[1:] @ unit[0]

    scores = {}
    for position, name in enumerate(names):
        scores[name] = round(float(sims[position]), 3)

    ranking = sorted(range(len(sims)), key=lambda idx: -sims[idx])
    top, runner_up = ranking[0], ranking[1]
    best_score = float(sims[top])
    margin = best_score - float(sims[runner_up])

    too_weak = best_score < MIN_CONFIDENCE or margin < MARGIN
    chosen = "other" if too_weak else names[top]

    return {
        "class": chosen,
        "confidence": round(best_score, 3),
        "margin": round(margin, 3),
        "scores": scores,
    }
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def main() -> None:
    """Classify the task read from stdin and print the verdict as one JSON line."""
    verdict = _classify(_read_input())
    sys.stdout.write(json.dumps(verdict))
    sys.stdout.write("\n")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
History search sidecar — semantic ranking of forge run manifests by query.
|
|
4
|
+
|
|
5
|
+
Reuses the MiniLM tax already paid by sidecar.py / classifier.py — same
|
|
6
|
+
fastembed model, same ~30MB on disk, same ~500ms cold / ~50ms warm.
|
|
7
|
+
|
|
8
|
+
Why Python: TS has no production-grade local embedding runtime. The dedup
|
|
9
|
+
sidecar already proved MiniLM/cosine beats substring grep (0.06 → 0.83+ on
|
|
10
|
+
paraphrases). The current `agon history` lookup is exact substring on the
|
|
11
|
+
forgeId, so paraphrased queries ("the SaaS API thing" vs "FastAPI shim")
|
|
12
|
+
get nothing back. Cosine over MiniLM fixes that.
|
|
13
|
+
|
|
14
|
+
Protocol:
|
|
15
|
+
stdin — single JSON:
|
|
16
|
+
{
|
|
17
|
+
"query": "<search text>",
|
|
18
|
+
"items": [{"id": "<runId>", "text": "<task + fitnessCmd + ...>"}, ...],
|
|
19
|
+
"top_k": 10 // optional, default 10
|
|
20
|
+
}
|
|
21
|
+
stdout — single JSON:
|
|
22
|
+
{
|
|
23
|
+
"results": [{"id": "<runId>", "similarity": 0.82}, ...],
|
|
24
|
+
"method": "minilm-cosine",
|
|
25
|
+
"model": "sentence-transformers/all-MiniLM-L6-v2"
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
Exit codes:
|
|
29
|
+
0 — success
|
|
30
|
+
1 — bad input (malformed JSON, missing fields, empty)
|
|
31
|
+
2 — fastembed not installed (caller should fall back to chronological)
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
import json
|
|
37
|
+
import sys
|
|
38
|
+
from typing import Any
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Model identifier handed to fastembed; also echoed in the "model" output field.
MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Result count used when the request omits "top_k" or supplies an invalid one.
DEFAULT_TOP_K = 10
MIN_SIMILARITY = 0.15  # below this, the match is effectively noise — drop it
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _read_input() -> dict[str, Any]:
    """Read and validate the search request from stdin.

    Expects one JSON object with ``query`` (non-empty string), ``items``
    (non-empty array of objects carrying ``id`` and ``text``) and an
    optional ``top_k``.

    Returns:
        ``{"query": str, "items": [{"id": str, "text": str}, ...],
        "top_k": int}``. The query and item texts are stripped, ids are
        stringified, and items whose text is blank or JSON ``null`` are
        dropped. ``top_k`` falls back to DEFAULT_TOP_K unless it is a
        positive integer.

    Raises:
        SystemExit: with code 1, after printing a diagnostic to stderr, on
            empty or malformed input, or when no item has usable text.
    """
    raw = sys.stdin.read().strip()
    if not raw:
        print("history-search: empty stdin", file=sys.stderr)
        sys.exit(1)
    try:
        obj = json.loads(raw)
    except json.JSONDecodeError as err:
        print(f"history-search: invalid JSON: {err}", file=sys.stderr)
        sys.exit(1)
    if not isinstance(obj, dict):
        print("history-search: input must be a JSON object", file=sys.stderr)
        sys.exit(1)
    query = obj.get("query")
    items = obj.get("items")
    if not isinstance(query, str) or not query.strip():
        print("history-search: 'query' must be a non-empty string",
              file=sys.stderr)
        sys.exit(1)
    if not isinstance(items, list) or not items:
        print("history-search: 'items' must be a non-empty array",
              file=sys.stderr)
        sys.exit(1)
    normalized: list[dict[str, str]] = []
    for i, raw_item in enumerate(items):
        if (not isinstance(raw_item, dict)
                or "id" not in raw_item or "text" not in raw_item):
            print(f"history-search: item[{i}] missing 'id' or 'text'",
                  file=sys.stderr)
            sys.exit(1)
        raw_text = raw_item["text"]
        # JSON null means "no description". str(None) would embed the
        # literal word "None", so treat it exactly like empty text.
        text = "" if raw_text is None else str(raw_text).strip()
        if not text:
            # Skip empty-text items rather than fail — manifests without a
            # task description shouldn't poison the whole query.
            continue
        normalized.append({"id": str(raw_item["id"]), "text": text})
    if not normalized:
        print("history-search: no items with non-empty text", file=sys.stderr)
        sys.exit(1)
    top_k = obj.get("top_k", DEFAULT_TOP_K)
    # bool is a subclass of int, so JSON `true` would otherwise pass the
    # integer check and silently truncate the results to a single hit.
    if isinstance(top_k, bool) or not isinstance(top_k, int) or top_k <= 0:
        top_k = DEFAULT_TOP_K
    return {"query": query.strip(), "items": normalized, "top_k": top_k}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _rank(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Rank the payload's items by cosine similarity to its query.

    Args:
        payload: dict with ``query``, ``items`` (each carrying ``id`` and
            ``text``) and ``top_k``, as produced by ``_read_input``.

    Returns:
        At most ``top_k`` dicts of ``{"id", "similarity"}``, best match
        first. Items scoring below MIN_SIMILARITY are omitted and the
        similarity is rounded to three decimals.

    Raises:
        SystemExit: with code 2 when fastembed or numpy cannot be imported.
    """
    try:
        from fastembed import TextEmbedding
        import numpy as np
    except ImportError:
        print("history-search: fastembed not installed — install via "
              "`pip install -r packages/dedup/requirements.txt`",
              file=sys.stderr)
        sys.exit(2)

    query: str = payload["query"]
    items: list[dict[str, str]] = payload["items"]
    top_k: int = payload["top_k"]

    model = TextEmbedding(MODEL)
    # Row 0 is the query; rows 1.. line up with `items`.
    corpus = [query]
    corpus.extend(entry["text"] for entry in items)
    vectors = np.array(list(model.embed(corpus)))

    lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Divide by 1 instead of 0 so an all-zero vector stays all-zero.
    unit = vectors / np.where(lengths == 0, 1, lengths)
    sims = unit[1:] @ unit[0]  # shape: (n_items,)

    hits = []
    for position, entry in enumerate(items):
        score = float(sims[position])
        if score >= MIN_SIMILARITY:
            hits.append({"id": entry["id"], "similarity": score})
    hits.sort(key=lambda hit: -hit["similarity"])

    # Round only the survivors, after ranking on the full-precision scores.
    return [
        {"id": hit["id"], "similarity": round(hit["similarity"], 3)}
        for hit in hits[:top_k]
    ]
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def main() -> None:
    """Rank the items read from stdin and print the response as one JSON line."""
    ranked = _rank(_read_input())
    response = {"results": ranked, "method": "minilm-cosine", "model": MODEL}
    sys.stdout.write(json.dumps(response))
    sys.stdout.write("\n")


if __name__ == "__main__":
    main()
|
package/package.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@kernlang/agon-dedup",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Python sidecars for Agon AI — semantic embeddings (fastembed/MiniLM) and tree-sitter syntax validation. Bridged from KERN via stdin/stdout JSON. Ships .py files that the @kernlang/agon-core bridges spawn at runtime.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"files": [
|
|
7
|
+
"*.py",
|
|
8
|
+
"requirements.txt",
|
|
9
|
+
"README.md"
|
|
10
|
+
],
|
|
11
|
+
"publishConfig": {
|
|
12
|
+
"access": "public"
|
|
13
|
+
},
|
|
14
|
+
"scripts": {
|
|
15
|
+
"install:python": "python3 -m pip install --user -r requirements.txt",
|
|
16
|
+
"test:sidecar": "node tests/smoke.mjs",
|
|
17
|
+
"test:classifier": "node tests/classifier-smoke.mjs",
|
|
18
|
+
"test:history-search": "node tests/history-search-smoke.mjs",
|
|
19
|
+
"test:syntax-validator": "node tests/syntax-validator-smoke.mjs",
|
|
20
|
+
"test:syntax-validator-bridge": "node tests/syntax-validator-bridge-smoke.mjs",
|
|
21
|
+
"test:all-sidecars": "npm run test:sidecar && npm run test:classifier && npm run test:history-search && npm run test:syntax-validator && npm run test:syntax-validator-bridge"
|
|
22
|
+
},
|
|
23
|
+
"repository": {
|
|
24
|
+
"type": "git",
|
|
25
|
+
"url": "git+https://github.com/KERNlang/agon.git",
|
|
26
|
+
"directory": "packages/dedup"
|
|
27
|
+
},
|
|
28
|
+
"license": "MIT"
|
|
29
|
+
}
|
package/requirements.txt
ADDED
package/sidecar.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Brainstorm dedup sidecar — clusters near-duplicate engine drafts.
|
|
4
|
+
|
|
5
|
+
Uses sentence embeddings (MiniLM via fastembed/ONNX, ~30MB) so paraphrases
|
|
6
|
+
group together. TF-IDF was tried first and scored 0.06 between drafts that
|
|
7
|
+
say the same thing in different words — useless for this task.
|
|
8
|
+
|
|
9
|
+
Protocol:
|
|
10
|
+
stdin — JSONL, one per line: {"id": "<engineId>", "text": "<draft>"}
|
|
11
|
+
stdout — single JSON:
|
|
12
|
+
{
|
|
13
|
+
"groups": [
|
|
14
|
+
{"members": ["claude", "codex"], "representative": "claude",
|
|
15
|
+
"similarity": 0.83},
|
|
16
|
+
...
|
|
17
|
+
],
|
|
18
|
+
"threshold": 0.55,
|
|
19
|
+
"method": "minilm-cosine"
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
Engines are grouped transitively (single-linkage): two engines are linked when the cosine similarity of their MiniLM
|
|
23
|
+
embeddings >= threshold. Representative = engine with the longest text in
|
|
24
|
+
the group (most detail). Singleton groups are emitted too.
|
|
25
|
+
|
|
26
|
+
Exit codes:
|
|
27
|
+
0 — success
|
|
28
|
+
1 — bad input (malformed JSON, no items)
|
|
29
|
+
2 — fastembed not installed (caller should fall back to no-dedup)
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import json
|
|
35
|
+
import sys
|
|
36
|
+
from typing import Any
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Cosine similarity at or above which two drafts are linked into one group.
THRESHOLD = 0.55
# Model identifier handed to fastembed's TextEmbedding.
MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _read_input() -> list[dict[str, str]]:
    """Read engine drafts from stdin, one JSON object per line (JSONL).

    Each non-blank line must be an object with ``id`` and ``text``. Blank
    lines are ignored.

    Returns:
        ``[{"id": str, "text": str}, ...]`` in input order; ids are
        stringified and texts are kept verbatim. The list is empty when
        stdin holds no non-blank lines.

    Raises:
        SystemExit: with code 1, after printing a diagnostic to stderr, when
            a line is not valid JSON, lacks ``id``/``text``, or carries a
            non-string ``text``.
    """
    items: list[dict[str, str]] = []
    for line_num, raw_line in enumerate(sys.stdin, 1):
        line = raw_line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as err:
            print(f"dedup-sidecar: line {line_num} is not valid JSON: {err}",
                  file=sys.stderr)
            sys.exit(1)
        if not isinstance(obj, dict) or "id" not in obj or "text" not in obj:
            print(f"dedup-sidecar: line {line_num} missing 'id' or 'text'",
                  file=sys.stderr)
            sys.exit(1)
        text = obj["text"]
        # Reject non-strings instead of coercing: str(None) is "None", which
        # would be embedded and clustered as if it were a real draft.
        if not isinstance(text, str):
            print(f"dedup-sidecar: line {line_num} 'text' must be a string "
                  f"(got {type(text).__name__})",
                  file=sys.stderr)
            sys.exit(1)
        items.append({"id": str(obj["id"]), "text": text})
    return items
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _cluster(items: list[dict[str, str]]) -> list[dict[str, Any]]:
    """Group drafts whose embeddings are near-duplicates.

    Every pair of drafts with cosine similarity >= THRESHOLD is linked, and
    groups are the connected components of those links (union-find). Linking
    is therefore transitive: A and C share a group when both are linked to
    B, even if A and C themselves score below the threshold.

    Args:
        items: drafts as ``{"id": str, "text": str}`` dicts.

    Returns:
        One dict per group with ``members`` (ids in input order),
        ``representative`` (id of the member with the longest text; the
        earliest wins a tie) and ``similarity`` (mean pairwise cosine of the
        members rounded to three decimals, or 1.0 for a singleton). Groups
        are ordered largest first, then by representative id. An empty
        ``items`` yields an empty list.

    Raises:
        SystemExit: with code 2 when two or more drafts need embedding and
            fastembed or numpy cannot be imported.
    """
    if not items:
        # Nothing to group; the similarity math below assumes at least one row.
        return []
    if len(items) == 1:
        # A lone draft needs no embedding, so this works without fastembed.
        return [{
            "members": [items[0]["id"]],
            "representative": items[0]["id"],
            "similarity": 1.0,
        }]

    try:
        from fastembed import TextEmbedding
        import numpy as np
    except ImportError:
        print("dedup-sidecar: fastembed not installed — install via "
              "`pip install -r packages/dedup/requirements.txt`",
              file=sys.stderr)
        sys.exit(2)
    from itertools import combinations

    embedder = TextEmbedding(MODEL)
    texts = [item["text"] for item in items]
    embs = np.array(list(embedder.embed(texts)))

    norms = np.linalg.norm(embs, axis=1, keepdims=True)
    # Divide by 1 instead of 0 so an all-zero vector stays all-zero.
    normed = embs / np.where(norms == 0, 1, norms)
    sim = normed @ normed.T  # pairwise cosine similarity matrix

    n = len(items)
    parent = list(range(n))

    def find(i: int) -> int:
        # Path halving: point each visited node at its grandparent.
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i

    def union(i: int, j: int) -> None:
        ri, rj = find(i), find(j)
        if ri != rj:
            parent[ri] = rj

    for i, j in combinations(range(n), 2):
        if sim[i][j] >= THRESHOLD:
            union(i, j)

    clusters: dict[int, list[int]] = {}
    for i in range(n):
        clusters.setdefault(find(i), []).append(i)

    groups = []
    for indices in clusters.values():
        members_items = [items[i] for i in indices]
        rep = max(members_items, key=lambda x: len(x["text"]))
        if len(indices) > 1:
            # `indices` is ascending, so this visits each i < j pair once.
            pair_sims = [float(sim[i][j]) for i, j in combinations(indices, 2)]
            avg_sim = sum(pair_sims) / len(pair_sims)
        else:
            avg_sim = 1.0
        groups.append({
            "members": [item["id"] for item in members_items],
            "representative": rep["id"],
            "similarity": round(avg_sim, 3),
        })

    groups.sort(key=lambda g: (-len(g["members"]), g["representative"]))
    return groups
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def main() -> None:
    """Cluster the drafts read from stdin and print the groups as one JSON line.

    Exits with code 1 when stdin yields no drafts.
    """
    drafts = _read_input()
    if len(drafts) == 0:
        print("dedup-sidecar: no items on stdin", file=sys.stderr)
        sys.exit(1)
    response = {
        "groups": _cluster(drafts),
        "threshold": THRESHOLD,
        "method": "minilm-cosine",
    }
    sys.stdout.write(json.dumps(response))
    sys.stdout.write("\n")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Syntax validator sidecar — parses files via tree-sitter and reports errors.
|
|
4
|
+
|
|
5
|
+
Why Python: tree-sitter has mature grammar packages on PyPI with prebuilt
|
|
6
|
+
wheels (tree-sitter-python, tree-sitter-typescript, tree-sitter-javascript).
|
|
7
|
+
TypeScript has its own tree-sitter bindings via node-tree-sitter, but they
|
|
8
|
+
require native module compilation at install time and add a heavyweight
|
|
9
|
+
dependency to @kernlang/agon-core for a workflow that runs after patch-apply (not
|
|
10
|
+
per keystroke).
|
|
11
|
+
|
|
12
|
+
The current forge `validate` mode in packages/forge/src/generated/stages.ts
|
|
13
|
+
only inspects engine stdout (regex match for "validated"/"looks good"/etc.).
|
|
14
|
+
Nothing actually parses the resulting code. Tree-sitter fills that gap.
|
|
15
|
+
|
|
16
|
+
Protocol:
|
|
17
|
+
stdin — single JSON:
|
|
18
|
+
{
|
|
19
|
+
"files": [
|
|
20
|
+
{"path": "src/foo.ts", "content": "...", "language": "typescript"},
|
|
21
|
+
...
|
|
22
|
+
]
|
|
23
|
+
}
|
|
24
|
+
stdout — single JSON:
|
|
25
|
+
{
|
|
26
|
+
"results": [
|
|
27
|
+
{
|
|
28
|
+
"path": "src/foo.ts",
|
|
29
|
+
"valid": true,
|
|
30
|
+
"language": "typescript",
|
|
31
|
+
"errors": []
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
"path": "src/bad.ts",
|
|
35
|
+
"valid": false,
|
|
36
|
+
"language": "typescript",
|
|
37
|
+
"errors": [
|
|
38
|
+
{"row": 12, "column": 4, "message": "MISSING ;"}
|
|
39
|
+
]
|
|
40
|
+
}
|
|
41
|
+
],
|
|
42
|
+
"method": "tree-sitter",
|
|
43
|
+
"supported_languages": ["typescript", "tsx", "javascript", ...]
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
Languages: typescript, tsx, javascript, jsx, python, json
|
|
47
|
+
(additional names accepted: ts, js, py — normalized internally)
|
|
48
|
+
|
|
49
|
+
Exit codes:
|
|
50
|
+
0 — success
|
|
51
|
+
1 — bad input
|
|
52
|
+
2 — tree-sitter or a grammar package not installed
|
|
53
|
+
3 — at least one file's language is unsupported (still produces output;
|
|
54
|
+
caller can decide what to do with unsupported)
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
from __future__ import annotations
|
|
58
|
+
|
|
59
|
+
import json
|
|
60
|
+
import sys
|
|
61
|
+
from typing import Any
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Maps every accepted language name (short aliases and canonical names alike)
# to its canonical name. A name missing from this table has no canonical form.
LANGUAGE_ALIASES: dict[str, str] = {
    "ts": "typescript",
    "typescript": "typescript",
    "tsx": "tsx",
    "js": "javascript",
    "javascript": "javascript",
    "jsx": "jsx",
    "py": "python",
    "python": "python",
    "json": "json",
}


# Canonical language names the validator recognises.
SUPPORTED_LANGUAGES: frozenset[str] = frozenset({
    "typescript", "tsx", "javascript", "jsx", "python", "json",
})
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _read_input() -> list[dict[str, str]]:
    """Read and validate the ``{"files": [...]}`` request from stdin.

    Returns:
        One ``{"path", "content", "language"}`` dict per input file, with
        the language lower-cased and stripped; path and content are passed
        through unchanged.

    Raises:
        SystemExit: with code 1, after printing a diagnostic to stderr, when
            stdin is empty or not valid JSON, ``files`` is missing or not a
            non-empty array, or an entry is not an object whose ``path``,
            ``content`` and ``language`` are all strings.
    """
    def bail(message: str) -> None:
        print(f"syntax-validator: {message}", file=sys.stderr)
        sys.exit(1)

    payload = sys.stdin.read().strip()
    if not payload:
        bail("empty stdin")
    try:
        request = json.loads(payload)
    except json.JSONDecodeError as err:
        bail(f"invalid JSON: {err}")
    if not (isinstance(request, dict) and "files" in request):
        bail("input must be {'files': [...]}")
    entries = request["files"]
    if not (isinstance(entries, list) and entries):
        bail("'files' must be a non-empty array")

    required = ("path", "content", "language")
    cleaned: list[dict[str, str]] = []
    for index, entry in enumerate(entries):
        if not isinstance(entry, dict):
            bail(f"files[{index}] must be an object")
        for field in required:
            if field not in entry:
                bail(f"files[{index}] missing '{field}'")
            value = entry[field]
            # Coercing with str() would turn None into "None" and hide the
            # caller's bug, so anything that is not a string is refused.
            if not isinstance(value, str):
                bail(f"files[{index}].{field} must be a string "
                     f"(got {type(value).__name__})")
        cleaned.append({
            "path": entry["path"],
            "content": entry["content"],
            "language": entry["language"].lower().strip(),
        })
    return cleaned
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _load_parsers() -> dict[str, Any]:
    """Build a tree-sitter Parser for every grammar that can be loaded.

    Returns:
        Dict of language name → Parser, holding only the languages whose
        grammar package imported and loaded successfully.

    Raises:
        SystemExit: with code 2 when tree-sitter itself is missing or when
            no grammar at all could be loaded.
    """
    try:
        from tree_sitter import Language, Parser
    except ImportError:
        print("syntax-validator: tree-sitter not installed — install via "
              "`pip install -r packages/dedup/requirements.txt`",
              file=sys.stderr)
        sys.exit(2)

    loaded: dict[str, Any] = {}

    def _try(name: str, loader) -> None:
        try:
            lang = Language(loader())
        except Exception as err:  # noqa: BLE001 — grammar import is fragile
            print(f"syntax-validator: failed to load grammar {name}: {err}",
                  file=sys.stderr)
            return
        loaded[name] = Parser(lang)

    # Each grammar package is OPTIONAL — a missing one only disables the
    # languages it provides. The absence is logged to stderr so it is visible
    # rather than a silent degradation.
    def _log_missing_grammar(name: str, err: ImportError) -> None:
        print(
            f"syntax-validator: grammar package not installed: {name} ({err})",
            file=sys.stderr,
        )

    # (module to import, ((language name, factory attribute on the module), ...))
    grammar_sources = (
        ("tree_sitter_python", (("python", "language"),)),
        ("tree_sitter_typescript", (("typescript", "language_typescript"),
                                    ("tsx", "language_tsx"))),
        # JS and JSX share a grammar in tree-sitter-javascript.
        ("tree_sitter_javascript", (("javascript", "language"),
                                    ("jsx", "language"))),
        ("tree_sitter_json", (("json", "language"),)),
    )
    for module_name, provided in grammar_sources:
        try:
            module = __import__(module_name)
            for language_name, factory in provided:
                # The attribute lookup happens inside the loader so a missing
                # factory is reported by _try instead of escaping.
                _try(language_name,
                     lambda m=module, f=factory: getattr(m, f)())
        except ImportError as err:
            _log_missing_grammar(module_name, err)

    if not loaded:
        print("syntax-validator: no grammar packages installed — install via "
              "`pip install -r packages/dedup/requirements.txt`",
              file=sys.stderr)
        sys.exit(2)

    return loaded
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _collect_errors(root, cap: int = 10) -> list[dict[str, Any]]:
    """Gather up to `cap` ERROR/MISSING nodes from the tree under `root`.

    The walk uses an explicit stack instead of recursion, so deeply nested
    input cannot trigger a RecursionError. Nodes are visited depth-first in
    source order, and the children of an error node are still visited.

    Returns:
        Dicts of ``{"row", "column", "message"}`` taken from each node's
        ``start_point``; the message is ``"MISSING <type>"`` for a missing
        node and ``"ERROR"`` otherwise.
    """
    found: list[dict[str, Any]] = []
    pending: list[Any] = [root]
    while pending and len(found) < cap:
        current = pending.pop()
        if current.is_error or current.is_missing:
            if current.is_missing:
                label = "MISSING " + current.type
            else:
                label = "ERROR"
            found.append({
                "row": current.start_point[0],
                "column": current.start_point[1],
                "message": label,
            })
            if len(found) >= cap:
                break
        # Push right-to-left so the leftmost child is popped first.
        pending.extend(reversed(current.children))
    return found
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _python_indentation_check(content: str) -> list[dict[str, Any]]:
|
|
219
|
+
"""Tree-sitter's Python grammar is forgiving on indentation — it will
|
|
220
|
+
accept malformed indentation that the CPython parser rejects (e.g.
|
|
221
|
+
`def f():\nreturn 1`). Supplement tree-sitter with `compile(...)` for
|
|
222
|
+
Python files so indentation errors are caught."""
|
|
223
|
+
try:
|
|
224
|
+
compile(content, "<input>", "exec")
|
|
225
|
+
return []
|
|
226
|
+
except SyntaxError as err:
|
|
227
|
+
return [{
|
|
228
|
+
"row": (err.lineno or 1) - 1,
|
|
229
|
+
"column": (err.offset or 1) - 1,
|
|
230
|
+
"message": f"PYTHON {err.msg or 'syntax error'}".strip(),
|
|
231
|
+
}]
|
|
232
|
+
except (ValueError, TypeError) as err:
|
|
233
|
+
# ValueError: null bytes in source; TypeError: non-str source.
|
|
234
|
+
return [{
|
|
235
|
+
"row": 0,
|
|
236
|
+
"column": 0,
|
|
237
|
+
"message": f"PYTHON {type(err).__name__}: {err}",
|
|
238
|
+
}]
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
# PyPI distribution that ships the grammar for each canonical language name.
# tsx is provided by tree-sitter-typescript and jsx by tree-sitter-javascript,
# so the language name alone is not always a valid install target.
_GRAMMAR_PACKAGES: dict[str, str] = {
    "typescript": "tree-sitter-typescript",
    "tsx": "tree-sitter-typescript",
    "javascript": "tree-sitter-javascript",
    "jsx": "tree-sitter-javascript",
    "python": "tree-sitter-python",
    "json": "tree-sitter-json",
}


def _validate(
    files: list[dict[str, str]],
    parsers: dict[str, Any],
) -> tuple[list[dict[str, Any]], bool]:
    """Parse every file and report its syntax errors.

    Args:
        files: ``{"path", "content", "language"}`` dicts; ``language`` may
            be an alias from LANGUAGE_ALIASES.
        parsers: language name → parser object exposing ``parse(bytes)``.

    Returns:
        ``(results, any_unsupported)``. Each result carries ``path``,
        ``valid``, ``language`` and ``errors``:

        * Files whose language we don't know at all get ``valid=True`` with
          ``language_unsupported: True`` so the caller can skip them, and
          ``any_unsupported`` becomes True.
        * Files whose language IS known but whose grammar is not in
          ``parsers`` get ``valid=False`` with ``grammar_unavailable: True``
          so the caller doesn't mistake a degraded sidecar for clean code.
        * Files whose parse raises get ``valid=False`` with a
          ``parser-threw`` error.
        * Everything else is valid exactly when no error was found.
    """
    results: list[dict[str, Any]] = []
    any_unsupported = False
    for f in files:
        lang_raw = f["language"]
        lang = LANGUAGE_ALIASES.get(lang_raw)
        if lang is None or lang not in SUPPORTED_LANGUAGES:
            any_unsupported = True
            results.append({
                "path": f["path"],
                "valid": True,  # cannot prove invalid without a parser
                "language": lang_raw,
                "errors": [],
                "language_unsupported": True,
            })
            continue
        parser = parsers.get(lang)
        if parser is None:
            # Known language, but the grammar package isn't installed in
            # this environment. Surface it as a per-file failure rather
            # than letting it pass silently.
            package = _GRAMMAR_PACKAGES.get(lang, f"tree-sitter-{lang}")
            results.append({
                "path": f["path"],
                "valid": False,
                "language": lang,
                "errors": [{
                    "row": 0,
                    "column": 0,
                    "message": f"grammar-unavailable: install {package}",
                }],
                "grammar_unavailable": True,
            })
            continue
        try:
            tree = parser.parse(bytes(f["content"], "utf-8"))
        except Exception as err:  # noqa: BLE001
            # Also covers content that cannot be encoded as UTF-8.
            results.append({
                "path": f["path"],
                "valid": False,
                "language": lang,
                "errors": [{
                    "row": 0,
                    "column": 0,
                    "message": f"parser-threw: {type(err).__name__}",
                }],
            })
            continue
        errors: list[dict[str, Any]] = []
        if tree.root_node.has_error:
            errors = _collect_errors(tree.root_node)
        # Python: tree-sitter accepts malformed indentation that the real
        # parser rejects. Run CPython's parser as a second pass, but only
        # when tree-sitter found nothing — the two error lists are never
        # combined.
        if lang == "python" and not errors:
            errors = _python_indentation_check(f["content"])
        results.append({
            "path": f["path"],
            "valid": not errors,
            "language": lang,
            "errors": errors,
        })
    return results, any_unsupported
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def main() -> None:
    """Validate the files read from stdin and print the report as one JSON line.

    Input is read before the grammars are loaded. Exits with code 3, after
    the report has been written, when at least one file's language is
    unsupported.
    """
    files = _read_input()
    parsers = _load_parsers()
    results, any_unsupported = _validate(files, parsers)
    report = {
        "results": results,
        "method": "tree-sitter",
        "supported_languages": sorted(parsers),
    }
    sys.stdout.write(json.dumps(report))
    sys.stdout.write("\n")
    if any_unsupported:
        sys.exit(3)


if __name__ == "__main__":
    main()
|