@heytherevibin/skillforge 0.8.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +22 -0
- package/CONTRIBUTING.md +30 -19
- package/README.md +243 -235
- package/RELEASING.md +19 -7
- package/SECURITY.md +61 -13
- package/STRATEGY.md +40 -14
- package/bin/cli.js +112 -5
- package/ci/bundle-gate.json +4 -0
- package/lib/host-setup.js +312 -0
- package/lib/templates/claude-code-skillforge-global.md +19 -0
- package/lib/templates/cursor-skillforge-global.md +16 -0
- package/package.json +3 -2
- package/python/app/eval_cli.py +133 -0
- package/python/app/feedback_meta.py +96 -0
- package/python/app/health_cli.py +160 -0
- package/python/app/main.py +261 -22
- package/python/app/materialize.py +72 -4
- package/python/app/mcp_contract.py +13 -1
- package/python/app/mcp_server.py +124 -27
- package/python/app/route_cli.py +32 -13
- package/python/app/route_eval_harness.py +98 -0
- package/python/app/route_policies.py +110 -0
- package/python/app/route_quality.py +99 -0
- package/python/app/routing_signals.py +60 -0
- package/python/app/weights_cli.py +152 -0
- package/python/fixtures/route_eval/smoke.json +18 -0
- package/python/tests/test_feedback_weights.py +77 -0
- package/python/tests/test_materialize.py +51 -0
- package/python/tests/test_mcp_contract.py +117 -0
- package/python/tests/test_route_eval_harness.py +45 -0
- package/python/tests/test_route_quality.py +120 -0
- package/python/tests/test_routing_overlay.py +55 -0
- package/python/tests/test_routing_signals.py +35 -0
|
@@ -7,6 +7,8 @@ from typing import Any, Protocol
|
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
|
|
10
|
+
from app.route_quality import coerce_route_float
|
|
11
|
+
|
|
10
12
|
_TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_\-./]{2,}", re.I)
|
|
11
13
|
|
|
12
14
|
|
|
@@ -93,3 +95,61 @@ def keyword_overlap_scores(route_query: str, skill_cards: list[str]) -> np.ndarr
|
|
|
93
95
|
ct = set(tokenize_skills_query(card))
|
|
94
96
|
out.append(float(len(qt & ct)))
|
|
95
97
|
return np.array(out, dtype=np.float64)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def host_pick_shortlist_lines(
    *,
    prompt: str,
    route_query: str,
    facet_rows: list[dict[str, Any]],
    max_candidates: int | None = None,
    line_chars: int | None = None,
) -> tuple[str, list[dict[str, Any]]]:
    """Tight numbered list + structured rows for MCP host-pick phase (no in-process LLM).

    Returns a tuple of (prompt_text, rows): markdown instructing the host model
    to reply with a JSON pick from the numbered shortlist, plus one structured
    dict per shortlisted candidate mirroring those lines.
    """
    # Candidate cap: explicit argument wins; otherwise env override with a floor of 3.
    mc = max_candidates
    if mc is None:
        mc = max(3, int(os.getenv("SKILLFORGE_HOST_PICK_MAX", "12")))
    # Character budget for each candidate's one-line "card" (env-overridable).
    lc = line_chars if line_chars is not None else int(os.getenv("SKILLFORGE_HOST_PICK_LINE_CHARS", "120"))
    # Collapse the task prompt to a single line, clamped to 160 chars.
    prompt_one = (prompt or "").strip().replace("\n", " ")
    if len(prompt_one) > 160:
        prompt_one = prompt_one[:157] + "…"
    rows_out: list[dict[str, Any]] = []
    lines: list[str] = [
        "# Host pick — choose skill names only from this list",
        "",
        f"Task: {prompt_one}",
        "",
        "Reply with JSON only:",
        '{"picked": ["exact-skill-id", ...], "reasoning": "one line"}',
        f"Use 0–{mc} names from the numbered lines only (empty picked is allowed). Copy names exactly.",
        "",
        "```",
    ]
    for i, f in enumerate(facet_rows[:mc], start=1):
        name = str(f.get("name") or "")
        # Defensive float coercion: facet values may be strings/None/NaN upstream.
        cos = coerce_route_float(f.get("cosine_similarity"))
        # One-line card "title: description", flattened and clamped to lc chars.
        card = f"{f.get('title') or name}: {(f.get('description_preview') or '')[:lc]}".replace("\n", " ").strip()
        if len(card) > lc:
            card = card[: lc - 1] + "…"
        line = f"{i:>2}. {name} | cos={cos:.3f} | {card}"
        lines.append(line)
        # Structured row mirrors the rendered line; score fields are passed
        # through from the facet as-is (may be None).
        rows_out.append({
            "id": name,
            "rank": i,
            "name": name,
            "cosine_similarity": round(cos, 6),
            "routing_score": f.get("routing_score"),
            "sparse_signal": f.get("sparse_signal"),
            "learned_weight": f.get("learned_weight"),
            "router_hybrid": f.get("router_hybrid"),
            "source": f.get("source"),
            "one_liner": card,
            "rationale_one_liner": card,
        })
    lines.append("```")
    # Append the retrieval query (clamped to 400 chars) when present.
    rq = (route_query or "").strip()
    if len(rq) > 400:
        rq = rq[:397] + "…"
    if rq:
        lines.extend(["", f"_Retrieval query:_ {rq}"])
    return "\n".join(lines), rows_out
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Export / import per-user skill_weights rows (JSON snapshot)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
import time
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from app.db_paths import resolve_orchestrator_db
|
|
11
|
+
from app.main import init_db
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _parse_args(argv: list[str] | None) -> argparse.Namespace:
|
|
15
|
+
p = argparse.ArgumentParser(description="Export or import learned skill_weights (uses, thumbs, routing bias).")
|
|
16
|
+
sub = p.add_subparsers(dest="cmd", required=True)
|
|
17
|
+
|
|
18
|
+
ex = sub.add_parser("export", help="Dump skill_weights rows to JSON (stdout unless -o).")
|
|
19
|
+
ex.add_argument("-o", "--output", type=Path, default=None, help="Output file (default: stdout).")
|
|
20
|
+
ex.add_argument("--user-id", default="", help="Logical user id (default '' = global row set).")
|
|
21
|
+
ex.add_argument(
|
|
22
|
+
"--project-root",
|
|
23
|
+
default="",
|
|
24
|
+
help="Resolve DB from <root>/.skillforge/orchestrator.db (else env / global).",
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
im = sub.add_parser("import", help="Load JSON snapshot into skill_weights.")
|
|
28
|
+
im.add_argument("file", type=Path, help="JSON file from skillforge weights export.")
|
|
29
|
+
im.add_argument(
|
|
30
|
+
"--user-id",
|
|
31
|
+
default=None,
|
|
32
|
+
help="Override user_id for all imported rows (default: use file's user_id).",
|
|
33
|
+
)
|
|
34
|
+
im.add_argument(
|
|
35
|
+
"--project-root",
|
|
36
|
+
default="",
|
|
37
|
+
help="Target DB path (same as export).",
|
|
38
|
+
)
|
|
39
|
+
im.add_argument(
|
|
40
|
+
"--replace-user",
|
|
41
|
+
action="store_true",
|
|
42
|
+
help="Delete existing rows for the target user_id before import.",
|
|
43
|
+
)
|
|
44
|
+
return p.parse_args(argv)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def export_weights(con, user_id: str) -> dict:
    """Snapshot every skill_weights row for *user_id* as a JSON-ready dict.

    Rows are ordered by skill_name; numeric columns are normalised to
    float/int, with updated_at kept as None when the DB stores NULL.
    """
    cur = con.execute(
        """
        SELECT skill_name, weight, uses, referenced, thumbs_up, thumbs_down, disabled, updated_at
        FROM skill_weights WHERE user_id = ? ORDER BY skill_name
        """,
        (user_id,),
    )
    rows = [
        {
            "skill_name": skill_name,
            "weight": float(weight),
            "uses": int(uses),
            "referenced": int(referenced),
            "thumbs_up": int(thumbs_up),
            "thumbs_down": int(thumbs_down),
            "disabled": int(disabled),
            "updated_at": float(updated_at) if updated_at is not None else None,
        }
        for skill_name, weight, uses, referenced, thumbs_up, thumbs_down, disabled, updated_at in cur.fetchall()
    ]
    return {"version": 1, "user_id": user_id, "exported_at": time.time(), "rows": rows}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def import_weights(con, data: dict, *, user_id_override: str | None, replace_user: bool) -> int:
|
|
71
|
+
if not isinstance(data, dict):
|
|
72
|
+
raise ValueError("root must be object")
|
|
73
|
+
rows = data.get("rows")
|
|
74
|
+
if not isinstance(rows, list):
|
|
75
|
+
raise ValueError("rows must be array")
|
|
76
|
+
uid = user_id_override if user_id_override is not None else str(data.get("user_id") or "")
|
|
77
|
+
if replace_user:
|
|
78
|
+
con.execute("DELETE FROM skill_weights WHERE user_id = ?", (uid,))
|
|
79
|
+
n = 0
|
|
80
|
+
now = time.time()
|
|
81
|
+
for raw in rows:
|
|
82
|
+
if not isinstance(raw, dict):
|
|
83
|
+
continue
|
|
84
|
+
name = raw.get("skill_name")
|
|
85
|
+
if not name or not isinstance(name, str):
|
|
86
|
+
continue
|
|
87
|
+
con.execute(
|
|
88
|
+
"""
|
|
89
|
+
INSERT INTO skill_weights
|
|
90
|
+
(user_id, skill_name, weight, uses, referenced, thumbs_up, thumbs_down, disabled, updated_at)
|
|
91
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
92
|
+
ON CONFLICT(user_id, skill_name) DO UPDATE SET
|
|
93
|
+
weight = excluded.weight,
|
|
94
|
+
uses = excluded.uses,
|
|
95
|
+
referenced = excluded.referenced,
|
|
96
|
+
thumbs_up = excluded.thumbs_up,
|
|
97
|
+
thumbs_down = excluded.thumbs_down,
|
|
98
|
+
disabled = excluded.disabled,
|
|
99
|
+
updated_at = excluded.updated_at
|
|
100
|
+
""",
|
|
101
|
+
(
|
|
102
|
+
uid,
|
|
103
|
+
name,
|
|
104
|
+
float(raw.get("weight", 0.0)),
|
|
105
|
+
int(raw.get("uses", 0)),
|
|
106
|
+
int(raw.get("referenced", 0)),
|
|
107
|
+
int(raw.get("thumbs_up", 0)),
|
|
108
|
+
int(raw.get("thumbs_down", 0)),
|
|
109
|
+
int(raw.get("disabled", 0)),
|
|
110
|
+
float(raw.get("updated_at") or now),
|
|
111
|
+
),
|
|
112
|
+
)
|
|
113
|
+
n += 1
|
|
114
|
+
con.commit()
|
|
115
|
+
return n
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: dispatch `export` / `import` against the resolved orchestrator DB.

    Exits via SystemExit(0) on success and SystemExit(2) when the import
    source file is missing. The DB connection is always closed (finally),
    including on the SystemExit paths.
    """
    args = _parse_args(argv)
    # Empty/whitespace project_root collapses to None so the resolver falls
    # back to env / global resolution.
    pr = (getattr(args, "project_root", "") or "").strip() or None
    db_path = resolve_orchestrator_db(pr)
    con = init_db(db_path)
    try:
        if args.cmd == "export":
            payload = export_weights(con, args.user_id)
            text = json.dumps(payload, indent=2)
            if args.output:
                args.output.write_text(text + "\n", encoding="utf-8")
                # Status note goes to stderr so stdout stays clean for piping.
                print(f"Wrote {len(payload['rows'])} rows → {args.output}", file=sys.stderr)
            else:
                print(text)
            raise SystemExit(0)
        if args.cmd == "import":
            path = args.file.expanduser().resolve()
            if not path.is_file():
                print(f"skillforge weights import: not found {path}", file=sys.stderr)
                raise SystemExit(2)
            data = json.loads(path.read_text(encoding="utf-8"))
            n = import_weights(
                con,
                data,
                user_id_override=args.user_id,
                replace_user=bool(args.replace_user),
            )
            print(f"Imported {n} row(s) into {db_path}", file=sys.stderr)
            raise SystemExit(0)
    finally:
        # SystemExit raised above still flows through here, closing the DB.
        con.close()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 1,
|
|
3
|
+
"defaults": {
|
|
4
|
+
"candidate_window": 22
|
|
5
|
+
},
|
|
6
|
+
"cases": [
|
|
7
|
+
{
|
|
8
|
+
"id": "python-testing",
|
|
9
|
+
"prompt": "pytest fixtures caplog and monkeypatch for an API integration test",
|
|
10
|
+
"expect_in_candidates": ["python-testing"]
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"id": "docker-patterns",
|
|
14
|
+
"prompt": "docker compose healthcheck restart policy and rollout",
|
|
15
|
+
"expect_in_candidates": ["docker-patterns"]
|
|
16
|
+
}
|
|
17
|
+
]
|
|
18
|
+
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Tests for feedback_effect snapshot and weights export/import."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from app.feedback_meta import build_feedback_effect, get_skill_weight_detail
|
|
5
|
+
from app.main import init_db, update_skill_stat
|
|
6
|
+
from app.weights_cli import export_weights, import_weights
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def test_get_skill_weight_detail_missing(tmp_path) -> None:
|
|
10
|
+
con = init_db(tmp_path / "a.db")
|
|
11
|
+
assert get_skill_weight_detail(con, "nope", "") is None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_build_feedback_effect_after_use(tmp_path) -> None:
|
|
15
|
+
con = init_db(tmp_path / "b.db")
|
|
16
|
+
update_skill_stat(con, "alpha", "uses", 1, user_id="")
|
|
17
|
+
update_skill_stat(con, "alpha", "thumbs_up", 1, user_id="")
|
|
18
|
+
fe = build_feedback_effect(con, ["alpha", "beta"], user_id="")
|
|
19
|
+
assert fe["schema"] == "feedback_effect/1"
|
|
20
|
+
assert len(fe["picked"]) == 2
|
|
21
|
+
alpha = next(p for p in fe["picked"] if p["skill"] == "alpha")
|
|
22
|
+
assert alpha["has_db_row"] is True
|
|
23
|
+
assert alpha["uses"] >= 1
|
|
24
|
+
beta = next(p for p in fe["picked"] if p["skill"] == "beta")
|
|
25
|
+
assert beta["has_db_row"] is False
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_weights_export_import_roundtrip(tmp_path) -> None:
|
|
29
|
+
db1 = tmp_path / "w1.db"
|
|
30
|
+
db2 = tmp_path / "w2.db"
|
|
31
|
+
con = init_db(db1)
|
|
32
|
+
update_skill_stat(con, "x-skill", "uses", 2, user_id="u1")
|
|
33
|
+
update_skill_stat(con, "x-skill", "referenced", 1, user_id="u1")
|
|
34
|
+
con.close()
|
|
35
|
+
|
|
36
|
+
con = init_db(db1)
|
|
37
|
+
blob = export_weights(con, "u1")
|
|
38
|
+
con.close()
|
|
39
|
+
assert any(r["skill_name"] == "x-skill" for r in blob["rows"])
|
|
40
|
+
|
|
41
|
+
con2 = init_db(db2)
|
|
42
|
+
n = import_weights(con2, blob, user_id_override=None, replace_user=False)
|
|
43
|
+
con2.close()
|
|
44
|
+
assert n >= 1
|
|
45
|
+
|
|
46
|
+
con2 = init_db(db2)
|
|
47
|
+
cur = con2.execute(
|
|
48
|
+
"SELECT uses, referenced FROM skill_weights WHERE user_id = ? AND skill_name = ?",
|
|
49
|
+
("u1", "x-skill"),
|
|
50
|
+
)
|
|
51
|
+
row = cur.fetchone()
|
|
52
|
+
con2.close()
|
|
53
|
+
assert row is not None
|
|
54
|
+
assert int(row[0]) == 2
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def test_weights_import_replace_user(tmp_path) -> None:
|
|
58
|
+
db = tmp_path / "w3.db"
|
|
59
|
+
con = init_db(db)
|
|
60
|
+
update_skill_stat(con, "a", "uses", 1, user_id="")
|
|
61
|
+
update_skill_stat(con, "b", "uses", 1, user_id="")
|
|
62
|
+
con.close()
|
|
63
|
+
|
|
64
|
+
payload = {
|
|
65
|
+
"version": 1,
|
|
66
|
+
"user_id": "",
|
|
67
|
+
"rows": [{"skill_name": "only", "weight": 0.0, "uses": 1, "referenced": 0, "thumbs_up": 0, "thumbs_down": 0, "disabled": 0}],
|
|
68
|
+
}
|
|
69
|
+
con = init_db(db)
|
|
70
|
+
import_weights(con, payload, user_id_override="", replace_user=True)
|
|
71
|
+
con.close()
|
|
72
|
+
|
|
73
|
+
con = init_db(db)
|
|
74
|
+
cur = con.execute("SELECT skill_name FROM skill_weights WHERE user_id = '' ORDER BY skill_name")
|
|
75
|
+
names = [r[0] for r in cur.fetchall()]
|
|
76
|
+
con.close()
|
|
77
|
+
assert names == ["only"]
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Tests for project bootstrap file writes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from app.materialize import materialize_project_files
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_materialize_writes_cursor_command(tmp_path: Path) -> None:
|
|
11
|
+
root = tmp_path / "proj"
|
|
12
|
+
root.mkdir()
|
|
13
|
+
out = materialize_project_files(
|
|
14
|
+
str(root),
|
|
15
|
+
["alpha"],
|
|
16
|
+
{"alpha": "desc"},
|
|
17
|
+
merge=True,
|
|
18
|
+
)
|
|
19
|
+
rel = {Path(p).as_posix() for p in out["written"]}
|
|
20
|
+
assert ".cursor/commands/skillforge.md" in rel
|
|
21
|
+
assert ".claude/commands/skillforge.md" in rel
|
|
22
|
+
cc = root / ".claude" / "commands" / "skillforge.md"
|
|
23
|
+
assert cc.is_file()
|
|
24
|
+
cct = cc.read_text(encoding="utf-8")
|
|
25
|
+
assert "route_skills" in cct
|
|
26
|
+
assert "alpha" in cct
|
|
27
|
+
cur = root / ".cursor" / "commands" / "skillforge.md"
|
|
28
|
+
assert "alpha" in cur.read_text(encoding="utf-8")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_materialize_merge_false_skips_existing_command(tmp_path: Path) -> None:
|
|
32
|
+
root = tmp_path / "proj"
|
|
33
|
+
root.mkdir()
|
|
34
|
+
cmd = root / ".cursor" / "commands" / "skillforge.md"
|
|
35
|
+
cmd.parent.mkdir(parents=True)
|
|
36
|
+
cmd.write_text("keep-me", encoding="utf-8")
|
|
37
|
+
rule = root / ".cursor" / "rules" / "skillforge.mdc"
|
|
38
|
+
rule.parent.mkdir(parents=True, exist_ok=True)
|
|
39
|
+
rule.write_text("keep-rule", encoding="utf-8")
|
|
40
|
+
ccmd = root / ".claude" / "commands" / "skillforge.md"
|
|
41
|
+
ccmd.parent.mkdir(parents=True, exist_ok=True)
|
|
42
|
+
ccmd.write_text("keep-cc", encoding="utf-8")
|
|
43
|
+
materialize_project_files(
|
|
44
|
+
str(root),
|
|
45
|
+
["b"],
|
|
46
|
+
{},
|
|
47
|
+
merge=False,
|
|
48
|
+
)
|
|
49
|
+
assert cmd.read_text(encoding="utf-8") == "keep-me"
|
|
50
|
+
assert rule.read_text(encoding="utf-8") == "keep-rule"
|
|
51
|
+
assert ccmd.read_text(encoding="utf-8") == "keep-cc"
|
|
@@ -135,3 +135,120 @@ def test_build_route_skills_meta_error_field() -> None:
|
|
|
135
135
|
)
|
|
136
136
|
assert meta["error"] == "empty_prompt"
|
|
137
137
|
assert meta["sources"] == []
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def test_build_route_skills_meta_includes_route_quality() -> None:
    # A dict route_quality payload is passed through into meta unchanged.
    rq = {"schema": "route_quality/1", "picked_count": 1}
    meta = build_route_skills_meta(
        result={
            "candidates": [],
            "session_id": "s",
            "rerouted": False,
            "change": 0.0,
            "route_ms": 1.0,
            "route_quality": rq,
        },
        picked_names=[],
        user_id="u",
        db_path="db.sqlite",
        skills_map={},
        response_text="x",
    )
    assert meta.get("route_quality") == rq


def test_build_route_skills_meta_ignores_non_dict_route_quality() -> None:
    # A non-dict route_quality value is dropped rather than propagated.
    meta = build_route_skills_meta(
        result={
            "candidates": [],
            "session_id": "s",
            "rerouted": False,
            "change": 0.0,
            "route_ms": 1.0,
            "route_quality": "not-a-dict",
        },
        picked_names=[],
        user_id="u",
        db_path="db.sqlite",
        skills_map={},
        response_text="x",
    )
    assert "route_quality" not in meta


def test_build_route_skills_meta_includes_feedback_effect() -> None:
    # A dict feedback_effect payload is passed through into meta unchanged.
    fb = {"schema": "feedback_effect/1", "picked": []}
    meta = build_route_skills_meta(
        result={
            "candidates": [],
            "session_id": "s",
            "rerouted": False,
            "change": 0.0,
            "route_ms": 1.0,
            "feedback_effect": fb,
        },
        picked_names=[],
        user_id="u",
        db_path="db.sqlite",
        skills_map={},
        response_text="x",
    )
    assert meta.get("feedback_effect") == fb


def test_build_route_skills_meta_ignores_non_dict_feedback_effect() -> None:
    # A non-dict feedback_effect (here, a list) is dropped.
    meta = build_route_skills_meta(
        result={
            "candidates": [],
            "session_id": "s",
            "rerouted": False,
            "change": 0.0,
            "route_ms": 1.0,
            "feedback_effect": [1, 2],
        },
        picked_names=[],
        user_id="u",
        db_path="db.sqlite",
        skills_map={},
        response_text="x",
    )
    assert "feedback_effect" not in meta


def test_build_route_skills_meta_includes_routing_overlay() -> None:
    # A dict routing_overlay payload is passed through into meta unchanged.
    ro = {"schema": "routing_overlay/1", "exclude_skills": ["x"]}
    meta = build_route_skills_meta(
        result={
            "candidates": [],
            "session_id": "s",
            "rerouted": False,
            "change": 0.0,
            "route_ms": 1.0,
            "routing_overlay": ro,
        },
        picked_names=[],
        user_id="u",
        db_path="db.sqlite",
        skills_map={},
        response_text="x",
    )
    assert meta.get("routing_overlay") == ro


def test_build_route_skills_meta_ignores_non_dict_routing_overlay() -> None:
    # A non-dict routing_overlay value is dropped rather than propagated.
    meta = build_route_skills_meta(
        result={
            "candidates": [],
            "session_id": "s",
            "rerouted": False,
            "change": 0.0,
            "route_ms": 1.0,
            "routing_overlay": "bad",
        },
        picked_names=[],
        user_id="u",
        db_path="db.sqlite",
        skills_map={},
        response_text="x",
    )
    assert "routing_overlay" not in meta
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Unit tests for route eval fixture matcher (no embedding load)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from types import SimpleNamespace
|
|
5
|
+
|
|
6
|
+
from app.route_eval_harness import evaluate_case_result, load_eval_fixture
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _cands(names: list[str]) -> list:
|
|
10
|
+
return [(SimpleNamespace(name=n), 0.9) for n in names]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_evaluate_case_expect_in_candidates() -> None:
|
|
14
|
+
r = {"candidates": _cands(["a", "b", "python-testing"]), "picked_names": ["a"]}
|
|
15
|
+
case = {"id": "t", "prompt": "x", "expect_in_candidates": ["python-testing"]}
|
|
16
|
+
assert evaluate_case_result(r, case, defaults={"candidate_window": 10}) == []
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_evaluate_case_missing_candidate() -> None:
|
|
20
|
+
r = {"candidates": _cands(["x", "y"]), "picked_names": ["x"]}
|
|
21
|
+
case = {"id": "t", "prompt": "x", "expect_in_candidates": ["python-testing"]}
|
|
22
|
+
err = evaluate_case_result(r, case, defaults={"candidate_window": 5})
|
|
23
|
+
assert err and "python-testing" in err[0]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_evaluate_picked_any() -> None:
|
|
27
|
+
r = {"candidates": _cands(["a", "b"]), "picked_names": ["b"]}
|
|
28
|
+
case = {"id": "t", "expect_picked_any": ["b"]}
|
|
29
|
+
assert evaluate_case_result(r, case) == []
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_host_shortlist_fails() -> None:
|
|
33
|
+
r = {"host_pick_shortlist": True, "candidates": [], "picked_names": []}
|
|
34
|
+
err = evaluate_case_result(r, {"id": "h"}, defaults={})
|
|
35
|
+
assert any("host shortlist" in e for e in err)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_load_fixture(tmp_path) -> None:
|
|
39
|
+
p = tmp_path / "f.json"
|
|
40
|
+
p.write_text(
|
|
41
|
+
'{"version":1,"cases":[{"prompt":"hi","expect_in_candidates":["z"]}]}',
|
|
42
|
+
encoding="utf-8",
|
|
43
|
+
)
|
|
44
|
+
data = load_eval_fixture(p)
|
|
45
|
+
assert len(data["cases"]) == 1
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""Tests for route_quality telemetry (robust coercion, hybrid diagnostics)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from app.route_quality import (
|
|
5
|
+
build_route_quality,
|
|
6
|
+
coerce_route_float,
|
|
7
|
+
policy_includes_added_count,
|
|
8
|
+
top1_cosine_vs_routing_agreement,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_coerce_route_float() -> None:
|
|
13
|
+
assert coerce_route_float("0.5") == 0.5
|
|
14
|
+
assert coerce_route_float(None) == 0.0
|
|
15
|
+
assert coerce_route_float("nope") == 0.0
|
|
16
|
+
assert coerce_route_float(float("nan")) == 0.0
|
|
17
|
+
assert coerce_route_float(float("inf")) == 0.0
|
|
18
|
+
assert coerce_route_float(1.0, default=-1.0) == 1.0
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_policy_includes_added_count() -> None:
|
|
22
|
+
assert policy_includes_added_count(None) == 0
|
|
23
|
+
assert policy_includes_added_count([{"effect": "added"}, {"effect": "skip"}, "bad", {}]) == 1
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_top1_agreement() -> None:
|
|
27
|
+
assert top1_cosine_vs_routing_agreement([]) is None
|
|
28
|
+
assert top1_cosine_vs_routing_agreement([{"name": "a"}]) is None
|
|
29
|
+
facets = [
|
|
30
|
+
{"name": "high_route", "cosine_similarity": 0.1, "routing_score": 1.0},
|
|
31
|
+
{"name": "high_cos", "cosine_similarity": 0.9, "routing_score": 0.2},
|
|
32
|
+
]
|
|
33
|
+
assert top1_cosine_vs_routing_agreement(facets) is False
|
|
34
|
+
facets2 = [
|
|
35
|
+
{"name": "winner", "cosine_similarity": 0.9, "routing_score": 1.0},
|
|
36
|
+
{"name": "lose", "cosine_similarity": 0.1, "routing_score": 0.2},
|
|
37
|
+
]
|
|
38
|
+
assert top1_cosine_vs_routing_agreement(facets2) is True
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_build_route_quality_empty_facets() -> None:
|
|
42
|
+
rq = build_route_quality(
|
|
43
|
+
facet_list=[],
|
|
44
|
+
router_mode="auto",
|
|
45
|
+
router_hybrid="off",
|
|
46
|
+
picked_names=[],
|
|
47
|
+
rerouted=False,
|
|
48
|
+
change=float("nan"),
|
|
49
|
+
policy_rules_loaded="bogus",
|
|
50
|
+
policy_audit=None,
|
|
51
|
+
host_picked=False,
|
|
52
|
+
pick_path="embedding_top",
|
|
53
|
+
)
|
|
54
|
+
assert rq["shortlist"]["size"] == 0
|
|
55
|
+
assert rq["shortlist"]["top_cosine_similarity"] is None
|
|
56
|
+
assert rq["session"]["change_jaccard"] == 0.0
|
|
57
|
+
assert rq["policy"]["rules_loaded"] == 0
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_build_route_quality_malformed_metrics() -> None:
|
|
61
|
+
facets = [
|
|
62
|
+
{"name": "a", "cosine_similarity": "bad", "routing_score": float("nan")},
|
|
63
|
+
{"name": "b", "cosine_similarity": 0.5, "routing_score": 0.2},
|
|
64
|
+
]
|
|
65
|
+
rq = build_route_quality(
|
|
66
|
+
facet_list=facets,
|
|
67
|
+
router_mode="host",
|
|
68
|
+
router_hybrid="weighted",
|
|
69
|
+
picked_names=["a"],
|
|
70
|
+
rerouted=True,
|
|
71
|
+
change=0.25,
|
|
72
|
+
policy_rules_loaded=-3,
|
|
73
|
+
policy_audit=[{"effect": "added"}, {"effect": "added"}],
|
|
74
|
+
host_picked=False,
|
|
75
|
+
pick_path="haiku_pick",
|
|
76
|
+
)
|
|
77
|
+
assert rq["shortlist"]["top_cosine_similarity"] == 0.0
|
|
78
|
+
assert rq["shortlist"]["top_routing_score"] == 0.0
|
|
79
|
+
assert rq["shortlist"]["second_cosine_similarity"] == 0.5
|
|
80
|
+
assert rq["shortlist"]["cosine_margin"] == round(-0.5, 6)
|
|
81
|
+
assert rq["shortlist"]["top1_dense_and_fused_agree"] is False
|
|
82
|
+
assert rq["policy"]["rules_loaded"] == 0
|
|
83
|
+
assert rq["policy"]["includes_added"] == 2
|
|
84
|
+
assert rq["picked_count"] == 1
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def test_build_route_quality_hybrid_off_skips_agree() -> None:
|
|
88
|
+
facets = [
|
|
89
|
+
{"name": "x", "cosine_similarity": 0.1},
|
|
90
|
+
{"name": "y", "cosine_similarity": 0.9},
|
|
91
|
+
]
|
|
92
|
+
rq = build_route_quality(
|
|
93
|
+
facet_list=facets,
|
|
94
|
+
router_mode="auto",
|
|
95
|
+
router_hybrid="off",
|
|
96
|
+
picked_names=[],
|
|
97
|
+
rerouted=False,
|
|
98
|
+
change=0.0,
|
|
99
|
+
policy_rules_loaded=0,
|
|
100
|
+
policy_audit=[],
|
|
101
|
+
host_picked=False,
|
|
102
|
+
pick_path="embedding_top",
|
|
103
|
+
)
|
|
104
|
+
assert rq["shortlist"]["top1_dense_and_fused_agree"] is None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def test_build_route_quality_rules_loaded_ok() -> None:
|
|
108
|
+
rq = build_route_quality(
|
|
109
|
+
facet_list=[],
|
|
110
|
+
router_mode="auto",
|
|
111
|
+
router_hybrid="off",
|
|
112
|
+
picked_names=[],
|
|
113
|
+
rerouted=False,
|
|
114
|
+
change=0.0,
|
|
115
|
+
policy_rules_loaded=12,
|
|
116
|
+
policy_audit=[],
|
|
117
|
+
host_picked=False,
|
|
118
|
+
pick_path="host_shortlist",
|
|
119
|
+
)
|
|
120
|
+
assert rq["policy"]["rules_loaded"] == 12
|