pi-research 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+ [
2
+ "authoritative_source_count",
3
+ "blocked_source_count",
4
+ "blog_count",
5
+ "candidate_conflict",
6
+ "file_count",
7
+ "forum_count",
8
+ "github_readme_count",
9
+ "github_repo_count",
10
+ "has_authority_resolution_path",
11
+ "negative_signal_sources",
12
+ "official_doc_count",
13
+ "other_count",
14
+ "paper_count",
15
+ "positive_signal_sources",
16
+ "query_academic",
17
+ "query_comparison",
18
+ "query_procedural",
19
+ "query_temporal",
20
+ "query_versioned",
21
+ "source_count"
22
+ ]
@@ -0,0 +1,5 @@
1
+ {
2
+ "task": "conflict",
3
+ "bestModel": "lr",
4
+ "rows": 80
5
+ }
@@ -0,0 +1,16 @@
1
+ {
2
+ "macro_f1": 0.5773809523809524,
3
+ "train_size": 122,
4
+ "val_size": 34,
5
+ "high_risk_downgrades": 1,
6
+ "classes": [
7
+ "changelog",
8
+ "github",
9
+ "package-registry",
10
+ "papers",
11
+ "security",
12
+ "specs",
13
+ "vendor-status",
14
+ "web"
15
+ ]
16
+ }
Binary file
@@ -0,0 +1,16 @@
1
+ {
2
+ "macro_f1": 0.41485507246376807,
3
+ "train_size": 122,
4
+ "val_size": 34,
5
+ "high_risk_downgrades": 1,
6
+ "classes": [
7
+ "changelog",
8
+ "github",
9
+ "package-registry",
10
+ "papers",
11
+ "security",
12
+ "specs",
13
+ "vendor-status",
14
+ "web"
15
+ ]
16
+ }
Binary file
@@ -0,0 +1,3 @@
1
+ {
2
+ "confidenceThreshold": 0.75
3
+ }
Binary file
@@ -0,0 +1,22 @@
1
+ [
2
+ "authoritative_source_count",
3
+ "blocked_source_count",
4
+ "blog_count",
5
+ "file_count",
6
+ "forum_count",
7
+ "github_readme_count",
8
+ "github_repo_count",
9
+ "has_authority",
10
+ "has_only_one_good_source",
11
+ "negative_signal_sources",
12
+ "official_doc_count",
13
+ "other_count",
14
+ "paper_count",
15
+ "positive_signal_sources",
16
+ "query_academic",
17
+ "query_comparison",
18
+ "query_procedural",
19
+ "query_temporal",
20
+ "query_versioned",
21
+ "source_count"
22
+ ]
@@ -0,0 +1,5 @@
1
+ {
2
+ "task": "sufficiency",
3
+ "bestModel": "lr",
4
+ "rows": 78
5
+ }
@@ -0,0 +1,106 @@
1
+ # Tiny Router Training Runbook
2
+
3
+ Target budget:
4
+
5
+ - GPU RAM: 2 GB
6
+ - CPU RAM: 20 GB
7
+ - Default path: CPU-first, frozen embeddings, small models
8
+
9
+ ## Environment
10
+
11
+ ```bash
12
+ python3 -m venv .venv-router
13
+ . .venv-router/bin/activate
14
+ pip install -r ml/router/requirements.txt
15
+ ```
16
+
17
+ ## Phase 1 — domain router
18
+
19
+ ```bash
20
+ node scripts/router/audit-cache.mjs
21
+ node scripts/router/export-examples.mjs
22
+ node scripts/router/split-examples.mjs
23
+
24
+ python ml/router/embed_model2vec.py \
25
+ --input data/router/examples.jsonl \
26
+ --gold data/router/gold-domain.jsonl \
27
+ --synthetic data/router/synthetic-train.jsonl
28
+
29
+ python ml/router/train_domain_classifier.py \
30
+ --embeddings data/router/domain-model2vec.npz data/router/synthetic-model2vec.npz \
31
+ --gold-embeddings data/router/gold-model2vec.npz \
32
+ --out .cache/models/pi-research-router/domain \
33
+ --model-type auto
34
+
35
+ python ml/router/evaluate_domain.py \
36
+ --model .cache/models/pi-research-router/domain/model.joblib \
37
+ --embeddings data/router/gold-model2vec.npz \
38
+ --out metrics/router/domain-model2vec-lr.json
39
+
40
+ python ml/router/benchmark_latency.py \
41
+ --model-dir .cache/models/pi-research-router/domain \
42
+ --examples data/router/gold-domain.jsonl \
43
+ --out metrics/router/latency.json
44
+
45
+ python scripts/router/eval_domain_unknown.py \
46
+ --model-dir .cache/models/pi-research-router/domain \
47
+ --input data/router/unknown-domain-smoke.jsonl
48
+ ```
49
+
50
+ ## Phase 2 — structured baselines
51
+
52
+ Build provisional structured rows:
53
+
54
+ ```bash
55
+ node scripts/router/export_structured_provisional.mjs
56
+ node scripts/router/eval_structured_baselines.mjs
57
+ ```
58
+
59
+ Train conservative structured classifiers:
60
+
61
+ ```bash
62
+ python ml/router/train_structured_baseline.py --task conflict
63
+ python ml/router/train_structured_baseline.py --task sufficiency
64
+ ```
65
+
66
+ Outputs:
67
+
68
+ - `.cache/models/pi-research-router/conflict-structured/`
69
+ - `.cache/models/pi-research-router/sufficiency-structured/`
70
+ - `metrics/router/conflict-structured-models.json`
71
+ - `metrics/router/sufficiency-structured-models.json`
72
+
73
+ ## Runtime flags
74
+
75
+ ```bash
76
+ PI_RESEARCH_TINY_ROUTER=1
77
+ PI_RESEARCH_TINY_ROUTER_MODEL=.cache/models/pi-research-router
78
+ PI_RESEARCH_TINY_ROUTER_TIMEOUT_MS=50
79
+ PI_RESEARCH_TINY_ROUTER_DOMAIN=1
80
+ PI_RESEARCH_TINY_ROUTER_FOLLOWUP=1
81
+ PI_RESEARCH_TINY_ROUTER_CONFLICT=0
82
+ PI_RESEARCH_TINY_ROUTER_SUFFICIENCY=0
83
+ ```
84
+
85
+ Keep the conflict and sufficiency routers disabled (`=0`, as shown above) until their metrics have been reviewed.
86
+
87
+ ## Server deploy
88
+
89
+ Safe MCP runtime deploy:
90
+
91
+ ```bash
92
+ scripts/router/deploy-server-runtime.sh \
93
+ blackknight@100.98.190.19 \
94
+ ~/work/pi-research-runtime
95
+ ```
96
+
97
+ This syncs the repo, installs user-local Node if needed, copies trained router models, runs `npm install`, and writes:
98
+
99
+ - `start-mcp-tiny-router-safe.sh`
100
+ - `start-mcp-tiny-router-experimental.sh`
101
+
102
+ Recommended start command:
103
+
104
+ ```bash
105
+ ssh blackknight@100.98.190.19 'cd ~/work/pi-research-runtime && ./start-mcp-tiny-router-safe.sh'
106
+ ```
@@ -0,0 +1,81 @@
1
+ import json
2
+ import argparse
3
+ import time
4
+ import numpy as np
5
+ import joblib
6
+ import os
7
+
8
# Make the sibling ``features`` module importable when this script is run
# directly. The previous ``if not sys_path_added`` guard was always true
# (the flag was set to False on the line just above it), so it is dropped.
import sys

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys_path_added = True  # retained so the module-level name still exists
13
+
14
+ from features import load_embedding_model, extract_domain_features
15
+
16
def main():
    """Benchmark sequential single-query latency of the domain router.

    Loads the embedding model and ``model.joblib`` from ``--model-dir``,
    reads one JSON object per non-blank line of ``--examples`` (each must
    contain a ``"query"`` key), times feature extraction plus ``predict``
    for every query, prints p50/p95/mean in milliseconds and writes the same
    numbers as JSON to ``--out``.

    Raises:
        SystemExit: if ``--examples`` yields no queries, because percentiles
            are undefined for an empty sample.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-dir", required=True)
    parser.add_argument("--examples", required=True)
    parser.add_argument("--out", required=True)
    args = parser.parse_args()

    print("Loading Model2Vec...")
    emb_model = load_embedding_model()

    print("Loading Classifier...")
    clf = joblib.load(os.path.join(args.model_dir, "model.joblib"))

    # Load the queries to benchmark.
    queries = []
    with open(args.examples, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            queries.append(json.loads(line)["query"])

    if not queries:
        raise SystemExit(f"No queries found in {args.examples}")

    # Warmup: run a few predictions untimed so one-off initialisation cost
    # does not end up in the measurements.
    print("Warming up...")
    for q in queries[:10]:
        feats = extract_domain_features([q], ["fast"], emb_model=emb_model, show_progress_bar=False)
        clf.predict(feats)

    # Benchmark: one query at a time, timing features + prediction together.
    print(f"Benchmarking {len(queries)} queries sequentially...")
    latencies = []
    for q in queries:
        t0 = time.perf_counter()
        feats = extract_domain_features([q], ["fast"], emb_model=emb_model, show_progress_bar=False)
        clf.predict(feats)
        t1 = time.perf_counter()
        latencies.append((t1 - t0) * 1000)  # ms

    latencies = np.array(latencies)
    # Cast to built-in floats so the JSON report holds plain numbers.
    p50 = float(np.percentile(latencies, 50))
    p95 = float(np.percentile(latencies, 95))
    mean = float(np.mean(latencies))

    print(f"p50: {p50:.2f} ms")
    print(f"p95: {p95:.2f} ms")
    print(f"Mean: {mean:.2f} ms")

    # ``--out latency.json`` has an empty dirname; os.makedirs("") would raise.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    metrics = {
        "task": "domain",
        "latency_ms": {
            "p50": p50,
            "p95": p95,
            "mean": mean,
            "samples": len(latencies),
        },
    }

    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
79
+
80
+ if __name__ == "__main__":
81
+ main()
@@ -0,0 +1,140 @@
1
+ import sys
2
+ import json
3
+ import logging
4
+ import joblib
5
+ import numpy as np
6
+ import traceback
7
+ import os
8
+
9
+ # Add the directory containing features.py to sys.path
10
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
11
+ from features import load_embedding_model, extract_domain_features, extract_followup_features
12
+
13
+ logging.basicConfig(level=logging.ERROR)
14
+
15
+
16
def load_model(path):
    """Return the joblib-serialised model at ``path``, or None if it is absent.

    NOTE(review): joblib files are pickle-based, so loading one executes
    code embedded in the file. Only point this at trusted model directories.
    """
    if not os.path.exists(path):
        return None
    return joblib.load(path)
18
+
19
+
20
def load_feature_names(path):
    """Load the JSON document stored at ``path``.

    Returns the parsed JSON value, or None when the file does not exist.
    The file is opened directly (instead of checking for existence first)
    so a file that vanishes between the check and the open cannot raise,
    and it is read as UTF-8 regardless of the platform default encoding.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
25
+
26
+
27
def predict_proba_like(clf, features):
    """Return ``(label, confidence)`` for the first row of ``features``.

    When the classifier exposes ``predict_proba`` the label is the class
    with the highest probability and the confidence is that probability.
    Otherwise the plain ``predict`` output is returned with confidence 1.0.
    """
    if not hasattr(clf, "predict_proba"):
        return clf.predict(features)[0], 1.0

    scores = clf.predict_proba(features)[0]
    top = int(np.argmax(scores))
    return clf.classes_[top], float(scores[top])
35
+
36
+
37
def vectorize_structured_features(feature_names, features):
    """Build a ``(1, len(feature_names))`` float32 row from a feature mapping.

    Values are taken in ``feature_names`` order. A feature that is missing
    or explicitly ``None`` (e.g. JSON ``null``) becomes 0.0, and a ``None``
    mapping is treated as empty, so a request with ``"features": null``
    no longer raises.
    """
    features = features or {}
    row = []
    for name in feature_names:
        value = features.get(name)
        row.append(0.0 if value is None else float(value))
    return np.array([row], dtype=np.float32)
40
+
41
+
42
def main():
    """Serve router predictions over a line-delimited JSON stdin/stdout protocol.

    ``sys.argv[1]`` is the model directory. The embedding model and the
    optional ``domain``, ``followup``, ``conflict-structured`` and
    ``sufficiency-structured`` models are loaded from it; a load failure
    prints a JSON error and exits with status 1. After printing ``READY``,
    each non-blank stdin line is parsed as a JSON request with ``id``,
    ``task`` (default ``"domain"``), ``query`` and ``mode`` fields, and one
    JSON response line is written per request.

    Every response, including the one emitted when handling a request
    raises, carries the request ``id`` when it could be read, so the caller
    can correlate failures with requests.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "Missing model path"}))
        sys.exit(1)

    model_dir = sys.argv[1]

    try:
        emb_model = load_embedding_model()
        domain_clf = load_model(os.path.join(model_dir, "domain", "model.joblib"))
        followup_clf = load_model(os.path.join(model_dir, "followup", "model.joblib"))
        conflict_clf = load_model(os.path.join(model_dir, "conflict-structured", "model.joblib"))
        sufficiency_clf = load_model(os.path.join(model_dir, "sufficiency-structured", "model.joblib"))
        conflict_feature_names = load_feature_names(os.path.join(model_dir, "conflict-structured", "feature-names.json"))
        sufficiency_feature_names = load_feature_names(os.path.join(model_dir, "sufficiency-structured", "feature-names.json"))
    except Exception as e:
        print(json.dumps({"error": f"Failed to load models: {str(e)}"}))
        sys.exit(1)

    # The two structured tasks share one code path: (model, feature names,
    # label used in the "not loaded" error message).
    structured_models = {
        "conflict": (conflict_clf, conflict_feature_names, "Conflict"),
        "sufficiency": (sufficiency_clf, sufficiency_feature_names, "Sufficiency"),
    }

    print("READY", flush=True)

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        # Reset per request so an error reply never reuses a previous id.
        req_id = None
        try:
            req = json.loads(line)
            req_id = req.get("id")
            task = req.get("task", "domain")
            query = req.get("query", "")
            mode = req.get("mode", "fast")

            if task == "domain":
                # Compare against None explicitly: truthiness of an estimator
                # object may consult ``__len__`` rather than mere presence.
                if domain_clf is None:
                    print(json.dumps({"id": req_id, "error": "Domain model not loaded"}), flush=True)
                    continue

                feats = extract_domain_features([query], [mode], emb_model=emb_model, show_progress_bar=False)
                pred, confidence = predict_proba_like(domain_clf, feats)
                print(json.dumps({
                    "id": req_id,
                    "domain": str(pred),
                    "confidence": confidence,
                }), flush=True)

            elif task == "followup":
                if followup_clf is None:
                    print(json.dumps({"id": req_id, "error": "Followup model not loaded"}), flush=True)
                    continue

                conflict = req.get("conflict", "none")
                sources = req.get("sources", {})

                feats = extract_followup_features([query], [mode], [conflict], [sources], emb_model=emb_model, show_progress_bar=False)
                pred, confidence = predict_proba_like(followup_clf, feats)
                print(json.dumps({
                    "id": req_id,
                    "action": str(pred),
                    "confidence": confidence,
                }), flush=True)

            elif task in structured_models:
                clf, feature_names, label = structured_models[task]
                if clf is None or not feature_names:
                    print(json.dumps({"id": req_id, "error": f"{label} model not loaded"}), flush=True)
                    continue

                feats = vectorize_structured_features(feature_names, req.get("features", {}))
                pred, confidence = predict_proba_like(clf, feats)
                print(json.dumps({
                    "id": req_id,
                    "decision": str(pred),
                    "confidence": confidence,
                }), flush=True)

            else:
                print(json.dumps({"id": req_id, "error": f"Unknown task: {task}"}), flush=True)

        except Exception as e:
            # Top-level boundary of the request loop: report and keep serving.
            print(json.dumps({"id": req_id, "error": str(e), "trace": traceback.format_exc()}), flush=True)
137
+
138
+
139
+ if __name__ == "__main__":
140
+ main()
@@ -0,0 +1,48 @@
1
+ import json
2
+ import argparse
3
+ import numpy as np
4
+ from features import load_embedding_model, extract_domain_features, extract_followup_features
5
+
6
def main():
    """Embed router examples and save features, ids and labels with ``np.savez``.

    Reads one JSON object per non-blank line of ``--input`` and keeps rows
    that either have no ``"task"`` key or whose task equals ``--task``.
    Features are built with ``extract_domain_features`` (task ``domain``)
    or ``extract_followup_features`` (task ``followup``); any other task
    raises ``ValueError``. Each kept row must provide ``"query"`` and
    ``"label"``.

    ``--model`` is honoured: previously the flag was parsed and printed but
    the default embedding model was always loaded. The default value still
    goes through ``load_embedding_model()``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    parser.add_argument("--out", required=True)
    parser.add_argument("--model", default="minishlab/potion-base-8M")
    parser.add_argument("--task", default="domain")
    args = parser.parse_args()

    examples = []
    with open(args.input, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            ex = json.loads(line)
            if "task" not in ex or ex["task"] == args.task:
                examples.append(ex)

    print(f"Loaded {len(examples)} examples for task '{args.task}'")

    print(f"Loading StaticModel: {args.model}")
    if args.model == parser.get_default("model"):
        model = load_embedding_model()
    else:
        # Local import: only needed when a non-default model is requested.
        from model2vec import StaticModel
        model = StaticModel.from_pretrained(args.model)

    queries = [ex["query"] for ex in examples]
    # ``mode`` may live at the top level or under ``meta``; default to "fast".
    modes = [ex.get("mode", ex.get("meta", {}).get("mode", "fast")) for ex in examples]

    if args.task == "domain":
        print(f"Encoding {len(queries)} queries for domain routing...")
        features = extract_domain_features(queries, modes, emb_model=model, show_progress_bar=True)
    elif args.task == "followup":
        print(f"Encoding {len(queries)} queries for followup action...")
        conflicts = [ex.get("conflict", "none") for ex in examples]
        sources_list = [ex.get("sources", {}) for ex in examples]
        features = extract_followup_features(queries, modes, conflicts, sources_list, emb_model=model, show_progress_bar=True)
    else:
        raise ValueError(f"Unknown task: {args.task}")

    # Rows without an explicit id fall back to their position in the kept list.
    ids = np.array([ex.get("id", str(i)) for i, ex in enumerate(examples)])
    labels = np.array([ex["label"] for ex in examples])

    print(f"Saving features shape {features.shape} to {args.out}")
    np.savez(args.out, features=features, ids=ids, labels=labels)
46
+
47
+ if __name__ == "__main__":
48
+ main()
@@ -0,0 +1,67 @@
1
+ import json
2
+ import argparse
3
+ import numpy as np
4
+ import joblib
5
+ import os
6
+ from sklearn.metrics import classification_report, f1_score, confusion_matrix, accuracy_score
7
+
8
def main():
    """Evaluate a saved domain classifier on an embeddings file.

    Loads the joblib model at ``--model`` and the ``features``/``labels``
    arrays from ``--embeddings``, prints a classification report and the
    macro-F1, counts how many "security", "papers" or "specs" rows were
    predicted as "web", and writes a JSON summary to ``--out``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--embeddings", required=True)
    parser.add_argument("--out", required=True)
    args = parser.parse_args()

    # Load model
    print(f"Loading model from {args.model}")
    clf = joblib.load(args.model)

    # Load data; the context manager closes the archive's file handle.
    with np.load(args.embeddings) as data:
        X = data["features"]
        y_true = data["labels"]

    # Evaluate
    y_pred = clf.predict(X)

    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)

    print("\nGold Validation Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

    print(f"\nMacro-F1: {macro_f1:.4f}")

    # Extract high-risk misclassifications: rows of the confusion matrix for
    # the high-risk classes, read at the column of the "web" class.
    classes = clf.classes_
    high_risk_classes = ["security", "papers", "specs"]
    cm = confusion_matrix(y_true, y_pred, labels=classes)

    high_risk_errors = 0
    web_idx = np.where(classes == "web")[0]
    if len(web_idx) > 0:
        web_idx = web_idx[0]
        for hr_class in high_risk_classes:
            hr_idx = np.where(classes == hr_class)[0]
            if len(hr_idx) > 0:
                errors = cm[hr_idx[0], web_idx]
                if errors > 0:
                    print(f"HIGH RISK WARNING: {errors} '{hr_class}' queries routed to 'web'")
                    high_risk_errors += errors

    # Save artifacts. A bare filename has an empty dirname, and
    # os.makedirs("") would raise, so only create a directory when there is one.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    metrics = {
        "task": "domain",
        "eval_set_size": len(X),
        "macro_f1": float(macro_f1),
        "accuracy": float(accuracy),
        "high_risk_downgrades": int(high_risk_errors),
        "classes": classes.tolist(),
    }
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
65
+
66
+ if __name__ == "__main__":
67
+ main()
@@ -0,0 +1,60 @@
1
+ import numpy as np
2
+ from model2vec import StaticModel
3
+
4
+ EMBEDDING_MODEL_NAME = "minishlab/potion-base-8M"
5
+
6
def load_embedding_model(model_name: "str | None" = None) -> StaticModel:
    """Load the static embedding model used for feature extraction.

    Args:
        model_name: Identifier passed to ``StaticModel.from_pretrained``.
            When omitted (the previous, parameterless call style) the
            module default ``EMBEDDING_MODEL_NAME`` is used.
    """
    return StaticModel.from_pretrained(model_name or EMBEDDING_MODEL_NAME)
9
+
10
def encode_modes(modes: list) -> np.ndarray:
    """One-hot encode mode strings into a ``(len(modes), 4)`` float32 array.

    Columns are, in order: "fast", "deep", "academic", "code". A mode that
    matches none of them yields an all-zero row. An empty input returns
    shape ``(0, 4)`` (not ``(0,)``) so the result can always be
    horizontally stacked with an embedding matrix.
    """
    known_modes = ("fast", "deep", "academic", "code")
    encoded = [
        [1.0 if mode == name else 0.0 for name in known_modes]
        for mode in modes
    ]
    return np.array(encoded, dtype=np.float32).reshape(len(encoded), len(known_modes))
21
+
22
def extract_domain_features(queries: list, modes: list, emb_model: StaticModel = None, show_progress_bar: bool = False) -> np.ndarray:
    """Build domain-routing features: text embeddings followed by one-hot modes.

    When ``emb_model`` is None the default embedding model is loaded first.
    """
    encoder = load_embedding_model() if emb_model is None else emb_model
    text_block = encoder.encode(queries, show_progress_bar=show_progress_bar)
    mode_block = encode_modes(modes)
    return np.hstack([text_block, mode_block])
30
+
31
def encode_followup_meta(conflicts: list, sources_list: list) -> np.ndarray:
    """Encode conflict level and source metadata as an ``(n, 8)`` float32 array.

    Columns per row:
      0-2  one-hot conflict: "severe", "minor", "none"
      3-6  flags from the sources mapping: ``has_authority``, ``has_forum``,
           ``has_news``, ``has_recent``
      7    ``source_count`` (default 3) divided by 10 and capped at 1.0

    Rows are produced for paired ``(conflict, sources)`` items. A ``None``
    sources entry is treated as an empty mapping, and an empty input returns
    shape ``(0, 8)`` so it can be stacked with other feature blocks.
    """
    width = 8
    encoded = []
    for conflict, sources in zip(conflicts, sources_list):
        sources = sources or {}
        encoded.append([
            1.0 if conflict == "severe" else 0.0,
            1.0 if conflict == "minor" else 0.0,
            1.0 if conflict == "none" else 0.0,

            1.0 if sources.get("has_authority", False) else 0.0,
            1.0 if sources.get("has_forum", False) else 0.0,
            1.0 if sources.get("has_news", False) else 0.0,
            1.0 if sources.get("has_recent", False) else 0.0,

            # Normalize source count (cap at 10)
            min(float(sources.get("source_count", 3)) / 10.0, 1.0),
        ])
    return np.array(encoded, dtype=np.float32).reshape(len(encoded), width)
50
+
51
def extract_followup_features(queries: list, modes: list, conflicts: list, sources_list: list, emb_model: StaticModel = None, show_progress_bar: bool = False) -> np.ndarray:
    """Build followup-action features: embeddings, one-hot modes, then metadata.

    When ``emb_model`` is None the default embedding model is loaded first.
    """
    encoder = load_embedding_model() if emb_model is None else emb_model
    text_block = encoder.encode(queries, show_progress_bar=show_progress_bar)
    mode_block = encode_modes(modes)
    meta_block = encode_followup_meta(conflicts, sources_list)
    return np.hstack([text_block, mode_block, meta_block])
@@ -0,0 +1,5 @@
1
+ model2vec>=0.2.0
2
+ scikit-learn
3
+ numpy
4
+ joblib
5
+ imbalanced-learn
@@ -0,0 +1,57 @@
1
+ import json
2
+ import argparse
3
+ import numpy as np
4
+ import joblib
5
+ from sklearn.svm import LinearSVC
6
+ from sklearn.linear_model import LogisticRegression
7
+ from sklearn.calibration import CalibratedClassifierCV
8
+ from sklearn.metrics import classification_report, f1_score, confusion_matrix
9
+ import os
10
+
11
def main():
    """Train a domain classifier from one or more embeddings files.

    Concatenates the ``features``/``labels`` arrays of every ``--embeddings``
    file, oversamples with ``RandomOverSampler``, fits either a calibrated
    ``LinearSVC`` (``--model-type svc``, the default) or a
    ``LogisticRegression`` (``lr``), saves it as ``model.joblib`` under
    ``--out`` and prints a classification report on the resampled training
    data.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--embeddings", required=True, nargs="+")
    parser.add_argument("--out", required=True)
    parser.add_argument("--model-type", choices=["svc", "lr"], default="svc")
    args = parser.parse_args()

    # Load all data files and combine. Each archive is opened in a context
    # manager so its file handle is closed once the arrays have been read.
    X_list, y_list = [], []
    for emb_file in args.embeddings:
        with np.load(emb_file) as data:
            X_list.append(data["features"])
            y_list.append(data["labels"])

    X_train = np.vstack(X_list)
    y_train = np.hstack(y_list)

    print(f"Combined Train size: {len(X_train)}")

    from imblearn.over_sampling import RandomOverSampler
    ros = RandomOverSampler(random_state=42)
    X_res, y_res = ros.fit_resample(X_train, y_train)
    print(f"Size after OverSampling: {len(X_res)}")

    # Train model
    print(f"Training {args.model_type} with class_weight='balanced'...")
    if args.model_type == "svc":
        base_clf = LinearSVC(class_weight="balanced", dual=False, max_iter=5000, C=0.5)
        clf = CalibratedClassifierCV(base_clf, method="sigmoid", cv=5)
    else:
        clf = LogisticRegression(class_weight="balanced", max_iter=5000)

    clf.fit(X_res, y_res)

    # Save artifacts
    os.makedirs(args.out, exist_ok=True)
    model_path = os.path.join(args.out, "model.joblib")
    joblib.dump(clf, model_path)
    print(f"Model saved to {model_path}")

    # Save a small report to standard out (computed on the training data).
    preds = clf.predict(X_res)
    print("Train Report:")
    print(classification_report(y_res, preds))
55
+
56
+ if __name__ == "__main__":
57
+ main()
@@ -0,0 +1,209 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ from collections import Counter
5
+
6
+ import joblib
7
+ import numpy as np
8
+ from imblearn.over_sampling import RandomOverSampler
9
+ from sklearn.calibration import CalibratedClassifierCV
10
+ from sklearn.linear_model import LogisticRegression
11
+ from sklearn.metrics import accuracy_score, f1_score
12
+ from sklearn.svm import LinearSVC
13
+
14
+ HIGH_RISK_CLASSES = {"security", "papers", "specs"}
15
+ PRECISION_TARGETS = (0.95, 0.90, 0.85)
16
+ MIN_DEFAULT_THRESHOLD = 0.35
17
+ MIN_HIGH_RISK_THRESHOLD = 0.55
18
+
19
+
20
def load_embeddings(paths):
    """Load and concatenate ``features``/``labels`` from several archives.

    Args:
        paths: Iterable of files readable by ``np.load`` that each contain
            ``features`` and ``labels`` arrays.

    Returns:
        ``(features, labels)`` where features are stacked vertically and
        labels horizontally, in the order of ``paths``.

    Each archive is opened in a context manager so its file handle is
    released as soon as the two arrays have been read.
    """
    features, labels = [], []
    for path in paths:
        with np.load(path) as data:
            features.append(data["features"])
            labels.append(data["labels"])
    return np.vstack(features), np.hstack(labels)
27
+
28
+
29
def build_classifier(model_type):
    """Create an unfitted classifier for ``model_type``.

    ``"svc"`` gives a sigmoid-calibrated ``LinearSVC`` (5-fold calibration),
    ``"lr"`` gives a ``LogisticRegression``; both use balanced class weights.

    Raises:
        ValueError: for any other ``model_type``.
    """
    if model_type == "lr":
        return LogisticRegression(class_weight="balanced", max_iter=5000)
    if model_type != "svc":
        raise ValueError(f"Unsupported model_type: {model_type}")

    svc = LinearSVC(class_weight="balanced", dual=False, max_iter=5000, C=0.5)
    return CalibratedClassifierCV(svc, method="sigmoid", cv=5)
36
+
37
+
38
def train_classifier(model_type, X_train, y_train):
    """Oversample the training data and fit a fresh classifier on it.

    Returns:
        ``(fitted_classifier, number_of_rows_after_oversampling)``.
    """
    sampler = RandomOverSampler(random_state=42)
    X_balanced, y_balanced = sampler.fit_resample(X_train, y_train)

    model = build_classifier(model_type)
    model.fit(X_balanced, y_balanced)
    return model, len(X_balanced)
44
+
45
+
46
def evaluate_classifier(clf, X_eval, y_eval):
    """Score ``clf`` on an evaluation set.

    Predictions are the arg-max of ``predict_proba``; the confidence of each
    prediction is that maximum probability.

    Returns:
        Dict with ``accuracy``, ``macro_f1``, ``high_risk_downgrades`` (rows
        whose gold label is in ``HIGH_RISK_CLASSES`` but were predicted as
        "web"), plus the per-row ``preds`` and ``confs`` arrays.
    """
    probabilities = clf.predict_proba(X_eval)
    preds = clf.classes_[np.argmax(probabilities, axis=1)]
    confs = np.max(probabilities, axis=1)

    downgrades = sum(
        1
        for gold, pred in zip(y_eval, preds)
        if gold in HIGH_RISK_CLASSES and pred == "web"
    )

    return {
        "accuracy": float(accuracy_score(y_eval, preds)),
        "macro_f1": float(f1_score(y_eval, preds, average="macro", zero_division=0)),
        "high_risk_downgrades": int(downgrades),
        "preds": preds,
        "confs": confs,
    }
66
+
67
+
68
def derive_threshold_for_label(label, y_true, preds, confs):
    """Pick a confidence threshold for accepting predictions of ``label``.

    Candidate thresholds are 0.0 plus every confidence with which ``label``
    was predicted. For each candidate, precision and recall are computed
    over the predictions of ``label`` whose confidence is at least the
    candidate. The precision targets in ``PRECISION_TARGETS`` are tried in
    order; for the first target that some candidate meets, the meeting
    candidate with the best ``(recall, precision, lowest threshold)`` wins.
    If no candidate meets any target, the best candidate overall is returned
    and ``target_precision`` reports the first target. If ``label`` was never
    predicted, a fixed fallback is returned.

    The returned ``threshold`` is raised to the per-label floor
    (``MIN_HIGH_RISK_THRESHOLD`` for high-risk labels, otherwise
    ``MIN_DEFAULT_THRESHOLD``).
    NOTE(review): the precision/recall/accepted diagnostics describe the
    candidate threshold *before* the floor is applied — confirm that is the
    intended reading for consumers of the diagnostics.

    Per-candidate statistics do not depend on the precision target, so they
    are computed once instead of once per target.

    Returns:
        Dict with ``threshold``, ``precision``, ``recall``, ``accepted``,
        ``tp``, ``fp`` and ``target_precision``.
    """
    floor = MIN_HIGH_RISK_THRESHOLD if label in HIGH_RISK_CLASSES else MIN_DEFAULT_THRESHOLD
    gold_support = sum(1 for gold in y_true if gold == label)

    # (row index, confidence) for every row predicted as ``label``.
    label_hits = [
        (i, float(conf))
        for i, (pred, conf) in enumerate(zip(preds, confs))
        if pred == label
    ]

    candidates = []
    for threshold in sorted({0.0, *(conf for _, conf in label_hits)}):
        accepted = [i for i, conf in label_hits if conf >= threshold]
        if not accepted:
            continue
        tp = sum(1 for i in accepted if y_true[i] == label)
        candidates.append({
            "threshold": float(threshold),
            "precision": float(tp / len(accepted)),
            "recall": float(tp / gold_support if gold_support else 0.0),
            "accepted": len(accepted),
            "tp": int(tp),
            "fp": int(len(accepted) - tp),
        })

    if not candidates:
        return {
            "threshold": 0.75 if label in HIGH_RISK_CLASSES else 0.80,
            "precision": 0.0,
            "recall": 0.0,
            "accepted": 0,
            "tp": 0,
            "fp": 0,
            "target_precision": PRECISION_TARGETS[-1],
        }

    def rank(candidate):
        # Higher recall first, then higher precision, then lower threshold.
        return (candidate["recall"], candidate["precision"], -candidate["threshold"])

    chosen = None
    chosen_target = PRECISION_TARGETS[0]
    for target_precision in PRECISION_TARGETS:
        meeting = [c for c in candidates if c["precision"] >= target_precision]
        if meeting:
            chosen = max(meeting, key=rank)
            chosen_target = target_precision
            break
    if chosen is None:
        chosen = max(candidates, key=rank)

    result = dict(chosen)
    result["threshold"] = max(float(result["threshold"]), floor)
    result["target_precision"] = float(chosen_target)
    return result
110
+
111
+
112
def derive_calibration(clf, X_eval, y_eval):
    """Derive per-class confidence thresholds from an evaluation set.

    Returns:
        Dict with the fixed ``defaultThreshold`` (0.80) and
        ``highRiskThreshold`` (0.75), a ``domainThresholds`` mapping of class
        name to threshold, and the per-class ``diagnostics`` dicts the
        thresholds were taken from.
    """
    evaluation = evaluate_classifier(clf, X_eval, y_eval)
    preds, confs = evaluation["preds"], evaluation["confs"]

    diagnostics = {
        str(label): derive_threshold_for_label(label, y_eval, preds, confs)
        for label in clf.classes_
    }
    thresholds = {name: float(diag["threshold"]) for name, diag in diagnostics.items()}

    return {
        "defaultThreshold": 0.80,
        "highRiskThreshold": 0.75,
        "domainThresholds": thresholds,
        "diagnostics": diagnostics,
    }
130
+
131
+
132
def choose_best_report(reports):
    """Return the best report: fewest high-risk downgrades, then accuracy.

    Ties are broken by macro-F1 and finally by the ``model_type`` string.
    """
    def rank(report):
        metrics = report["metrics"]
        return (
            -metrics["high_risk_downgrades"],
            metrics["accuracy"],
            metrics["macro_f1"],
            report["model_type"],
        )

    return max(reports, key=rank)
137
+
138
+
139
def main():
    """Train, compare and persist domain classifiers.

    Trains every candidate model type (``svc`` and ``lr`` when
    ``--model-type auto``), evaluates and calibrates each on
    ``--gold-embeddings`` when given, selects one, and writes
    ``model.joblib`` and ``meta.json`` to ``--out``, plus
    ``calibration.json`` and ``metrics.json`` when gold data was supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--embeddings", required=True, nargs="+")
    parser.add_argument("--gold-embeddings")
    parser.add_argument("--out", required=True)
    parser.add_argument("--model-type", choices=["svc", "lr", "auto"], default="auto")
    args = parser.parse_args()

    X_train, y_train = load_embeddings(args.embeddings)
    print(f"Combined Train size: {len(X_train)}")
    print(f"Train label distribution: {dict(Counter(y_train))}")

    if args.model_type == "auto":
        candidate_model_types = ["svc", "lr"]
    else:
        candidate_model_types = [args.model_type]

    gold_data = None
    if args.gold_embeddings:
        gold_data = load_embeddings([args.gold_embeddings])

    reports = []
    for model_type in candidate_model_types:
        print(f"Training {model_type} with class_weight='balanced'...")
        clf, resampled_size = train_classifier(model_type, X_train, y_train)
        report = {"model_type": model_type, "clf": clf, "resampled_size": resampled_size}

        if gold_data:
            X_gold, y_gold = gold_data
            gold_metrics = evaluate_classifier(clf, X_gold, y_gold)
            report["metrics"] = gold_metrics
            report["calibration"] = derive_calibration(clf, X_gold, y_gold)
            print(json.dumps({
                "model_type": model_type,
                "accuracy": gold_metrics["accuracy"],
                "macro_f1": gold_metrics["macro_f1"],
                "high_risk_downgrades": gold_metrics["high_risk_downgrades"],
            }, indent=2))

        reports.append(report)

    # Without gold data there is nothing to rank on, so the first candidate wins.
    if len(reports) == 1 or not gold_data:
        best = reports[0]
    else:
        best = choose_best_report(reports)
    print(f"Selected model: {best['model_type']}")

    os.makedirs(args.out, exist_ok=True)
    model_path = os.path.join(args.out, "model.joblib")
    joblib.dump(best["clf"], model_path)
    print(f"Model saved to {model_path}")

    meta = {
        "modelType": best["model_type"],
        "trainSize": int(len(X_train)),
        "resampledTrainSize": int(best["resampled_size"]),
    }
    with open(os.path.join(args.out, "meta.json"), "w") as f:
        json.dump(meta, f, indent=2)

    calibration = best.get("calibration")
    if calibration:
        with open(os.path.join(args.out, "calibration.json"), "w") as f:
            json.dump(calibration, f, indent=2)

    best_metrics = best.get("metrics")
    if best_metrics:
        metrics = {
            "task": "domain",
            "modelType": best["model_type"],
            "accuracy": best_metrics["accuracy"],
            "macro_f1": best_metrics["macro_f1"],
            "high_risk_downgrades": best_metrics["high_risk_downgrades"],
            "classes": [str(label) for label in best["clf"].classes_],
        }
        with open(os.path.join(args.out, "metrics.json"), "w") as f:
            json.dump(metrics, f, indent=2)
206
+
207
+
208
+ if __name__ == "__main__":
209
+ main()
@@ -0,0 +1,174 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import re
5
+ from collections import Counter
6
+
7
+ import joblib
8
+ import numpy as np
9
+ from imblearn.over_sampling import RandomOverSampler
10
+ from sklearn.linear_model import LogisticRegression
11
+ from sklearn.metrics import accuracy_score, classification_report, f1_score
12
+ from sklearn.model_selection import GroupKFold
13
+ from sklearn.neural_network import MLPClassifier
14
+ from sklearn.pipeline import Pipeline
15
+ from sklearn.preprocessing import StandardScaler
16
+
17
+ TASK_INPUTS = {
18
+ "conflict": os.path.join("data", "router", "gold-conflict-structured.jsonl"),
19
+ "sufficiency": os.path.join("data", "router", "gold-sufficiency-structured.jsonl"),
20
+ }
21
+
22
+ TASK_BASELINES = {
23
+ "conflict": os.path.join("metrics", "router", "conflict-baseline-provisional.json"),
24
+ "sufficiency": os.path.join("metrics", "router", "sufficiency-baseline-provisional.json"),
25
+ }
26
+
27
+ MODEL_BUILDERS = {
28
+ "lr": lambda: Pipeline([
29
+ ("scaler", StandardScaler()),
30
+ ("clf", LogisticRegression(max_iter=5000, class_weight="balanced")),
31
+ ]),
32
+ "mlp": lambda: Pipeline([
33
+ ("scaler", StandardScaler()),
34
+ ("clf", MLPClassifier(hidden_layer_sizes=(32, 16), max_iter=2000, random_state=42, early_stopping=False)),
35
+ ]),
36
+ }
37
+
38
+
39
def normalize_query_group(query: str) -> str:
    """Normalise a query into a grouping key.

    Lower-cases the text, replaces every run of characters other than
    ``a-z``, ``0-9`` and whitespace with a space, collapses whitespace runs
    to a single space and strips the ends. A falsy query gives "".
    """
    lowered = (query or "").lower()
    alnum_only = re.sub(r"[^a-z0-9\s]+", " ", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()
41
+
42
+
43
def load_jsonl(path: str):
    """Parse a JSON Lines file into a list, skipping blank lines.

    The file is read as UTF-8 explicitly so parsing does not depend on the
    platform's default text encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
46
+
47
+
48
def build_xy(rows):
    """Turn structured rows into model inputs.

    The feature order is the sorted key set of the first row's ``features``
    mapping; every row is indexed with those names.

    Returns:
        ``(X, y, groups, feature_names)`` — a float32 feature matrix, the
        ``label`` array, the normalised query of each row (used as its
        cross-validation group) and the feature name list.
    """
    feature_names = sorted(rows[0]["features"].keys())

    matrix = []
    for row in rows:
        values = row["features"]
        matrix.append([values[name] for name in feature_names])

    X = np.array(matrix, dtype=np.float32)
    y = np.array([row["label"] for row in rows])
    groups = np.array([normalize_query_group(row["query"]) for row in rows])
    return X, y, groups, feature_names
54
+
55
+
56
def choose_n_splits(y, groups):
    """Choose a fold count between 2 and 5.

    The count is capped by the number of distinct groups and by the size of
    the smallest class, but is never lower than 2.
    """
    smallest_class = min(Counter(y).values())
    distinct_groups = len(set(groups))
    upper = min(5, distinct_groups, smallest_class)
    return max(2, upper)
59
+
60
+
61
def evaluate_model(model_name, rows):
    """Cross-validate one structured model with query-grouped folds.

    Rows sharing a normalised query stay in the same fold (``GroupKFold``).
    Within each fold the training split is oversampled, a fresh model from
    ``MODEL_BUILDERS[model_name]`` is fitted, and out-of-fold predictions are
    collected.

    ``zero_division=0`` is passed to the scikit-learn metrics so classes that
    are never predicted score 0 without emitting warnings, matching the
    other evaluators in this package.

    Returns:
        Dict with ``model``, ``accuracy``, ``macro_f1``, a
        ``classification_report`` dict, per-row ``rows`` (fold, query, gold,
        pred, confidence) and the ``feature_names`` used.
    """
    X, y, groups, feature_names = build_xy(rows)
    splitter = GroupKFold(n_splits=choose_n_splits(y, groups))
    gold, pred = [], []
    fold_rows = []

    for fold, (train_idx, test_idx) in enumerate(splitter.split(X, y, groups), start=1):
        # Oversample only the training split so test rows are never duplicated.
        ros = RandomOverSampler(random_state=42)
        X_train, y_train = ros.fit_resample(X[train_idx], y[train_idx])
        clf = MODEL_BUILDERS[model_name]()
        clf.fit(X_train, y_train)
        probs = clf.predict_proba(X[test_idx]) if hasattr(clf, "predict_proba") else None
        preds = clf.predict(X[test_idx])

        for local_idx, pred_label in enumerate(preds):
            idx = test_idx[local_idx]
            confidence = None
            if probs is not None:
                confidence = float(np.max(probs[local_idx]))
            gold.append(str(y[idx]))
            pred.append(str(pred_label))
            fold_rows.append({
                "fold": fold,
                "query": rows[idx]["query"],
                "gold": str(y[idx]),
                "pred": str(pred_label),
                "confidence": confidence,
            })

    return {
        "model": model_name,
        "accuracy": accuracy_score(gold, pred),
        "macro_f1": f1_score(gold, pred, average="macro", zero_division=0),
        "classification_report": classification_report(gold, pred, output_dict=True, zero_division=0),
        "rows": fold_rows,
        "feature_names": feature_names,
    }
98
+
99
+
100
def train_full_model(model_name, rows):
    """Fit ``model_name`` on all rows (after oversampling).

    Returns:
        ``(fitted_model, feature_names)``.
    """
    X, y, _groups, feature_names = build_xy(rows)
    X_balanced, y_balanced = RandomOverSampler(random_state=42).fit_resample(X, y)

    model = MODEL_BUILDERS[model_name]()
    model.fit(X_balanced, y_balanced)
    return model, feature_names
107
+
108
+
109
def load_baseline_metrics(task: str):
    """Read the provisional baseline metrics recorded for ``task``.

    The file path comes from ``TASK_BASELINES[task]``.

    Returns:
        Dict with ``accuracy``, ``macroF1`` and ``falseSufficient`` (each
        None when absent from the file), or None when the file does not
        exist.

    The file is opened directly and read as UTF-8, so a file removed
    between an existence check and the open cannot raise.
    """
    path = TASK_BASELINES[task]
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return None
    return {
        "accuracy": data.get("accuracy"),
        "macroF1": data.get("macroF1"),
        "falseSufficient": data.get("falseSufficient"),
    }
120
+
121
+
122
def main():
    """Compare the structured models for one task and persist the best.

    Cross-validates the ``lr`` and ``mlp`` builders on the task's rows, picks
    the one with the highest ``(macro_f1, accuracy)``, retrains it on all
    rows, and writes ``model.joblib``, ``feature-names.json`` and
    ``meta.json`` to the output directory, a full report to
    ``metrics/router/<task>-structured-models.json``, and a short summary to
    standard output.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", choices=["conflict", "sufficiency"], required=True)
    parser.add_argument("--input")
    parser.add_argument("--out-dir")
    args = parser.parse_args()

    task = args.task
    input_path = args.input or TASK_INPUTS[task]
    out_dir = args.out_dir or os.path.join(".cache", "models", "pi-research-router", f"{task}-structured")
    metrics_path = os.path.join("metrics", "router", f"{task}-structured-models.json")

    rows = load_jsonl(input_path)
    baseline = load_baseline_metrics(task)

    reports = {}
    for model_name in ["lr", "mlp"]:
        reports[model_name] = evaluate_model(model_name, rows)

    def rank(name):
        return (reports[name]["macro_f1"], reports[name]["accuracy"])

    best_name = max(reports.keys(), key=rank)
    best_model, feature_names = train_full_model(best_name, rows)
    best_report = reports[best_name]

    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(os.path.dirname(metrics_path), exist_ok=True)

    joblib.dump(best_model, os.path.join(out_dir, "model.joblib"))
    with open(os.path.join(out_dir, "feature-names.json"), "w") as f:
        json.dump(feature_names, f, indent=2)
    with open(os.path.join(out_dir, "meta.json"), "w") as f:
        json.dump({"task": task, "bestModel": best_name, "rows": len(rows)}, f, indent=2)

    summary = {
        "task": task,
        "rows": len(rows),
        "baseline": baseline,
        "best_model": best_name,
        "models": reports,
    }
    with open(metrics_path, "w") as f:
        json.dump(summary, f, indent=2)

    console_summary = {
        "task": task,
        "rows": len(rows),
        "baseline": baseline,
        "best_model": best_name,
        "best_accuracy": best_report["accuracy"],
        "best_macro_f1": best_report["macro_f1"],
        "lr_macro_f1": reports["lr"]["macro_f1"],
        "mlp_macro_f1": reports["mlp"]["macro_f1"],
    }
    print(json.dumps(console_summary, indent=2))
171
+
172
+
173
+ if __name__ == "__main__":
174
+ main()
package/package.json CHANGED
@@ -1,20 +1,21 @@
1
1
  {
2
2
  "name": "pi-research",
3
- "version": "1.4.0",
3
+ "version": "1.4.1",
4
4
  "private": false,
5
5
  "type": "module",
6
6
  "description": "Zero-setup grounded web research for AI coding agents.",
7
7
  "license": "MIT",
8
8
  "main": "./index.js",
9
9
  "bin": {
10
- "pi-research": "./bin/pi-research.js",
11
- "unblind-mcp": "./bin/unblind-mcp.js"
10
+ "pi-research": "bin/pi-research.js",
11
+ "unblind-mcp": "bin/unblind-mcp.js"
12
12
  },
13
13
  "files": [
14
14
  "bin",
15
15
  "extensions",
16
16
  "index.js",
17
17
  "lib",
18
+ "ml",
18
19
  "mcp",
19
20
  "mcp-server.js",
20
21
  "pi-research.js",