@geravant/sinain 1.12.0 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/.env.example +4 -2
  2. package/config-shared.js +1 -0
  3. package/package.json +4 -1
  4. package/sinain-agent/run.sh +36 -4
  5. package/sinain-core/package-lock.json +963 -0
  6. package/sinain-core/package.json +1 -0
  7. package/sinain-core/src/buffers/feed-buffer.ts +34 -0
  8. package/sinain-core/src/embedding/service.ts +66 -0
  9. package/sinain-core/src/index.ts +65 -17
  10. package/sinain-core/src/learning/local-curation.ts +137 -7
  11. package/sinain-core/src/server.ts +31 -0
  12. package/sinain-memory/README.md +105 -0
  13. package/sinain-memory/embed_client.py +117 -0
  14. package/sinain-memory/graph_query.py +269 -18
  15. package/sinain-memory/knowledge_integrator.py +551 -74
  16. package/sinain-memory/memory-config.json +1 -1
  17. package/sinain-memory/session_distiller.py +43 -19
  18. package/sinain-memory/triplestore.py +60 -0
  19. package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
  20. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  21. package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
  22. package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
  23. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  24. package/sinain-memory/eval/__init__.py +0 -0
  25. package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
  26. package/sinain-memory/eval/assertions.py +0 -267
  27. package/sinain-memory/eval/benchmarks/__init__.py +0 -0
  28. package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  29. package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
  30. package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
  31. package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
  32. package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
  33. package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
  34. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  35. package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
  36. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  37. package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
  38. package/sinain-memory/eval/benchmarks/config.py +0 -23
  39. package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
  40. package/sinain-memory/eval/benchmarks/ingest.py +0 -152
  41. package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
  42. package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
  43. package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
  44. package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
  45. package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
  46. package/sinain-memory/eval/benchmarks/query.py +0 -172
  47. package/sinain-memory/eval/benchmarks/report.py +0 -87
  48. package/sinain-memory/eval/benchmarks/runner.py +0 -276
  49. package/sinain-memory/eval/judges/__init__.py +0 -0
  50. package/sinain-memory/eval/judges/base_judge.py +0 -61
  51. package/sinain-memory/eval/judges/curation_judge.py +0 -46
  52. package/sinain-memory/eval/judges/insight_judge.py +0 -48
  53. package/sinain-memory/eval/judges/mining_judge.py +0 -42
  54. package/sinain-memory/eval/judges/signal_judge.py +0 -45
  55. package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
  56. package/sinain-memory/eval/retrieval_evaluator.py +0 -186
  57. package/sinain-memory/eval/schemas.py +0 -247
  58. package/sinain-memory/tests/__init__.py +0 -0
  59. package/sinain-memory/tests/conftest.py +0 -189
  60. package/sinain-memory/tests/test_curator_helpers.py +0 -94
  61. package/sinain-memory/tests/test_embedder.py +0 -210
  62. package/sinain-memory/tests/test_extract_json.py +0 -124
  63. package/sinain-memory/tests/test_feedback_computation.py +0 -121
  64. package/sinain-memory/tests/test_miner_helpers.py +0 -71
  65. package/sinain-memory/tests/test_module_management.py +0 -458
  66. package/sinain-memory/tests/test_parsers.py +0 -96
  67. package/sinain-memory/tests/test_tick_evaluator.py +0 -430
  68. package/sinain-memory/tests/test_triple_extractor.py +0 -255
  69. package/sinain-memory/tests/test_triple_ingest.py +0 -191
  70. package/sinain-memory/tests/test_triple_migrate.py +0 -138
  71. package/sinain-memory/tests/test_triplestore.py +0 -248
@@ -1,172 +0,0 @@
1
- """Query pipeline — benchmark questions → LLM answers under 3 conditions.
2
-
3
- Condition A (sinain-memory): answer from knowledge graph facts
4
- Condition B (full-context): answer from full conversation history
5
- Condition C (knowledge-doc): answer from portable knowledge document
6
- """
7
-
8
- from __future__ import annotations
9
-
10
- import sys
11
- from pathlib import Path
12
-
13
- # Add sinain-memory to path
14
- _koog_dir = str(Path(__file__).resolve().parent.parent.parent)
15
- if _koog_dir not in sys.path:
16
- sys.path.insert(0, _koog_dir)
17
-
18
- from common import call_llm # noqa: E402
19
-
20
- from .base_adapter import BenchmarkQuestion
21
- from .config import QA_MODEL, MAX_FACTS_PER_QUERY
22
-
23
-
24
def _extract_keywords(query: str) -> list[str]:
    """Return the lowercase search terms found in *query*.

    A token is a run that starts with an ASCII letter and continues with
    letters, digits or hyphens. Tokens of one or two characters and common
    English stopwords are dropped. Order and duplicates are preserved.
    """
    import re

    ignored = frozenset((
        "the", "is", "in", "on", "for", "and", "or", "of", "to", "a", "an",
        "it", "was", "not", "how", "what", "when", "does", "did", "do", "my",
        "your", "their", "have", "has", "had", "are", "were", "been", "being",
        "about", "from", "with", "that", "this", "which", "who", "whom",
        "where", "why", "can", "could", "would", "should",
    ))

    keywords: list[str] = []
    for token in re.findall(r"[a-zA-Z][a-zA-Z0-9-]+", query.lower()):
        if len(token) <= 2 or token in ignored:
            continue
        keywords.append(token)
    return keywords
36
-
37
-
38
def _get_all_facts_text(db_path: str) -> str:
    """Render the store's top facts as one text block.

    Requests at most 50 facts from ``graph_query.query_top_facts`` and formats
    them with ``graph_query.format_facts_text`` capped at 6000 characters.
    Rationale from the original author: sinain triplestores are small
    (10-50 facts per session), so passing everything along is feasible and
    avoids tag-matching failures.

    Returns the placeholder ``"(no knowledge available)"`` when the query
    yields nothing.
    """
    from graph_query import format_facts_text, query_top_facts

    top_facts = query_top_facts(db_path, limit=50)
    if top_facts:
        return format_facts_text(top_facts, max_chars=6000)
    return "(no knowledge available)"
50
-
51
-
52
def _query_knowledge(db_path: str, question: str) -> str:
    """Return formatted knowledge-graph facts to answer *question* with.

    Targeted retrieval comes first: keywords extracted from the question are
    passed to ``graph_query.query_facts_by_entities`` (at most
    ``MAX_FACTS_PER_QUERY`` facts) and any result is formatted with a
    3000-character cap. If the question yields no keywords, or the lookup
    returns nothing, the text from ``_get_all_facts_text`` is returned instead.
    """
    from graph_query import format_facts_text, query_facts_by_entities

    terms = _extract_keywords(question)
    matched = (
        query_facts_by_entities(db_path, terms, max_facts=MAX_FACTS_PER_QUERY)
        if terms
        else None
    )
    if matched:
        return format_facts_text(matched, max_chars=3000)

    # Nothing matched the question's terms: fall back to the general dump.
    return _get_all_facts_text(db_path)
68
-
69
-
70
def _get_retrieved_facts(db_path: str, question: str, k: int = 10) -> list[dict]:
    """Return up to *k* facts retrieved for *question* (used for retrieval evaluation).

    Keyword-based lookup via ``graph_query.query_facts_by_entities`` is tried
    first. When there are no keywords or no matches, the result of
    ``graph_query.query_top_facts(db_path, limit=k)`` is returned instead.
    """
    from graph_query import query_facts_by_entities, query_top_facts

    terms = _extract_keywords(question)
    matched = query_facts_by_entities(db_path, terms, max_facts=k) if terms else None
    if matched:
        return matched

    # Fallback: whatever the store ranks highest.
    return query_top_facts(db_path, limit=k)
82
-
83
-
84
def compute_content_recall(
    retrieved_facts: list[dict],
    gold_answer: str,
    k_values: list[int] | None = None,
) -> dict:
    """Content-based retrieval metric: do retrieved facts contain the answer?

    Instead of matching entity IDs (which don't align between LongMemEval
    session IDs and sinain entity IDs), this checks whether the gold answer's
    key terms appear in a retrieved fact's content.

    Args:
        retrieved_facts: Ranked fact dicts; the ``"entity"`` and ``"value"``
            keys are read (missing keys count as empty text).
        gold_answer: Reference answer; converted with ``str()`` before
            keyword extraction.
        k_values: Cut-offs to report. When ``None`` or empty, ``K_VALUES``
            from the sibling ``config`` module is used.

    Returns:
        ``{"content_recall@k": 1.0 | 0.0}`` for each cut-off. A cut-off scores
        1.0 when at least one of the first *k* facts shares at least half of
        the gold answer's keywords. Every cut-off is 0.0 when the gold answer
        has no keywords.
    """
    if k_values:
        ks = k_values
    else:
        # Imported lazily so callers that pass explicit cut-offs don't need config.
        from .config import K_VALUES
        ks = K_VALUES

    gold_terms = set(_extract_keywords(str(gold_answer)))
    if not gold_terms:
        return {f"content_recall@{k}": 0.0 for k in ks}

    def _covers_answer(fact: dict) -> bool:
        fact_text = f"{fact.get('entity', '')} {fact.get('value', '')}"
        overlap = gold_terms & set(_extract_keywords(fact_text))
        # True ">= 50% of gold terms" test. The previous floor division
        # (len // 2) accepted e.g. 1 of 3 terms. gold_terms is non-empty
        # here, so this also requires at least one shared term.
        return 2 * len(overlap) >= len(gold_terms)

    # Score each fact once; every cut-off then only inspects its own prefix.
    hits = [_covers_answer(fact) for fact in retrieved_facts]
    return {f"content_recall@{k}": 1.0 if any(hits[:k]) else 0.0 for k in ks}
117
-
118
-
119
# Upper bound on conversation-history characters sent to the model.
_MAX_CONTEXT_CHARS = 100_000


def answer_question(
    question: BenchmarkQuestion,
    condition: str,
    *,
    db_path: str | None = None,
    full_context: str | None = None,
    knowledge_doc: str | None = None,
    model: str | None = None,
) -> str:
    """Generate an answer for a benchmark question under a specific condition.

    Args:
        question: Benchmark question; only its ``text`` attribute is read.
        condition: One of ``"sinain-memory"``, ``"full-context"`` or
            ``"knowledge-doc"``.
        db_path: Knowledge-graph database path (needed for ``sinain-memory``).
        full_context: Conversation history (needed for ``full-context``);
            truncated to the first 100,000 characters.
        knowledge_doc: Knowledge document (needed for ``knowledge-doc``).
        model: Model override; falls back to ``QA_MODEL``.

    Returns:
        The stripped LLM answer text, or ``"(error: ...)"`` if the LLM call
        raised.

    Raises:
        ValueError: If *condition* is unknown, or the input that condition
            needs is missing or empty. This was previously an ``assert``,
            which disappears under ``python -O``.
    """
    qa_model = model or QA_MODEL

    if condition == "sinain-memory":
        if not db_path:
            raise ValueError("db_path required for sinain-memory condition")
        facts = _query_knowledge(db_path, question.text)
        system = (
            "Answer the question using ONLY the provided knowledge facts. "
            "If the facts don't contain enough information to answer, say \"I don't know.\""
        )
        user = f"## Knowledge Facts\n{facts}\n\n## Question\n{question.text}"

    elif condition == "full-context":
        if not full_context:
            raise ValueError("full_context required for full-context condition")
        system = (
            "Answer the question based on the conversation history below. "
            "Be concise and specific."
        )
        # Truncate context if too large (some models have limits). Slicing is
        # already a no-op for shorter strings.
        ctx = full_context[:_MAX_CONTEXT_CHARS]
        user = f"## Conversation History\n{ctx}\n\n## Question\n{question.text}"

    elif condition == "knowledge-doc":
        if not knowledge_doc:
            raise ValueError("knowledge_doc required for knowledge-doc condition")
        system = (
            "Answer the question using ONLY the knowledge document provided. "
            "If the document doesn't contain enough information, say \"I don't know.\""
        )
        user = f"## Knowledge Document\n{knowledge_doc}\n\n## Question\n{question.text}"

    else:
        raise ValueError(f"Unknown condition: {condition}")

    try:
        return call_llm(
            system_prompt=system,
            user_prompt=user,
            model=qa_model,
            max_tokens=300,
        ).strip()
    except Exception as e:
        # Deliberate best-effort: a failed call becomes an answer string
        # instead of propagating to the caller.
        return f"(error: {e})"
@@ -1,87 +0,0 @@
1
- """Report generation — markdown, JSON, and LaTeX output."""
2
-
3
- from __future__ import annotations
4
-
5
- import json
6
- from datetime import datetime, timezone
7
-
8
-
9
def generate_markdown(benchmark_name: str, summary: dict, details: list[dict]) -> str:
    """Generate a publishable markdown report.

    Sections are emitted only when their data is present:

    * headline IPR (``summary["ipr"]``, including a value of 0.0),
    * per-condition score table (``summary["conditions"]``),
    * retrieval metrics (``summary["retrieval"]``),
    * per-category breakdown (``summary["categories"]``),
    * the five lowest-scoring ``sinain-memory`` questions from *details*.

    Args:
        benchmark_name: Name shown in the report title.
        summary: Aggregated results; the keys listed above are read.
        details: Per-question entries. Entries whose
            ``answers["sinain-memory"]["score"]`` is missing or ``None`` are
            left out of the "hardest questions" list.

    Returns:
        The report as a single newline-joined string.
    """
    lines = [
        f"# Sinain Knowledge Graph — {benchmark_name} Results",
        f"\nGenerated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}",
        "",
    ]

    # Headline IPR. Compare against None so a legitimate 0.0 is still reported.
    ipr = summary.get("ipr")
    if ipr is not None:
        lines.append(f"**Information Preservation Rate (IPR)**: {ipr:.1%}")
        lines.append("")

    # Condition scores table
    conditions = summary.get("conditions", {})
    if conditions:
        lines.extend([
            "| Condition | Mean Score (1-5) | Mean F1 | N |",
            "|-----------|------------------|---------|---|",
        ])
        for cond in sorted(conditions):
            c = conditions[cond]
            lines.append(f"| {cond} | {c['mean_score']:.2f} | {c.get('mean_f1', 0):.2f} | {c['n']} |")
        lines.append("")

    # Retrieval metrics
    retrieval = summary.get("retrieval", {})
    if retrieval:
        lines.append("## Retrieval Quality")
        lines.append("| Metric | Score |")
        lines.append("|--------|-------|")
        for k, v in sorted(retrieval.items()):
            lines.append(f"| {k} | {v:.1%} |")
        lines.append("")

    # Category breakdown
    categories = summary.get("categories", {})
    if categories:
        lines.append("## By Category")
        cond_names = sorted({c for cat in categories.values() for c in cat})
        lines.append("| Category | " + " | ".join(cond_names) + " |")
        lines.append("|----------|" + "|".join(["------"] * len(cond_names)) + "|")
        for cat in sorted(categories):
            cells = []
            for cond in cond_names:
                cell = categories[cat].get(cond)
                if cell is None:
                    cells.append("-")
                else:
                    cells.append(f"{cell['mean_score']:.2f} (n={cell['n']})")
            lines.append(f"| {cat} | " + " | ".join(cells) + " |")
        lines.append("")

    # Failures (worst questions)
    if details:
        sm_details = [
            d for d in details
            if d.get("answers", {}).get("sinain-memory", {}).get("score") is not None
        ]
        sm_details.sort(key=lambda d: d["answers"]["sinain-memory"]["score"])
        if sm_details:
            lines.append("## Hardest Questions for sinain-memory (bottom 5)")
            for d in sm_details[:5]:
                sm = d["answers"]["sinain-memory"]
                fc_score = d["answers"].get("full-context", {}).get("score")
                # A recorded-but-empty score is shown as "?" rather than "None".
                fc_shown = "?" if fc_score is None else fc_score
                lines.append(f"- **{d['id']}** [{d['category']}]: score={sm['score']}/5 "
                             f"(full-context: {fc_shown}/5)")
                # Only mark the question as elided when it really was cut.
                question = d["question"]
                preview = question if len(question) <= 100 else f"{question[:100]}..."
                lines.append(f" Q: {preview}")
            lines.append("")

    return "\n".join(lines)
78
-
79
-
80
def generate_json(benchmark_name: str, summary: dict, details: list[dict]) -> str:
    """Serialise a benchmark run as an indented JSON document.

    The document holds the benchmark name, the current UTC time in ISO-8601
    form, the *summary* and the per-question *details*. Non-ASCII characters
    are written as-is rather than escaped.
    """
    generated_at = datetime.now(timezone.utc).isoformat()
    payload = {
        "benchmark": benchmark_name,
        "timestamp": generated_at,
        "summary": summary,
        "details": details,
    }
    return json.dumps(payload, ensure_ascii=False, indent=2)
@@ -1,276 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Benchmark runner — evaluates sinain's knowledge graph against published benchmarks.
3
-
4
- Usage:
5
- python3 eval/benchmarks/runner.py --benchmarks longmemeval --subset 5
6
- python3 eval/benchmarks/runner.py --benchmarks longmemeval --conditions sinain-memory,full-context
7
- python3 eval/benchmarks/runner.py --benchmarks longmemeval --format markdown --resume
8
- """
9
-
10
- from __future__ import annotations
11
-
12
- import argparse
13
- import json
14
- import sys
15
- from pathlib import Path
16
-
17
- # Add sinain-memory to path
18
- _koog_dir = str(Path(__file__).resolve().parent.parent.parent)
19
- if _koog_dir not in sys.path:
20
- sys.path.insert(0, _koog_dir)
21
-
22
- from eval.benchmarks.config import DATA_DIR, RESULTS_DIR, QA_MODEL, JUDGE_MODEL
23
- from eval.benchmarks.base_adapter import BenchmarkAdapter, BenchmarkInstance
24
- from eval.benchmarks.longmemeval_adapter import LongMemEvalAdapter
25
- from eval.benchmarks.ingest import ingest_instance, get_knowledge_doc
26
- from eval.benchmarks.query import answer_question, _get_retrieved_facts, compute_content_recall
27
- from eval.benchmarks.evaluate import (
28
- token_f1, aggregate_results,
29
- )
30
- from eval.benchmarks.judges.qa_judge import judge_qa
31
- from eval.benchmarks.report import generate_markdown, generate_json
32
-
33
-
34
def _get_adapter(name: str) -> BenchmarkAdapter:
    """Return a new adapter for the benchmark called *name*.

    Only ``"longmemeval"`` is recognised; any other name raises ``ValueError``.
    """
    if name != "longmemeval":
        raise ValueError(f"Unknown benchmark: {name}. Available: longmemeval")
    return LongMemEvalAdapter()
38
-
39
-
40
def _load_resume(resume_path: Path) -> dict[str, dict]:
    """Load previously computed results for resume support.

    The progress file holds one JSON object per line. Blank lines are
    ignored. A line that is not valid JSON, or has no ``"id"`` key, is
    skipped with a warning on stderr instead of aborting the resume; a run
    interrupted mid-write can leave such a truncated final line. If an id
    occurs more than once, the last entry wins.

    Args:
        resume_path: Path of the JSONL progress file; it may not exist.

    Returns:
        Mapping from entry id to the decoded entry (empty if the file is
        missing).
    """
    results: dict[str, dict] = {}
    if not resume_path.exists():
        return results

    # Split on "\n" only: str.splitlines() would also break on characters
    # such as U+2028, which json.dumps(..., ensure_ascii=False) emits raw.
    for lineno, raw in enumerate(resume_path.read_text().split("\n"), start=1):
        line = raw.strip()
        if not line:
            continue
        try:
            entry = json.loads(line)
            results[entry["id"]] = entry
        except (json.JSONDecodeError, KeyError, TypeError) as exc:
            print(
                f"[warn] skipping unreadable progress line {lineno}: {exc}",
                file=sys.stderr,
            )
    return results
49
-
50
-
51
def _stratified_sample(pairs: list, subset: int) -> list:
    """Pick up to *subset* (instance, question) pairs, taking equal shares per question category."""
    from collections import defaultdict

    by_cat: dict[str, list] = defaultdict(list)
    for pair in pairs:
        by_cat[pair[1].category].append(pair)
    if not by_cat:
        # Empty dataset: nothing to sample (and nothing to divide by).
        return []

    per_cat = max(1, subset // len(by_cat))
    sampled: list = []
    for cat in sorted(by_cat):
        sampled.extend(by_cat[cat][:per_cat])
    return sampled[:subset]


def run_benchmark(
    benchmark_name: str,
    conditions: list[str],
    *,
    subset: int | None = None,
    qa_model: str = QA_MODEL,
    judge_model: str = JUDGE_MODEL,
    output_dir: Path = RESULTS_DIR,
    cache_dir: Path = DATA_DIR,
    resume: bool = False,
    skip_llm: bool = False,
    stratified: bool = False,
) -> tuple[dict, list[dict]]:
    """Run a benchmark end-to-end. Returns (summary, details).

    Args:
        benchmark_name: Name understood by ``_get_adapter``.
        conditions: Condition names to evaluate for every question.
        subset: If truthy, evaluate only this many questions.
        qa_model: Model passed to ``answer_question``.
        judge_model: Model passed to ``judge_qa``.
        output_dir: Directory for the ``<benchmark>_progress.jsonl`` file.
        cache_dir: Dataset directory; ingestion uses
            ``cache_dir / benchmark_name``.
        resume: Reuse entries already present in the progress file.
        skip_llm: Record ``"(skipped)"`` answers instead of calling the QA
            and judge models.
        stratified: With *subset*, sample equally from each question
            category instead of taking the first *subset* questions.

    Returns:
        ``(summary, details)``: *details* holds one entry per question and
        *summary* is ``aggregate_results(details)``.
    """
    adapter = _get_adapter(benchmark_name)

    # Load dataset
    print(f"\n{'='*60}")
    print(f" Benchmark: {benchmark_name}")
    print(f" Conditions: {', '.join(conditions)}")
    print(f" QA model: {qa_model}")
    print(f" Judge model: {judge_model}")
    print(f"{'='*60}\n")

    instances = adapter.load_dataset(str(cache_dir))

    # Flatten questions
    all_questions = [(inst, q) for inst in instances for q in inst.questions]

    if subset:
        if stratified:
            all_questions = _stratified_sample(all_questions, subset)
        else:
            all_questions = all_questions[:subset]

    total = len(all_questions)
    print(f"[runner] evaluating {total} questions\n")

    # Resume support
    resume_path = output_dir / f"{benchmark_name}_progress.jsonl"
    completed = _load_resume(resume_path) if resume else {}
    output_dir.mkdir(parents=True, exist_ok=True)

    needs_ingest = "sinain-memory" in conditions or "knowledge-doc" in conditions
    # The formatted history is only consumed by the full-context QA call.
    needs_full_context = "full-context" in conditions and not skip_llm

    # Track ingested instances
    instance_dbs: dict[str, Path | None] = {}
    instance_docs: dict[str, str] = {}

    # Single-entry cache: consecutive questions usually share an instance,
    # and keeping every formatted history alive could be large.
    cached_ctx_id = None
    cached_ctx: str | None = None

    details: list[dict] = []

    for idx, (inst, question) in enumerate(all_questions):
        qid = question.id

        # Skip if already done
        if qid in completed:
            details.append(completed[qid])
            continue

        print(f"[{idx+1}/{total}] {qid} [{question.category}]")

        # Ingest instance if not done yet
        if inst.id not in instance_dbs:
            if needs_ingest:
                print(f" ingesting {inst.id} ({len(inst.sessions)} sessions)...")
                instance_dbs[inst.id] = ingest_instance(inst, cache_dir / benchmark_name)
                db = instance_dbs[inst.id]
                if db:
                    instance_docs[inst.id] = get_knowledge_doc(db)
                    print(f" -> ingested ({db.stat().st_size} bytes)")
                else:
                    instance_docs[inst.id] = "(ingestion failed)"
                    print(" -> ingestion failed")
            else:
                instance_dbs[inst.id] = None
                instance_docs[inst.id] = ""

        db_path = instance_dbs.get(inst.id)
        knowledge_doc = instance_docs.get(inst.id, "")

        full_context = None
        if needs_full_context:
            if cached_ctx_id != inst.id:
                cached_ctx = adapter.format_full_context(inst)
                cached_ctx_id = inst.id
            full_context = cached_ctx

        # Retrieval metrics (content-based: do retrieved facts contain the answer?)
        retrieval = {}
        if db_path and "sinain-memory" in conditions:
            retrieved_facts = _get_retrieved_facts(str(db_path), question.text)
            retrieval = compute_content_recall(
                retrieved_facts, question.gold_answer,
            )

        # Generate answers per condition
        answers = {}
        for cond in conditions:
            if skip_llm:
                answers[cond] = {"text": "(skipped)", "score": None, "f1": None}
                continue

            # Skip sinain-memory/knowledge-doc if ingestion failed
            if cond in ("sinain-memory", "knowledge-doc") and not db_path:
                answers[cond] = {"text": "(ingestion failed)", "score": 1, "f1": 0.0, "reasoning": "ingestion failed"}
                print(f" [{cond}] skipped (ingestion failed)")
                continue

            print(f" [{cond}] generating answer...")
            answer_text = answer_question(
                question, cond,
                db_path=str(db_path) if db_path else None,
                full_context=full_context,
                knowledge_doc=knowledge_doc,
                model=qa_model,
            )

            # Score
            f1 = token_f1(answer_text, question.gold_answer)

            judge_result = judge_qa(
                question.text, question.gold_answer, answer_text,
                condition=cond, model=judge_model,
            )
            score = judge_result["score"] if judge_result else None
            reasoning = judge_result["reasoning"] if judge_result else None

            answers[cond] = {
                "text": answer_text[:500],
                "score": score,
                "f1": round(f1, 4),
                "reasoning": reasoning,
            }
            print(f" score={score}/5 f1={f1:.2f}")

        entry = {
            "id": qid,
            "question": question.text,
            "gold_answer": question.gold_answer,
            "category": question.category,
            "retrieval": retrieval,
            "answers": answers,
        }
        details.append(entry)

        # Save progress incrementally.
        # NOTE(review): the file is appended to even when resume is False, so
        # entries from earlier runs stay in it — confirm that is intended.
        with open(resume_path, "a") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    # Aggregate
    summary = aggregate_results(details)
    return summary, details
208
-
209
-
210
def _split_csv(value: str) -> list[str]:
    """Split a comma-separated CLI value, trimming spaces and dropping empty items."""
    return [item.strip() for item in value.split(",") if item.strip()]


def main() -> None:
    """Command-line entry point: run each requested benchmark and write reports.

    For every benchmark named in ``--benchmarks`` this calls
    ``run_benchmark``, writes ``<name>_results.json`` and/or
    ``<name>_results.md`` into ``--output-dir`` (per ``--format``), and
    prints a short summary to stdout.
    """
    parser = argparse.ArgumentParser(description="Sinain Knowledge Graph Benchmark Runner")
    # The help text lists only what _get_adapter accepts.
    parser.add_argument("--benchmarks", default="longmemeval",
                        help="Comma-separated benchmark names (available: longmemeval)")
    parser.add_argument("--conditions", default="sinain-memory,full-context,knowledge-doc",
                        help="Comma-separated conditions to evaluate")
    parser.add_argument("--subset", type=int, default=None,
                        help="Run only first N questions (for dev iteration)")
    parser.add_argument("--qa-model", default=QA_MODEL, help="Model for QA generation")
    parser.add_argument("--judge-model", default=JUDGE_MODEL, help="Model for QA judging")
    parser.add_argument("--output-dir", type=Path, default=RESULTS_DIR)
    parser.add_argument("--cache-dir", type=Path, default=DATA_DIR)
    parser.add_argument("--format", default="json,markdown",
                        help="Output formats (json, markdown)")
    parser.add_argument("--resume", action="store_true", help="Resume from partial results")
    parser.add_argument("--skip-llm", action="store_true",
                        help="Skip LLM calls (retrieval + mechanical metrics only)")
    parser.add_argument("--stratified", action="store_true",
                        help="Sample equally from each question category (with --subset)")
    args = parser.parse_args()

    conditions = _split_csv(args.conditions)
    formats = _split_csv(args.format)

    for bench_name in _split_csv(args.benchmarks):
        summary, details = run_benchmark(
            bench_name, conditions,
            subset=args.subset,
            qa_model=args.qa_model,
            judge_model=args.judge_model,
            output_dir=args.output_dir,
            cache_dir=args.cache_dir,
            resume=args.resume,
            skip_llm=args.skip_llm,
            stratified=args.stratified,
        )

        # Write outputs. The reports keep non-ASCII characters, so the
        # encoding is pinned rather than left to the platform default.
        args.output_dir.mkdir(parents=True, exist_ok=True)

        if "json" in formats:
            json_path = args.output_dir / f"{bench_name}_results.json"
            json_path.write_text(generate_json(bench_name, summary, details), encoding="utf-8")
            print(f"\n[output] JSON: {json_path}")

        if "markdown" in formats:
            md_path = args.output_dir / f"{bench_name}_results.md"
            md_path.write_text(generate_markdown(bench_name, summary, details), encoding="utf-8")
            print(f"[output] Markdown: {md_path}")

        # Print summary
        print(f"\n{'='*60}")
        print(f" {bench_name} — Summary")
        print(f"{'='*60}")
        ipr = summary.get("ipr")
        if ipr is not None:  # 0.0 is a valid rate and must still be shown
            print(f" IPR: {ipr:.1%}")
        for cond, data in summary.get("conditions", {}).items():
            print(f" {cond}: {data['mean_score']:.2f}/5 (n={data['n']})")
        for k, v in summary.get("retrieval", {}).items():
            print(f" {k}: {v:.1%}")
        print()


if __name__ == "__main__":
    main()
File without changes
@@ -1,61 +0,0 @@
1
- """Shared infrastructure for LLM-as-Judge evaluators.
2
-
3
- Provides ``run_judge()`` which calls the LLM with a rubric prompt and
4
- extracts a ``{"score": 1-4, "reasoning": "..."}`` response.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
- import json
10
- import sys
11
- from pathlib import Path
12
-
13
- # Add parent dirs so ``common`` is importable when running from anywhere.
14
- _koog_dir = str(Path(__file__).resolve().parent.parent.parent)
15
- if _koog_dir not in sys.path:
16
- sys.path.insert(0, _koog_dir)
17
-
18
- from common import LLMError, call_llm, extract_json # noqa: E402
19
-
20
-
21
def run_judge(
    system_prompt: str,
    user_prompt: str,
    *,
    model: str | None = None,
    max_tokens: int = 200,
    timeout: int = 30,
) -> dict | None:
    """Call LLM with a judge prompt and return ``{"score": int, "reasoning": str}`` or None.

    Args:
        system_prompt: Rubric given to the judge model.
        user_prompt: Material to be judged.
        model: Explicit model name. When falsy, ``script="tick_evaluator"``
            is passed to ``call_llm`` instead so it resolves the model from
            its own configuration.
        max_tokens: Completion limit forwarded to ``call_llm``.
        timeout: Accepted for interface compatibility; this function does
            not currently forward it to ``call_llm``.

    Returns:
        ``{"score": int, "reasoning": str}`` with the score in 1..4 and the
        reasoning cut to 300 characters, or ``None`` when the call fails or
        the reply is not a JSON object with a valid numeric score.
    """
    try:
        kwargs: dict = {
            "system_prompt": system_prompt,
            "user_prompt": user_prompt,
            "max_tokens": max_tokens,
            "json_mode": True,
        }
        # Use script-based config resolution if no explicit model
        if model:
            kwargs["model"] = model
        else:
            kwargs["script"] = "tick_evaluator"

        raw = call_llm(**kwargs)
        result = extract_json(raw)
    except (ValueError, LLMError, KeyError) as e:
        print(f"[warn] judge call failed: {e}", file=sys.stderr)
        return None

    # A JSON array or scalar would otherwise raise AttributeError on .get().
    if not isinstance(result, dict):
        print(
            f"[warn] judge returned non-object JSON: {type(result).__name__}",
            file=sys.stderr,
        )
        return None

    score = result.get("score")
    reasoning = result.get("reasoning", "")

    # bool is a subclass of int; JSON `true` must not be accepted as score 1.
    is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
    if not is_number or not (1 <= score <= 4):
        print(f"[warn] judge returned invalid score: {score}", file=sys.stderr)
        return None

    return {"score": int(score), "reasoning": str(reasoning)[:300]}
@@ -1,46 +0,0 @@
1
- """LLM-as-Judge: Playbook curation quality evaluator."""
2
-
3
- from __future__ import annotations
4
-
5
- from .base_judge import run_judge
6
-
7
# Rubric sent as the system prompt for every curation judgement.
SYSTEM_PROMPT = """\
You are an evaluator scoring the quality of playbook curation changes.

The curator follows a directive and three laws:
Law 1: Don't remove error-prevention patterns
Law 2: Preserve high-scoring approaches
Law 3: Then evolve

Rate the curation on a 1-4 scale:
4: Changes perfectly match directive + evidence, three laws respected
3: Good changes, minor alignment issues with directive
2: Changes misaligned with directive or weak evidence
1: Destructive changes, violated three laws, or ignored directive entirely

Respond with ONLY a JSON object: {"score": <1-4>, "reasoning": "brief explanation"}"""


def judge_curation(
    curator_result: dict,
    directive: str,
    playbook_before: str = "",
    **kwargs,
) -> dict | None:
    """Evaluate playbook curation quality. Returns {"score": 1-4, "reasoning": str} or None.

    Builds a user prompt from the directive, the curator's reported changes
    (``changes.added`` / ``pruned`` / ``promoted``), its ``staleItemActions``
    and ``playbookLines``, plus the first 1500 characters of
    *playbook_before* when given, then hands it to ``run_judge`` together
    with ``SYSTEM_PROMPT``. Extra keyword arguments are passed through to
    ``run_judge``.
    """
    change_set = curator_result.get("changes", {})
    stale_item_actions = curator_result.get("staleItemActions", [])
    line_count = curator_result.get("playbookLines", "?")

    sections = [f"## Curate Directive\n{directive}"]
    sections.append(
        "\n## Changes Made\n"
        f"Added: {change_set.get('added', [])}\n"
        f"Pruned: {change_set.get('pruned', [])}\n"
        f"Promoted: {change_set.get('promoted', [])}"
    )
    sections.append(f"\n## Stale Item Actions\n{stale_item_actions}")
    sections.append(f"\n## Playbook Lines After: {line_count}")

    if playbook_before:
        # Only an excerpt is sent, to keep the prompt manageable.
        excerpt = playbook_before[:1500]
        sections.append(f"\n## Playbook Before (excerpt)\n{excerpt}")

    prompt = "\n".join(sections)
    return run_judge(SYSTEM_PROMPT, prompt, **kwargs)