@geravant/sinain 1.13.0 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/.env.example +4 -2
  2. package/config-shared.js +1 -0
  3. package/package.json +4 -1
  4. package/sinain-agent/run.sh +36 -4
  5. package/sinain-core/src/buffers/feed-buffer.ts +6 -4
  6. package/sinain-core/src/index.ts +50 -19
  7. package/sinain-memory/graph_query.py +12 -3
  8. package/sinain-memory/knowledge_integrator.py +194 -10
  9. package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
  10. package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
  11. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  12. package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
  13. package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
  14. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  15. package/sinain-memory/eval/__init__.py +0 -0
  16. package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
  17. package/sinain-memory/eval/assertions.py +0 -267
  18. package/sinain-memory/eval/benchmarks/__init__.py +0 -0
  19. package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  20. package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
  21. package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
  22. package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
  23. package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
  24. package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
  25. package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
  26. package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
  27. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  28. package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
  29. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  30. package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
  31. package/sinain-memory/eval/benchmarks/config.py +0 -23
  32. package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
  33. package/sinain-memory/eval/benchmarks/ingest.py +0 -152
  34. package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
  35. package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
  36. package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
  37. package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
  38. package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
  39. package/sinain-memory/eval/benchmarks/meeting_adapter.py +0 -81
  40. package/sinain-memory/eval/benchmarks/meeting_runner.py +0 -230
  41. package/sinain-memory/eval/benchmarks/query.py +0 -193
  42. package/sinain-memory/eval/benchmarks/report.py +0 -87
  43. package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +0 -318
  44. package/sinain-memory/eval/benchmarks/runner.py +0 -283
  45. package/sinain-memory/eval/judges/__init__.py +0 -0
  46. package/sinain-memory/eval/judges/base_judge.py +0 -61
  47. package/sinain-memory/eval/judges/curation_judge.py +0 -46
  48. package/sinain-memory/eval/judges/insight_judge.py +0 -48
  49. package/sinain-memory/eval/judges/mining_judge.py +0 -42
  50. package/sinain-memory/eval/judges/signal_judge.py +0 -45
  51. package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
  52. package/sinain-memory/eval/retrieval_evaluator.py +0 -186
  53. package/sinain-memory/eval/schemas.py +0 -247
  54. package/sinain-memory/tests/__init__.py +0 -0
  55. package/sinain-memory/tests/conftest.py +0 -189
  56. package/sinain-memory/tests/test_curator_helpers.py +0 -94
  57. package/sinain-memory/tests/test_embedder.py +0 -210
  58. package/sinain-memory/tests/test_extract_json.py +0 -124
  59. package/sinain-memory/tests/test_feedback_computation.py +0 -121
  60. package/sinain-memory/tests/test_miner_helpers.py +0 -71
  61. package/sinain-memory/tests/test_module_management.py +0 -458
  62. package/sinain-memory/tests/test_parsers.py +0 -96
  63. package/sinain-memory/tests/test_tick_evaluator.py +0 -430
  64. package/sinain-memory/tests/test_triple_extractor.py +0 -255
  65. package/sinain-memory/tests/test_triple_ingest.py +0 -191
  66. package/sinain-memory/tests/test_triple_migrate.py +0 -138
  67. package/sinain-memory/tests/test_triplestore.py +0 -248
@@ -1,283 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Benchmark runner — evaluates sinain's knowledge graph against published benchmarks.
3
-
4
- Usage:
5
- python3 eval/benchmarks/runner.py --benchmarks longmemeval --subset 5
6
- python3 eval/benchmarks/runner.py --benchmarks longmemeval --conditions sinain-memory,full-context
7
- python3 eval/benchmarks/runner.py --benchmarks longmemeval --format markdown --resume
8
- """
9
-
10
- from __future__ import annotations
11
-
12
- import argparse
13
- import json
14
- import sys
15
- from pathlib import Path
16
-
17
- # Add sinain-memory to path
18
- _koog_dir = str(Path(__file__).resolve().parent.parent.parent)
19
- if _koog_dir not in sys.path:
20
- sys.path.insert(0, _koog_dir)
21
-
22
- from eval.benchmarks.config import DATA_DIR, RESULTS_DIR, QA_MODEL, JUDGE_MODEL
23
- from eval.benchmarks.base_adapter import BenchmarkAdapter, BenchmarkInstance
24
- from eval.benchmarks.longmemeval_adapter import LongMemEvalAdapter
25
- from eval.benchmarks.ingest import ingest_instance, get_knowledge_doc
26
- from eval.benchmarks.query import answer_question, _get_retrieved_facts, compute_content_recall
27
- from eval.benchmarks.evaluate import (
28
- token_f1, aggregate_results,
29
- )
30
- from eval.benchmarks.judges.qa_judge import judge_qa
31
- from eval.benchmarks.report import generate_markdown, generate_json
32
-
33
-
34
- def _get_adapter(name: str) -> BenchmarkAdapter:
35
- if name == "longmemeval":
36
- return LongMemEvalAdapter()
37
- raise ValueError(f"Unknown benchmark: {name}. Available: longmemeval")
38
-
39
-
40
- def _load_resume(resume_path: Path) -> dict[str, dict]:
41
- """Load previously computed results for resume support."""
42
- results = {}
43
- if resume_path.exists():
44
- for line in resume_path.read_text().strip().split("\n"):
45
- if line:
46
- entry = json.loads(line)
47
- results[entry["id"]] = entry
48
- return results
49
-
50
-
51
- def run_benchmark(
52
- benchmark_name: str,
53
- conditions: list[str],
54
- *,
55
- subset: int | None = None,
56
- qa_model: str = QA_MODEL,
57
- judge_model: str = JUDGE_MODEL,
58
- output_dir: Path = RESULTS_DIR,
59
- cache_dir: Path = DATA_DIR,
60
- resume: bool = False,
61
- skip_llm: bool = False,
62
- stratified: bool = False,
63
- ) -> tuple[dict, list[dict]]:
64
- """Run a benchmark end-to-end. Returns (summary, details)."""
65
-
66
- adapter = _get_adapter(benchmark_name)
67
-
68
- # Load dataset
69
- print(f"\n{'='*60}")
70
- print(f" Benchmark: {benchmark_name}")
71
- print(f" Conditions: {', '.join(conditions)}")
72
- print(f" QA model: {qa_model}")
73
- print(f" Judge model: {judge_model}")
74
- print(f"{'='*60}\n")
75
-
76
- instances = adapter.load_dataset(str(cache_dir))
77
-
78
- # Flatten questions
79
- all_questions = []
80
- for inst in instances:
81
- for q in inst.questions:
82
- all_questions.append((inst, q))
83
-
84
- if subset:
85
- if stratified:
86
- # Take equal samples from each question category
87
- from collections import defaultdict
88
- by_cat: dict[str, list] = defaultdict(list)
89
- for pair in all_questions:
90
- by_cat[pair[1].category].append(pair)
91
- per_cat = max(1, subset // len(by_cat))
92
- sampled = []
93
- for cat in sorted(by_cat):
94
- sampled.extend(by_cat[cat][:per_cat])
95
- all_questions = sampled[:subset]
96
- else:
97
- all_questions = all_questions[:subset]
98
-
99
- total = len(all_questions)
100
- print(f"[runner] evaluating {total} questions\n")
101
-
102
- # Resume support
103
- resume_path = output_dir / f"{benchmark_name}_progress.jsonl"
104
- completed = _load_resume(resume_path) if resume else {}
105
- output_dir.mkdir(parents=True, exist_ok=True)
106
-
107
- # Track ingested instances
108
- instance_dbs: dict[str, Path | None] = {}
109
- instance_docs: dict[str, str] = {}
110
-
111
- details: list[dict] = []
112
-
113
- for idx, (inst, question) in enumerate(all_questions):
114
- qid = question.id
115
-
116
- # Skip if already done (with all conditions scored)
117
- if qid in completed:
118
- prev = completed[qid]
119
- all_scored = all(
120
- prev.get("answers", {}).get(c, {}).get("score") is not None
121
- for c in conditions
122
- )
123
- if all_scored:
124
- details.append(prev)
125
- continue
126
- # Otherwise re-run this question (previous attempt had failures)
127
-
128
- print(f"[{idx+1}/{total}] {qid} [{question.category}]")
129
-
130
- # Ingest instance if not done yet
131
- if inst.id not in instance_dbs:
132
- if "sinain-memory" in conditions or "knowledge-doc" in conditions:
133
- print(f" ingesting {inst.id} ({len(inst.sessions)} sessions)...")
134
- instance_dbs[inst.id] = ingest_instance(inst, cache_dir / benchmark_name)
135
- db = instance_dbs[inst.id]
136
- if db:
137
- instance_docs[inst.id] = get_knowledge_doc(db)
138
- print(f" -> ingested ({db.stat().st_size} bytes)")
139
- else:
140
- instance_docs[inst.id] = "(ingestion failed)"
141
- print(f" -> ingestion failed")
142
- else:
143
- instance_dbs[inst.id] = None
144
- instance_docs[inst.id] = ""
145
-
146
- db_path = instance_dbs.get(inst.id)
147
- knowledge_doc = instance_docs.get(inst.id, "")
148
- full_context = adapter.format_full_context(inst)
149
-
150
- # Retrieval metrics (content-based: do retrieved facts contain the answer?)
151
- retrieval = {}
152
- if db_path and "sinain-memory" in conditions:
153
- retrieved_facts = _get_retrieved_facts(str(db_path), question.text)
154
- retrieval = compute_content_recall(
155
- retrieved_facts, question.gold_answer,
156
- )
157
-
158
- # Generate answers per condition
159
- answers = {}
160
- for cond in conditions:
161
- if skip_llm:
162
- answers[cond] = {"text": "(skipped)", "score": None, "f1": None}
163
- continue
164
-
165
- # Skip sinain-memory/knowledge-doc if ingestion failed
166
- if cond in ("sinain-memory", "knowledge-doc") and not db_path:
167
- answers[cond] = {"text": "(ingestion failed)", "score": 1, "f1": 0.0, "reasoning": "ingestion failed"}
168
- print(f" [{cond}] skipped (ingestion failed)")
169
- continue
170
-
171
- print(f" [{cond}] generating answer...")
172
- answer_text = answer_question(
173
- question, cond,
174
- db_path=str(db_path) if db_path else None,
175
- full_context=full_context,
176
- knowledge_doc=knowledge_doc,
177
- model=qa_model,
178
- )
179
-
180
- # Score
181
- f1 = token_f1(answer_text, question.gold_answer)
182
-
183
- judge_result = judge_qa(
184
- question.text, question.gold_answer, answer_text,
185
- condition=cond, model=judge_model,
186
- )
187
- score = judge_result["score"] if judge_result else None
188
- reasoning = judge_result["reasoning"] if judge_result else None
189
-
190
- answers[cond] = {
191
- "text": answer_text[:500],
192
- "score": score,
193
- "f1": round(f1, 4),
194
- "reasoning": reasoning,
195
- }
196
- print(f" score={score}/5 f1={f1:.2f}")
197
-
198
- entry = {
199
- "id": qid,
200
- "question": question.text,
201
- "gold_answer": question.gold_answer,
202
- "category": question.category,
203
- "retrieval": retrieval,
204
- "answers": answers,
205
- }
206
- details.append(entry)
207
-
208
- # Save progress incrementally
209
- with open(resume_path, "a") as f:
210
- f.write(json.dumps(entry, ensure_ascii=False) + "\n")
211
-
212
- # Aggregate
213
- summary = aggregate_results(details)
214
- return summary, details
215
-
216
-
217
- def main() -> None:
218
- parser = argparse.ArgumentParser(description="Sinain Knowledge Graph Benchmark Runner")
219
- parser.add_argument("--benchmarks", default="longmemeval",
220
- help="Comma-separated benchmark names (longmemeval, locomo)")
221
- parser.add_argument("--conditions", default="sinain-memory,full-context,knowledge-doc",
222
- help="Comma-separated conditions to evaluate")
223
- parser.add_argument("--subset", type=int, default=None,
224
- help="Run only first N questions (for dev iteration)")
225
- parser.add_argument("--qa-model", default=QA_MODEL, help="Model for QA generation")
226
- parser.add_argument("--judge-model", default=JUDGE_MODEL, help="Model for QA judging")
227
- parser.add_argument("--output-dir", type=Path, default=RESULTS_DIR)
228
- parser.add_argument("--cache-dir", type=Path, default=DATA_DIR)
229
- parser.add_argument("--format", default="json,markdown",
230
- help="Output formats (json, markdown)")
231
- parser.add_argument("--resume", action="store_true", help="Resume from partial results")
232
- parser.add_argument("--skip-llm", action="store_true",
233
- help="Skip LLM calls (retrieval + mechanical metrics only)")
234
- parser.add_argument("--stratified", action="store_true",
235
- help="Sample equally from each question category (with --subset)")
236
- args = parser.parse_args()
237
-
238
- conditions = [c.strip() for c in args.conditions.split(",")]
239
- formats = [f.strip() for f in args.format.split(",")]
240
-
241
- for bench_name in args.benchmarks.split(","):
242
- bench_name = bench_name.strip()
243
- summary, details = run_benchmark(
244
- bench_name, conditions,
245
- subset=args.subset,
246
- qa_model=args.qa_model,
247
- judge_model=args.judge_model,
248
- output_dir=args.output_dir,
249
- cache_dir=args.cache_dir,
250
- resume=args.resume,
251
- skip_llm=args.skip_llm,
252
- stratified=args.stratified,
253
- )
254
-
255
- # Write outputs
256
- args.output_dir.mkdir(parents=True, exist_ok=True)
257
-
258
- if "json" in formats:
259
- json_path = args.output_dir / f"{bench_name}_results.json"
260
- json_path.write_text(generate_json(bench_name, summary, details))
261
- print(f"\n[output] JSON: {json_path}")
262
-
263
- if "markdown" in formats:
264
- md_path = args.output_dir / f"{bench_name}_results.md"
265
- md_path.write_text(generate_markdown(bench_name, summary, details))
266
- print(f"[output] Markdown: {md_path}")
267
-
268
- # Print summary
269
- print(f"\n{'='*60}")
270
- print(f" {bench_name} — Summary")
271
- print(f"{'='*60}")
272
- ipr = summary.get("ipr")
273
- if ipr:
274
- print(f" IPR: {ipr:.1%}")
275
- for cond, data in summary.get("conditions", {}).items():
276
- print(f" {cond}: {data['mean_score']:.2f}/5 (n={data['n']})")
277
- for k, v in summary.get("retrieval", {}).items():
278
- print(f" {k}: {v:.1%}")
279
- print()
280
-
281
-
282
- if __name__ == "__main__":
283
- main()
File without changes
@@ -1,61 +0,0 @@
1
- """Shared infrastructure for LLM-as-Judge evaluators.
2
-
3
- Provides ``run_judge()`` which calls the LLM with a rubric prompt and
4
- extracts a ``{"score": 1-4, "reasoning": "..."}`` response.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
- import json
10
- import sys
11
- from pathlib import Path
12
-
13
- # Add parent dirs so ``common`` is importable when running from anywhere.
14
- _koog_dir = str(Path(__file__).resolve().parent.parent.parent)
15
- if _koog_dir not in sys.path:
16
- sys.path.insert(0, _koog_dir)
17
-
18
- from common import LLMError, call_llm, extract_json # noqa: E402
19
-
20
-
21
- def run_judge(
22
- system_prompt: str,
23
- user_prompt: str,
24
- *,
25
- model: str | None = None,
26
- max_tokens: int = 200,
27
- timeout: int = 30,
28
- ) -> dict | None:
29
- """Call LLM with a judge prompt and return ``{"score": int, "reasoning": str}`` or None.
30
-
31
- *model* defaults to the ``eval.judges.model`` setting resolved externally.
32
- When None, falls back to ``common.call_llm`` defaults (which reads koog-config).
33
- """
34
- try:
35
- kwargs: dict = {
36
- "system_prompt": system_prompt,
37
- "user_prompt": user_prompt,
38
- "max_tokens": max_tokens,
39
- "json_mode": True,
40
- }
41
- # Use script-based config resolution if no explicit model
42
- if model:
43
- kwargs["model"] = model
44
- else:
45
- kwargs["script"] = "tick_evaluator"
46
-
47
- raw = call_llm(**kwargs)
48
- result = extract_json(raw)
49
-
50
- score = result.get("score")
51
- reasoning = result.get("reasoning", "")
52
-
53
- if not isinstance(score, (int, float)) or not (1 <= score <= 4):
54
- print(f"[warn] judge returned invalid score: {score}", file=sys.stderr)
55
- return None
56
-
57
- return {"score": int(score), "reasoning": str(reasoning)[:300]}
58
-
59
- except (ValueError, LLMError, KeyError) as e:
60
- print(f"[warn] judge call failed: {e}", file=sys.stderr)
61
- return None
@@ -1,46 +0,0 @@
1
- """LLM-as-Judge: Playbook curation quality evaluator."""
2
-
3
- from __future__ import annotations
4
-
5
- from .base_judge import run_judge
6
-
7
- SYSTEM_PROMPT = """\
8
- You are an evaluator scoring the quality of playbook curation changes.
9
-
10
- The curator follows a directive and three laws:
11
- Law 1: Don't remove error-prevention patterns
12
- Law 2: Preserve high-scoring approaches
13
- Law 3: Then evolve
14
-
15
- Rate the curation on a 1-4 scale:
16
- 4: Changes perfectly match directive + evidence, three laws respected
17
- 3: Good changes, minor alignment issues with directive
18
- 2: Changes misaligned with directive or weak evidence
19
- 1: Destructive changes, violated three laws, or ignored directive entirely
20
-
21
- Respond with ONLY a JSON object: {"score": <1-4>, "reasoning": "brief explanation"}"""
22
-
23
-
24
- def judge_curation(
25
- curator_result: dict,
26
- directive: str,
27
- playbook_before: str = "",
28
- **kwargs,
29
- ) -> dict | None:
30
- """Evaluate playbook curation quality. Returns {"score": 1-4, "reasoning": str} or None."""
31
- changes = curator_result.get("changes", {})
32
- stale_actions = curator_result.get("staleItemActions", [])
33
- lines = curator_result.get("playbookLines", "?")
34
-
35
- parts = [
36
- f"## Curate Directive\n{directive}",
37
- f"\n## Changes Made\nAdded: {changes.get('added', [])}\nPruned: {changes.get('pruned', [])}\nPromoted: {changes.get('promoted', [])}",
38
- f"\n## Stale Item Actions\n{stale_actions}",
39
- f"\n## Playbook Lines After: {lines}",
40
- ]
41
-
42
- if playbook_before:
43
- # Truncate to keep prompt manageable
44
- parts.append(f"\n## Playbook Before (excerpt)\n{playbook_before[:1500]}")
45
-
46
- return run_judge(SYSTEM_PROMPT, "\n".join(parts), **kwargs)
@@ -1,48 +0,0 @@
1
- """LLM-as-Judge: Insight synthesis quality evaluator."""
2
-
3
- from __future__ import annotations
4
-
5
- from .base_judge import run_judge
6
-
7
- SYSTEM_PROMPT = """\
8
- You are an evaluator scoring the quality of an insight synthesizer's output.
9
-
10
- The synthesizer produces two parts:
11
- - Suggestion: actionable recommendation grounded in playbook/data
12
- - Insight: surprising cross-domain connection from accumulated observations
13
-
14
- Rate the output on a 1-4 scale:
15
- 4: Suggestion is actionable with specific reference, insight connects 2+ distinct observations
16
- 3: One component is excellent, the other adequate
17
- 2: Generic suggestion or obvious insight
18
- 1: Hallucinated content, not grounded in playbook/logs
19
-
20
- If the output was skipped, rate the skip decision:
21
- 4: Skip is well-justified with specific references to what was checked
22
- 3: Skip is reasonable
23
- 2: Should not have skipped — there was material to work with
24
- 1: Skip reason is generic/lazy
25
-
26
- Respond with ONLY a JSON object: {"score": <1-4>, "reasoning": "brief explanation"}"""
27
-
28
-
29
- def judge_insight(
30
- synth_result: dict,
31
- playbook_excerpt: str = "",
32
- **kwargs,
33
- ) -> dict | None:
34
- """Evaluate insight synthesis quality. Returns {"score": 1-4, "reasoning": str} or None."""
35
- skipped = synth_result.get("skip", False)
36
-
37
- parts = []
38
- if skipped:
39
- parts.append(f"## Status: SKIPPED\nReason: {synth_result.get('skipReason', 'none given')}")
40
- else:
41
- parts.append(f"## Suggestion\n{synth_result.get('suggestion', '')}")
42
- parts.append(f"\n## Insight\n{synth_result.get('insight', '')}")
43
- parts.append(f"\n## Total Chars: {synth_result.get('totalChars', '?')}")
44
-
45
- if playbook_excerpt:
46
- parts.append(f"\n## Playbook Context (excerpt)\n{playbook_excerpt[:1000]}")
47
-
48
- return run_judge(SYSTEM_PROMPT, "\n".join(parts), **kwargs)
@@ -1,42 +0,0 @@
1
- """LLM-as-Judge: Memory mining quality evaluator."""
2
-
3
- from __future__ import annotations
4
-
5
- from .base_judge import run_judge
6
-
7
- SYSTEM_PROMPT = """\
8
- You are an evaluator scoring the quality of a memory mining agent's findings.
9
-
10
- The miner reads daily memory files and extracts patterns, preferences, and insights
11
- that should be added to the evolving playbook.
12
-
13
- Rate the mining output on a 1-4 scale:
14
- 4: Found non-obvious cross-day patterns, all grounded in source files
15
- 3: Valid patterns found, properly grounded in provided daily files
16
- 2: Only surface-level observations from source files
17
- 1: Hallucinated patterns not present in provided daily files
18
-
19
- Respond with ONLY a JSON object: {"score": <1-4>, "reasoning": "brief explanation"}"""
20
-
21
-
22
- def judge_mining(
23
- miner_result: dict,
24
- mined_file_excerpts: dict[str, str] | None = None,
25
- **kwargs,
26
- ) -> dict | None:
27
- """Evaluate memory mining quality. Returns {"score": 1-4, "reasoning": str} or None."""
28
- parts = [
29
- f"## Findings\n{miner_result.get('findings', '')}",
30
- f"\n## New Patterns\n{miner_result.get('newPatterns', [])}",
31
- f"\n## Contradictions\n{miner_result.get('contradictions', [])}",
32
- f"\n## Preferences\n{miner_result.get('preferences', [])}",
33
- f"\n## Mined Sources\n{miner_result.get('minedSources', [])}",
34
- ]
35
-
36
- if mined_file_excerpts:
37
- for name, content in mined_file_excerpts.items():
38
- # Truncate large files
39
- excerpt = content[:1500] if len(content) > 1500 else content
40
- parts.append(f"\n## Source File: {name}\n{excerpt}")
41
-
42
- return run_judge(SYSTEM_PROMPT, "\n".join(parts), **kwargs)
@@ -1,45 +0,0 @@
1
- """LLM-as-Judge: Signal detection quality evaluator."""
2
-
3
- from __future__ import annotations
4
-
5
- from .base_judge import run_judge
6
-
7
- SYSTEM_PROMPT = """\
8
- You are an evaluator scoring the quality of a signal detection system.
9
-
10
- Rate the signal analysis on a 1-4 scale:
11
- 4: All real signals detected, action is highly relevant and specific
12
- 3: Key signals detected, action is reasonable
13
- 2: Missed important signals or action is vague
14
- 1: Hallucinated signals or inappropriate action
15
-
16
- Respond with ONLY a JSON object: {"score": <1-4>, "reasoning": "brief explanation"}"""
17
-
18
-
19
- def judge_signal(
20
- signal_result: dict,
21
- session_summary: str,
22
- recent_logs: list[dict] | None = None,
23
- **kwargs,
24
- ) -> dict | None:
25
- """Evaluate signal detection quality. Returns {"score": 1-4, "reasoning": str} or None."""
26
- parts = [f"## Session Summary\n{session_summary}"]
27
-
28
- signals = signal_result.get("signals", [])
29
- action = signal_result.get("recommendedAction")
30
- idle = signal_result.get("idle", False)
31
-
32
- parts.append(f"\n## Detected Signals\n{signals}")
33
- parts.append(f"\n## Recommended Action\n{action}")
34
- parts.append(f"\n## Idle: {idle}")
35
-
36
- if recent_logs:
37
- recent_actions = []
38
- for log in recent_logs[:3]:
39
- for a in log.get("actionsConsidered", []):
40
- if a.get("chosen"):
41
- recent_actions.append(a)
42
- if recent_actions:
43
- parts.append(f"\n## Recent Actions (should not repeat)\n{recent_actions}")
44
-
45
- return run_judge(SYSTEM_PROMPT, "\n".join(parts), **kwargs)
@@ -1,12 +0,0 @@
1
- {"query": "OCR pipeline stalls on macOS 14", "expected_entities": ["fact:ocr-backpressure", "fact:sck-capture"], "category": "error-resolution"}
2
- {"query": "camera conflicts with screen capture", "expected_entities": ["fact:camera-conflict", "fact:coremediaio"], "category": "error-resolution"}
3
- {"query": "audio gain not applied in pipeline", "expected_entities": ["fact:audio-gain"], "category": "bug-fix"}
4
- {"query": "Flutter ProviderNotFoundException in secondary window", "expected_entities": ["fact:flutter-provider", "fact:multi-window"], "category": "error-resolution"}
5
- {"query": "user prefers concise Telegram messages", "expected_entities": ["fact:telegram-preference"], "category": "user-preference"}
6
- {"query": "PyObjC performRequests_error_ returns bool not tuple", "expected_entities": ["fact:pyobjc-api"], "category": "bug-fix"}
7
- {"query": "ScreenCaptureKit zero-copy IOSurface", "expected_entities": ["fact:sck-capture", "fact:iosurface"], "category": "tool-knowledge"}
8
- {"query": "OpenClaw gateway workspace not initialized", "expected_entities": ["fact:workspace-init"], "category": "error-resolution"}
9
- {"query": "react-native metro bundler cache invalidation", "expected_entities": ["fact:react-native-metro"], "category": "tool-knowledge"}
10
- {"query": "sinain agent session key format", "expected_entities": ["fact:session-key"], "category": "tool-knowledge"}
11
- {"query": "what was the OCR backend last month", "expected_entities": ["fact:ocr-backend"], "category": "temporal"}
12
- {"query": "when did we switch from CGDisplayCreateImage to ScreenCaptureKit", "expected_entities": ["fact:sck-capture", "fact:cgdisplay-deprecation"], "category": "temporal"}