@geravant/sinain 1.13.0 → 1.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +33 -27
- package/cli.js +30 -14
- package/config-shared.js +173 -30
- package/launcher.js +38 -21
- package/onboard.js +36 -20
- package/package.json +4 -1
- package/sinain-agent/run.sh +600 -127
- package/sinain-core/src/agents-loader.ts +254 -0
- package/sinain-core/src/buffers/feed-buffer.ts +6 -4
- package/sinain-core/src/config.ts +77 -15
- package/sinain-core/src/escalation/escalator.ts +178 -18
- package/sinain-core/src/index.ts +218 -31
- package/sinain-core/src/learning/local-curation.ts +81 -27
- package/sinain-core/src/overlay/commands.ts +25 -0
- package/sinain-core/src/overlay/ws-handler.ts +3 -0
- package/sinain-core/src/server.ts +101 -10
- package/sinain-core/src/types.ts +29 -3
- package/sinain-memory/graph_query.py +12 -3
- package/sinain-memory/knowledge_integrator.py +194 -10
- package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/eval/__init__.py +0 -0
- package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/assertions.py +0 -267
- package/sinain-memory/eval/benchmarks/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
- package/sinain-memory/eval/benchmarks/config.py +0 -23
- package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
- package/sinain-memory/eval/benchmarks/ingest.py +0 -152
- package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
- package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
- package/sinain-memory/eval/benchmarks/meeting_adapter.py +0 -81
- package/sinain-memory/eval/benchmarks/meeting_runner.py +0 -230
- package/sinain-memory/eval/benchmarks/query.py +0 -193
- package/sinain-memory/eval/benchmarks/report.py +0 -87
- package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +0 -318
- package/sinain-memory/eval/benchmarks/runner.py +0 -283
- package/sinain-memory/eval/judges/__init__.py +0 -0
- package/sinain-memory/eval/judges/base_judge.py +0 -61
- package/sinain-memory/eval/judges/curation_judge.py +0 -46
- package/sinain-memory/eval/judges/insight_judge.py +0 -48
- package/sinain-memory/eval/judges/mining_judge.py +0 -42
- package/sinain-memory/eval/judges/signal_judge.py +0 -45
- package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
- package/sinain-memory/eval/retrieval_evaluator.py +0 -186
- package/sinain-memory/eval/schemas.py +0 -247
- package/sinain-memory/tests/__init__.py +0 -0
- package/sinain-memory/tests/conftest.py +0 -189
- package/sinain-memory/tests/test_curator_helpers.py +0 -94
- package/sinain-memory/tests/test_embedder.py +0 -210
- package/sinain-memory/tests/test_extract_json.py +0 -124
- package/sinain-memory/tests/test_feedback_computation.py +0 -121
- package/sinain-memory/tests/test_miner_helpers.py +0 -71
- package/sinain-memory/tests/test_module_management.py +0 -458
- package/sinain-memory/tests/test_parsers.py +0 -96
- package/sinain-memory/tests/test_tick_evaluator.py +0 -430
- package/sinain-memory/tests/test_triple_extractor.py +0 -255
- package/sinain-memory/tests/test_triple_ingest.py +0 -191
- package/sinain-memory/tests/test_triple_migrate.py +0 -138
- package/sinain-memory/tests/test_triplestore.py +0 -248
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
"""LLM-as-Judge: Playbook curation quality evaluator."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from .base_judge import run_judge
|
|
6
|
-
|
|
7
|
-
SYSTEM_PROMPT = """\
|
|
8
|
-
You are an evaluator scoring the quality of playbook curation changes.
|
|
9
|
-
|
|
10
|
-
The curator follows a directive and three laws:
|
|
11
|
-
Law 1: Don't remove error-prevention patterns
|
|
12
|
-
Law 2: Preserve high-scoring approaches
|
|
13
|
-
Law 3: Then evolve
|
|
14
|
-
|
|
15
|
-
Rate the curation on a 1-4 scale:
|
|
16
|
-
4: Changes perfectly match directive + evidence, three laws respected
|
|
17
|
-
3: Good changes, minor alignment issues with directive
|
|
18
|
-
2: Changes misaligned with directive or weak evidence
|
|
19
|
-
1: Destructive changes, violated three laws, or ignored directive entirely
|
|
20
|
-
|
|
21
|
-
Respond with ONLY a JSON object: {"score": <1-4>, "reasoning": "brief explanation"}"""
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def judge_curation(
    curator_result: dict,
    directive: str,
    playbook_before: str = "",
    **kwargs,
) -> dict | None:
    """Score one playbook curation run with the LLM judge.

    Builds a markdown-style report from the curator output and hands it,
    together with SYSTEM_PROMPT, to run_judge.

    Args:
        curator_result: Curator output. The keys read here are "changes"
            (with "added" / "pruned" / "promoted"), "staleItemActions" and
            "playbookLines"; each one is optional.
        directive: Curate directive the curator was asked to follow.
        playbook_before: Optional playbook text from before the curation;
            only the first 1500 characters are included.
        **kwargs: Forwarded unchanged to run_judge.

    Returns:
        Whatever run_judge returns -- presumably
        {"score": 1-4, "reasoning": str} or None (confirm in base_judge).
    """
    change_log = curator_result.get("changes", {})
    stale = curator_result.get("staleItemActions", [])
    line_count = curator_result.get("playbookLines", "?")

    added = change_log.get("added", [])
    pruned = change_log.get("pruned", [])
    promoted = change_log.get("promoted", [])

    sections = [f"## Curate Directive\n{directive}"]
    sections.append(
        f"\n## Changes Made\nAdded: {added}\nPruned: {pruned}\nPromoted: {promoted}"
    )
    sections.append(f"\n## Stale Item Actions\n{stale}")
    sections.append(f"\n## Playbook Lines After: {line_count}")

    if playbook_before:
        # Cap the excerpt so the judge prompt stays a manageable size.
        head = playbook_before[:1500]
        sections.append(f"\n## Playbook Before (excerpt)\n{head}")

    prompt = "\n".join(sections)
    return run_judge(SYSTEM_PROMPT, prompt, **kwargs)
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
"""LLM-as-Judge: Insight synthesis quality evaluator."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from .base_judge import run_judge
|
|
6
|
-
|
|
7
|
-
SYSTEM_PROMPT = """\
|
|
8
|
-
You are an evaluator scoring the quality of an insight synthesizer's output.
|
|
9
|
-
|
|
10
|
-
The synthesizer produces two parts:
|
|
11
|
-
- Suggestion: actionable recommendation grounded in playbook/data
|
|
12
|
-
- Insight: surprising cross-domain connection from accumulated observations
|
|
13
|
-
|
|
14
|
-
Rate the output on a 1-4 scale:
|
|
15
|
-
4: Suggestion is actionable with specific reference, insight connects 2+ distinct observations
|
|
16
|
-
3: One component is excellent, the other adequate
|
|
17
|
-
2: Generic suggestion or obvious insight
|
|
18
|
-
1: Hallucinated content, not grounded in playbook/logs
|
|
19
|
-
|
|
20
|
-
If the output was skipped, rate the skip decision:
|
|
21
|
-
4: Skip is well-justified with specific references to what was checked
|
|
22
|
-
3: Skip is reasonable
|
|
23
|
-
2: Should not have skipped — there was material to work with
|
|
24
|
-
1: Skip reason is generic/lazy
|
|
25
|
-
|
|
26
|
-
Respond with ONLY a JSON object: {"score": <1-4>, "reasoning": "brief explanation"}"""
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def judge_insight(
    synth_result: dict,
    playbook_excerpt: str = "",
    **kwargs,
) -> dict | None:
    """Score one insight-synthesizer result with the LLM judge.

    A skipped run is reported as a status line plus the skip reason; a
    non-skipped run is reported as suggestion, insight and total length.

    Args:
        synth_result: Synthesizer output. Reads "skip" and then either
            "skipReason" or "suggestion" / "insight" / "totalChars".
        playbook_excerpt: Optional playbook context; only the first 1000
            characters are included.
        **kwargs: Forwarded unchanged to run_judge.

    Returns:
        Whatever run_judge returns -- presumably
        {"score": 1-4, "reasoning": str} or None (confirm in base_judge).
    """
    if synth_result.get("skip", False):
        reason = synth_result.get("skipReason", "none given")
        sections = [f"## Status: SKIPPED\nReason: {reason}"]
    else:
        sections = [
            f"## Suggestion\n{synth_result.get('suggestion', '')}",
            f"\n## Insight\n{synth_result.get('insight', '')}",
            f"\n## Total Chars: {synth_result.get('totalChars', '?')}",
        ]

    if playbook_excerpt:
        context = playbook_excerpt[:1000]
        sections.append(f"\n## Playbook Context (excerpt)\n{context}")

    prompt = "\n".join(sections)
    return run_judge(SYSTEM_PROMPT, prompt, **kwargs)
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
"""LLM-as-Judge: Memory mining quality evaluator."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from .base_judge import run_judge
|
|
6
|
-
|
|
7
|
-
SYSTEM_PROMPT = """\
|
|
8
|
-
You are an evaluator scoring the quality of a memory mining agent's findings.
|
|
9
|
-
|
|
10
|
-
The miner reads daily memory files and extracts patterns, preferences, and insights
|
|
11
|
-
that should be added to the evolving playbook.
|
|
12
|
-
|
|
13
|
-
Rate the mining output on a 1-4 scale:
|
|
14
|
-
4: Found non-obvious cross-day patterns, all grounded in source files
|
|
15
|
-
3: Valid patterns found, properly grounded in provided daily files
|
|
16
|
-
2: Only surface-level observations from source files
|
|
17
|
-
1: Hallucinated patterns not present in provided daily files
|
|
18
|
-
|
|
19
|
-
Respond with ONLY a JSON object: {"score": <1-4>, "reasoning": "brief explanation"}"""
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def judge_mining(
    miner_result: dict,
    mined_file_excerpts: dict[str, str] | None = None,
    **kwargs,
) -> dict | None:
    """Score one memory-miner result with the LLM judge.

    Args:
        miner_result: Miner output. Reads "findings", "newPatterns",
            "contradictions", "preferences" and "minedSources"; each one is
            optional.
        mined_file_excerpts: Optional mapping of source file name to its
            content; at most the first 1500 characters of each file are
            added to the prompt.
        **kwargs: Forwarded unchanged to run_judge.

    Returns:
        Whatever run_judge returns -- presumably
        {"score": 1-4, "reasoning": str} or None (confirm in base_judge).
    """
    sections = [f"## Findings\n{miner_result.get('findings', '')}"]
    for heading, key in (
        ("New Patterns", "newPatterns"),
        ("Contradictions", "contradictions"),
        ("Preferences", "preferences"),
        ("Mined Sources", "minedSources"),
    ):
        sections.append(f"\n## {heading}\n{miner_result.get(key, [])}")

    # Slicing already leaves content of 1500 characters or fewer untouched,
    # so no separate length check is needed.
    sections.extend(
        f"\n## Source File: {name}\n{content[:1500]}"
        for name, content in (mined_file_excerpts or {}).items()
    )

    return run_judge(SYSTEM_PROMPT, "\n".join(sections), **kwargs)
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
"""LLM-as-Judge: Signal detection quality evaluator."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from .base_judge import run_judge
|
|
6
|
-
|
|
7
|
-
SYSTEM_PROMPT = """\
|
|
8
|
-
You are an evaluator scoring the quality of a signal detection system.
|
|
9
|
-
|
|
10
|
-
Rate the signal analysis on a 1-4 scale:
|
|
11
|
-
4: All real signals detected, action is highly relevant and specific
|
|
12
|
-
3: Key signals detected, action is reasonable
|
|
13
|
-
2: Missed important signals or action is vague
|
|
14
|
-
1: Hallucinated signals or inappropriate action
|
|
15
|
-
|
|
16
|
-
Respond with ONLY a JSON object: {"score": <1-4>, "reasoning": "brief explanation"}"""
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def judge_signal(
    signal_result: dict,
    session_summary: str,
    recent_logs: list[dict] | None = None,
    **kwargs,
) -> dict | None:
    """Score one signal-analyzer result with the LLM judge.

    Args:
        signal_result: Analyzer output. Reads "signals",
            "recommendedAction" and "idle"; each one is optional.
        session_summary: Text describing the session that was analyzed.
        recent_logs: Optional log entries. Only the first three are scanned,
            and only entries of their "actionsConsidered" lists with a
            truthy "chosen" value are reported.
        **kwargs: Forwarded unchanged to run_judge.

    Returns:
        Whatever run_judge returns -- presumably
        {"score": 1-4, "reasoning": str} or None (confirm in base_judge).
    """
    detected = signal_result.get("signals", [])
    recommended = signal_result.get("recommendedAction")
    is_idle = signal_result.get("idle", False)

    sections = [
        f"## Session Summary\n{session_summary}",
        f"\n## Detected Signals\n{detected}",
        f"\n## Recommended Action\n{recommended}",
        f"\n## Idle: {is_idle}",
    ]

    if recent_logs:
        already_chosen = [
            candidate
            for entry in recent_logs[:3]
            for candidate in entry.get("actionsConsidered", [])
            if candidate.get("chosen")
        ]
        if already_chosen:
            sections.append(
                f"\n## Recent Actions (should not repeat)\n{already_chosen}"
            )

    return run_judge(SYSTEM_PROMPT, "\n".join(sections), **kwargs)
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
{"query": "OCR pipeline stalls on macOS 14", "expected_entities": ["fact:ocr-backpressure", "fact:sck-capture"], "category": "error-resolution"}
|
|
2
|
-
{"query": "camera conflicts with screen capture", "expected_entities": ["fact:camera-conflict", "fact:coremediaio"], "category": "error-resolution"}
|
|
3
|
-
{"query": "audio gain not applied in pipeline", "expected_entities": ["fact:audio-gain"], "category": "bug-fix"}
|
|
4
|
-
{"query": "Flutter ProviderNotFoundException in secondary window", "expected_entities": ["fact:flutter-provider", "fact:multi-window"], "category": "error-resolution"}
|
|
5
|
-
{"query": "user prefers concise Telegram messages", "expected_entities": ["fact:telegram-preference"], "category": "user-preference"}
|
|
6
|
-
{"query": "PyObjC performRequests_error_ returns bool not tuple", "expected_entities": ["fact:pyobjc-api"], "category": "bug-fix"}
|
|
7
|
-
{"query": "ScreenCaptureKit zero-copy IOSurface", "expected_entities": ["fact:sck-capture", "fact:iosurface"], "category": "tool-knowledge"}
|
|
8
|
-
{"query": "OpenClaw gateway workspace not initialized", "expected_entities": ["fact:workspace-init"], "category": "error-resolution"}
|
|
9
|
-
{"query": "react-native metro bundler cache invalidation", "expected_entities": ["fact:react-native-metro"], "category": "tool-knowledge"}
|
|
10
|
-
{"query": "sinain agent session key format", "expected_entities": ["fact:session-key"], "category": "tool-knowledge"}
|
|
11
|
-
{"query": "what was the OCR backend last month", "expected_entities": ["fact:ocr-backend"], "category": "temporal"}
|
|
12
|
-
{"query": "when did we switch from CGDisplayCreateImage to ScreenCaptureKit", "expected_entities": ["fact:sck-capture", "fact:cgdisplay-deprecation"], "category": "temporal"}
|
|
@@ -1,186 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Retrieval Quality Evaluator — Recall@k and NDCG@k for knowledge graph queries.
|
|
3
|
-
|
|
4
|
-
Inspired by mempalace's LongMemEval benchmark infrastructure. Measures whether the
|
|
5
|
-
right knowledge surfaces when the agent needs it, complementing sinain's existing
|
|
6
|
-
output quality evaluation (schemas + assertions + LLM judges).
|
|
7
|
-
|
|
8
|
-
Usage:
|
|
9
|
-
python3 eval/retrieval_evaluator.py \
|
|
10
|
-
--db memory/knowledge-graph.db \
|
|
11
|
-
--benchmark eval/retrieval_benchmark.jsonl \
|
|
12
|
-
[--k 1,3,5] [--format json|text]
|
|
13
|
-
|
|
14
|
-
Benchmark dataset format (JSONL):
|
|
15
|
-
{"query": "OCR pipeline stalls on macOS 14", "expected_entities": ["fact:sck-capture-fix"], "category": "error-resolution"}
|
|
16
|
-
"""
|
|
17
|
-
|
|
18
|
-
import argparse
|
|
19
|
-
import json
|
|
20
|
-
import math
|
|
21
|
-
import sys
|
|
22
|
-
from collections import defaultdict
|
|
23
|
-
from pathlib import Path
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def load_benchmark(path: str) -> list[dict]:
    """Load benchmark QA pairs from a JSONL file.

    Blank and whitespace-only lines are skipped; every other line must be a
    complete JSON document.

    Args:
        path: Location of the JSONL benchmark file.

    Returns:
        The decoded objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly: the platform default encoding would
    # mis-read or reject non-ASCII queries on non-UTF-8 locales.
    with open(path, encoding="utf-8") as f:
        return [json.loads(text) for line in f if (text := line.strip())]
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def extract_keywords(query: str) -> list[str]:
    """Turn a natural-language query into a list of search keywords.

    The query is lower-cased and split into tokens that start with a letter
    and continue with letters, digits or hyphens. Tokens of one or two
    characters and common stopwords are dropped; order and duplicates are
    preserved.
    """
    import re

    ignored = {"the", "is", "in", "on", "for", "and", "or", "of", "to", "a", "an", "it", "was", "not", "how", "what", "when", "does"}
    keywords = []
    for token in re.findall(r"[a-zA-Z][a-zA-Z0-9-]+", query.lower()):
        if len(token) <= 2 or token in ignored:
            continue
        keywords.append(token)
    return keywords
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
def dcg_at_k(relevant_positions: list[int], k: int) -> float:
    """Return the Discounted Cumulative Gain over the top *k* ranks.

    Each entry of *relevant_positions* is the 0-indexed rank of a relevant
    result. Ranks at or beyond *k* contribute nothing; every other rank adds
    a gain of 1 / log2(rank + 2).
    """
    total = 0.0
    for rank in relevant_positions:
        if rank >= k:
            continue
        # Rank 0 maps to log2(2) == 1, so a hit in first place counts fully.
        total += 1.0 / math.log2(rank + 2)
    return total
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def ndcg_at_k(relevant_positions: list[int], num_relevant: int, k: int) -> float:
    """Return the Normalized DCG over the top *k* ranks.

    The DCG of *relevant_positions* is divided by the ideal DCG, i.e. the
    DCG obtained when min(num_relevant, k) relevant items fill the first
    ranks. Returns 0.0 when the ideal DCG is not positive.
    """
    ideal_hits = min(num_relevant, k)
    ideal = dcg_at_k(list(range(ideal_hits)), k)
    if ideal > 0:
        return dcg_at_k(relevant_positions, k) / ideal
    return 0.0
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
def evaluate_retrieval(
    benchmark_path: str,
    db_path: str,
    k_values: list[int] = [1, 3, 5],
) -> dict:
    """Run benchmark queries against graph_query.py, compute Recall@k and NDCG@k.

    For every benchmark item the query is reduced to keywords, the keywords
    are looked up with query_facts_by_entities, and the returned "entityId"
    values are compared with the item's "expected_entities".

    Note that "recall@k" as computed here is a hit rate: it is 1.0 when at
    least one expected entity appears in the first k results and 0.0
    otherwise, regardless of how many entities were expected.

    Args:
        benchmark_path: JSONL benchmark file (see load_benchmark).
        db_path: Knowledge-graph database path handed to
            query_facts_by_entities.
        k_values: Cut-offs to evaluate. The default list is only read,
            never mutated.

    Returns:
        {"summary": ..., "categories": ..., "details": ...} on success, or
        {"error": str} when the benchmark is empty or no k values were given.
        Items without usable keywords or without expected entities are
        skipped; they count towards "total_queries" but not "evaluated".
    """
    # graph_query lives in the parent directory of this file. Only extend
    # sys.path once so repeated calls do not keep growing it.
    parent_dir = str(Path(__file__).parent.parent)
    if parent_dir not in sys.path:
        sys.path.insert(0, parent_dir)
    from graph_query import query_facts_by_entities

    if not k_values:
        # max() below would raise on an empty sequence; report it in the
        # same style as the empty-benchmark case instead.
        return {"error": "No k values given"}

    items = load_benchmark(benchmark_path)
    if not items:
        return {"error": "Empty benchmark dataset"}

    max_k = max(k_values)
    metrics: dict[str, list[float]] = defaultdict(list)
    category_metrics: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
    details: list[dict] = []

    for item in items:
        query = item["query"]
        expected = set(item.get("expected_entities", []))
        category = item.get("category", "general")
        keywords = extract_keywords(query)

        if not keywords or not expected:
            continue

        results = query_facts_by_entities(db_path, keywords, max_facts=max_k)
        result_ids = [r["entityId"] for r in results]

        # 0-indexed ranks at which an expected entity was returned.
        relevant_positions = [i for i, rid in enumerate(result_ids) if rid in expected]

        for k in k_values:
            hit = any(pos < k for pos in relevant_positions)
            recall = 1.0 if hit else 0.0
            ndcg = ndcg_at_k(relevant_positions, len(expected), k)

            metrics[f"recall@{k}"].append(recall)
            metrics[f"ndcg@{k}"].append(ndcg)
            category_metrics[category][f"recall@{k}"].append(recall)
            category_metrics[category][f"ndcg@{k}"].append(ndcg)

        details.append({
            "query": query,
            "category": category,
            # Sorted so the report is stable across runs (set order is not).
            "expected": sorted(expected),
            "retrieved": result_ids[:max_k],
            # NOTE(review): only max(k_values) results are requested, so
            # hit@5 presumably cannot see ranks beyond that when
            # max(k_values) < 5 -- confirm how max_facts is applied.
            "hit@1": any(pos < 1 for pos in relevant_positions),
            "hit@5": any(pos < 5 for pos in relevant_positions),
        })

    # Aggregate: mean of every metric over the evaluated items.
    summary = {
        "total_queries": len(items),
        "evaluated": len(details),
    }
    for metric_name, values in sorted(metrics.items()):
        summary[metric_name] = round(sum(values) / len(values), 4) if values else 0.0

    # Per-category breakdown; every metric list of a category has one entry
    # per evaluated item, so the first list's length is the item count.
    categories = {}
    for cat, cat_metrics in sorted(category_metrics.items()):
        categories[cat] = {
            "count": len(next(iter(cat_metrics.values()))),
        }
        for metric_name, values in sorted(cat_metrics.items()):
            categories[cat][metric_name] = round(sum(values) / len(values), 4) if values else 0.0

    return {
        "summary": summary,
        "categories": categories,
        "details": details,
    }
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
def format_report_text(result: dict) -> str:
    """Format an evaluation result as human-readable text for a daily report.

    Args:
        result: Either a full result with "summary" (and optionally
            "categories"), or an error result of the form {"error": str}.

    Returns:
        A markdown-style block. A full result lists every recall@ / ndcg@
        summary metric, recall@5 per category, and the weakest category
        when its recall@5 is below 80%. A result without "summary" yields
        the heading followed by a single error line.
    """
    lines = ["## Retrieval Quality"]

    summary = result.get("summary")
    if summary is None:
        # An error result has no "summary"; report the error instead of
        # failing with a KeyError.
        lines.append(f"- error: {result.get('error', 'no summary in result')}")
        return "\n".join(lines)

    for key in sorted(summary):
        if key.startswith(("recall@", "ndcg@")):
            lines.append(f"- {key}: {summary[key]:.2%}")

    cats = result.get("categories") or {}
    if cats:
        lines.append("")
        lines.append("**By category:**")
        for cat, cm in sorted(cats.items()):
            r5 = cm.get("recall@5", 0)
            lines.append(f"- {cat} (n={cm['count']}): recall@5={r5:.0%}")

        # Weakest category: a missing recall@5 counts as 1.0 so that it is
        # never picked ahead of a category that has a real score.
        weakest_name, weakest = min(cats.items(), key=lambda kv: kv[1].get("recall@5", 1.0))
        if weakest.get("recall@5", 1.0) < 0.8:
            lines.append(f"\n**Weakest**: {weakest_name} ({weakest.get('recall@5', 0):.0%})")

    return "\n".join(lines)
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
def main() -> None:
    """Command-line entry point: run the benchmark and print the result.

    Parses --db, --benchmark, --k (comma-separated integers) and --format,
    runs evaluate_retrieval, and prints either the text report or the raw
    result as indented JSON.
    """
    cli = argparse.ArgumentParser(description="Retrieval Quality Evaluator")
    cli.add_argument("--db", required=True, help="Path to knowledge-graph.db")
    cli.add_argument("--benchmark", required=True, help="Path to retrieval_benchmark.jsonl")
    cli.add_argument("--k", default="1,3,5", help="Comma-separated k values for Recall@k")
    cli.add_argument("--format", choices=["json", "text"], default="json", help="Output format")
    opts = cli.parse_args()

    cutoffs = list(map(int, opts.k.split(",")))
    outcome = evaluate_retrieval(opts.benchmark, opts.db, cutoffs)

    if opts.format == "text":
        rendered = format_report_text(outcome)
    else:
        rendered = json.dumps(outcome, indent=2, ensure_ascii=False)
    print(rendered)
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
if __name__ == "__main__":
|
|
186
|
-
main()
|
|
@@ -1,247 +0,0 @@
|
|
|
1
|
-
"""JSON Schema definitions for all sinain-koog script outputs.
|
|
2
|
-
|
|
3
|
-
Each schema corresponds to the JSON printed by output_json() in its respective
|
|
4
|
-
script. Used by tick_evaluator.py for mechanical validation (Tier 1 eval).
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import json
|
|
8
|
-
from typing import Any
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
# ---------------------------------------------------------------------------
|
|
12
|
-
# signal_analyzer.py output
|
|
13
|
-
# ---------------------------------------------------------------------------
|
|
14
|
-
|
|
15
|
-
# Expected shape of the signal_analyzer output: all three top-level keys are
# required.
SIGNAL_ANALYZER_SCHEMA: dict = {
    "type": "object",
    "required": ["signals", "recommendedAction", "idle"],
    "properties": {
        "signals": {
            "type": "array",
            "items": {"type": "string"},
        },
        # Either null, or an object that names one of the listed actions;
        # "task" and "confidence" are optional on that object.
        "recommendedAction": {
            "oneOf": [
                {"type": "null"},
                {
                    "type": "object",
                    "required": ["action"],
                    "properties": {
                        "action": {"enum": ["sessions_spawn", "telegram_tip", "skip"]},
                        "task": {"type": "string"},
                        "confidence": {"type": "number", "minimum": 0, "maximum": 1},
                    },
                },
            ],
        },
        "idle": {"type": "boolean"},
    },
}
|
|
40
|
-
|
|
41
|
-
# ---------------------------------------------------------------------------
|
|
42
|
-
# feedback_analyzer.py output
|
|
43
|
-
# ---------------------------------------------------------------------------
|
|
44
|
-
|
|
45
|
-
# Expected shape of the feedback_analyzer output. "interpretation" is the
# only optional top-level key.
FEEDBACK_ANALYZER_SCHEMA: dict = {
    "type": "object",
    "required": ["feedbackScores", "effectiveness", "curateDirective"],
    "properties": {
        # Only "avg" is required; "high" and "low" are optional string lists.
        "feedbackScores": {
            "type": "object",
            "required": ["avg"],
            "properties": {
                "avg": {"type": "number"},
                "high": {"type": "array", "items": {"type": "string"}},
                "low": {"type": "array", "items": {"type": "string"}},
            },
        },
        # Non-negative counters plus a rate constrained to [0, 1].
        "effectiveness": {
            "type": "object",
            "required": ["outputs", "positive", "negative", "neutral", "rate"],
            "properties": {
                "outputs": {"type": "integer", "minimum": 0},
                "positive": {"type": "integer", "minimum": 0},
                "negative": {"type": "integer", "minimum": 0},
                "neutral": {"type": "integer", "minimum": 0},
                "rate": {"type": "number", "minimum": 0, "maximum": 1},
            },
        },
        "curateDirective": {
            "enum": ["aggressive_prune", "normal", "stability", "insufficient_data"],
        },
        "interpretation": {"type": "string"},
    },
}
|
|
75
|
-
|
|
76
|
-
# ---------------------------------------------------------------------------
|
|
77
|
-
# memory_miner.py output
|
|
78
|
-
# ---------------------------------------------------------------------------
|
|
79
|
-
|
|
80
|
-
# Expected shape of the memory_miner output: "findings" and "newPatterns" are
# required; the remaining string lists are optional.
MEMORY_MINER_SCHEMA: dict = {
    "type": "object",
    "required": ["findings", "newPatterns"],
    "properties": {
        "findings": {"type": "string"},
        "newPatterns": {"type": "array", "items": {"type": "string"}},
        "contradictions": {"type": "array", "items": {"type": "string"}},
        "preferences": {"type": "array", "items": {"type": "string"}},
        "minedSources": {"type": "array", "items": {"type": "string"}},
    },
}
|
|
91
|
-
|
|
92
|
-
# ---------------------------------------------------------------------------
|
|
93
|
-
# playbook_curator.py output
|
|
94
|
-
# ---------------------------------------------------------------------------
|
|
95
|
-
|
|
96
|
-
# Expected shape of the playbook_curator output: "changes" and
# "playbookLines" are required; "staleItemActions" and "error" are optional.
PLAYBOOK_CURATOR_SCHEMA: dict = {
    "type": "object",
    "required": ["changes", "playbookLines"],
    "properties": {
        # All three change lists must be present, even when empty.
        "changes": {
            "type": "object",
            "required": ["added", "pruned", "promoted"],
            "properties": {
                "added": {"type": "array", "items": {"type": "string"}},
                "pruned": {"type": "array", "items": {"type": "string"}},
                "promoted": {"type": "array", "items": {"type": "string"}},
            },
        },
        "staleItemActions": {"type": "array", "items": {"type": "string"}},
        "playbookLines": {"type": "integer", "minimum": 0},
        "error": {"type": "string"},
    },
}
|
|
114
|
-
|
|
115
|
-
# ---------------------------------------------------------------------------
|
|
116
|
-
# insight_synthesizer.py output (non-skip case)
|
|
117
|
-
# ---------------------------------------------------------------------------
|
|
118
|
-
|
|
119
|
-
# Expected shape of the insight_synthesizer output. Only "skip" is required;
# the schema does not tie the other fields to the value of "skip".
INSIGHT_SYNTHESIZER_SCHEMA: dict = {
    "type": "object",
    "required": ["skip"],
    "properties": {
        "skip": {"type": "boolean"},
        "suggestion": {"type": "string"},
        "insight": {"type": "string"},
        "totalChars": {"type": "integer", "minimum": 0},
        "skipReason": {"type": "string"},
    },
}
|
|
130
|
-
|
|
131
|
-
# ---------------------------------------------------------------------------
|
|
132
|
-
# module_manager.py extract output
|
|
133
|
-
# ---------------------------------------------------------------------------
|
|
134
|
-
|
|
135
|
-
# Expected shape of the module_manager extract output: "extracted", "domain"
# and "status" are required; the counters and paths are optional.
MODULE_EXTRACT_SCHEMA: dict = {
    "type": "object",
    "required": ["extracted", "domain", "status"],
    "properties": {
        "extracted": {"type": "string"},
        "domain": {"type": "string"},
        "patternsEstablished": {"type": "integer", "minimum": 0},
        "patternsEmerging": {"type": "integer", "minimum": 0},
        "vocabularyTerms": {"type": "integer", "minimum": 0},
        "modulePath": {"type": "string"},
        "status": {"enum": ["suspended", "active"]},
        "activateWith": {"type": "string"},
    },
}
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
# ---------------------------------------------------------------------------
|
|
152
|
-
# Registry: script name → schema
|
|
153
|
-
# ---------------------------------------------------------------------------
|
|
154
|
-
|
|
155
|
-
# Lookup table from script name to the schema its JSON output is validated
# against.
SCHEMA_REGISTRY: dict[str, dict] = {
    "signal_analyzer": SIGNAL_ANALYZER_SCHEMA,
    "feedback_analyzer": FEEDBACK_ANALYZER_SCHEMA,
    "memory_miner": MEMORY_MINER_SCHEMA,
    "playbook_curator": PLAYBOOK_CURATOR_SCHEMA,
    "insight_synthesizer": INSIGHT_SYNTHESIZER_SCHEMA,
    "module_extract": MODULE_EXTRACT_SCHEMA,
}
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
# ---------------------------------------------------------------------------
|
|
166
|
-
# Lightweight JSON Schema validator (no external dependency)
|
|
167
|
-
# ---------------------------------------------------------------------------
|
|
168
|
-
|
|
169
|
-
def validate(instance: Any, schema: dict) -> list[str]:
    """Check *instance* against a subset of JSON Schema.

    Supported keywords: type, required, properties, items, enum, oneOf,
    minimum, maximum.

    Returns:
        One message per violation, each prefixed with the path of the
        offending value; an empty list means the instance is valid.
    """
    problems: list[str] = []
    # The recursive worker appends to the list in place, starting at the root.
    _validate(instance, schema, "", problems)
    return problems
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
def _validate(instance: Any, schema: dict, path: str, errors: list[str]) -> None:
    """Recursively validate *instance*, appending messages to *errors*.

    Args:
        instance: Value to check.
        schema: Schema (subset of JSON Schema) for this value.
        path: Dotted / indexed location of *instance* inside the document;
            the empty string denotes the root and is reported as ".".
        errors: Output list, mutated in place.

    Keyword handling, in order: oneOf and enum are exclusive (the function
    returns after either); a failed type check stops further checks of this
    value; required, properties, items and minimum/maximum are then applied
    where the instance has a matching Python type.
    """
    where = path or "."

    # --- oneOf: exactly one variant must accept the instance ---
    if "oneOf" in schema:
        matches = 0
        for sub in schema["oneOf"]:
            # Validate into a scratch list so variant failures do not leak
            # into the caller's errors.
            sub_errors: list[str] = []
            _validate(instance, sub, path, sub_errors)
            if not sub_errors:
                matches += 1
        if matches == 0:
            errors.append(f"{where}: does not match any oneOf variant")
        elif matches > 1:
            errors.append(f"{where}: matches {matches} oneOf variants, expected exactly one")
        return

    # --- enum ---
    if "enum" in schema:
        if instance not in schema["enum"]:
            errors.append(f"{where}: {instance!r} not in {schema['enum']}")
        return

    # --- type ---
    expected_type = schema.get("type")
    if expected_type:
        if not _type_check(instance, expected_type):
            errors.append(f"{where}: expected {expected_type}, got {type(instance).__name__}")
            # The remaining keywords assume the right type; stop here.
            return

    # --- required ---
    if "required" in schema and isinstance(instance, dict):
        for key in schema["required"]:
            if key not in instance:
                errors.append(f"{path}.{key}: required field missing")

    # --- properties: only keys that are present are checked ---
    if "properties" in schema and isinstance(instance, dict):
        for key, sub_schema in schema["properties"].items():
            if key in instance:
                _validate(instance[key], sub_schema, f"{path}.{key}", errors)

    # --- items ---
    if "items" in schema and isinstance(instance, list):
        for i, item in enumerate(instance):
            _validate(item, schema["items"], f"{path}[{i}]", errors)

    # --- minimum / maximum ---
    # bool is a subclass of int in Python but is not a JSON number, so it is
    # excluded from range checks.
    if isinstance(instance, (int, float)) and not isinstance(instance, bool):
        if "minimum" in schema and instance < schema["minimum"]:
            errors.append(f"{where}: {instance} < minimum {schema['minimum']}")
        if "maximum" in schema and instance > schema["maximum"]:
            errors.append(f"{where}: {instance} > maximum {schema['maximum']}")
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
def _type_check(instance: Any, expected: str) -> bool:
|
|
233
|
-
if expected == "object":
|
|
234
|
-
return isinstance(instance, dict)
|
|
235
|
-
if expected == "array":
|
|
236
|
-
return isinstance(instance, list)
|
|
237
|
-
if expected == "string":
|
|
238
|
-
return isinstance(instance, str)
|
|
239
|
-
if expected == "number":
|
|
240
|
-
return isinstance(instance, (int, float))
|
|
241
|
-
if expected == "integer":
|
|
242
|
-
return isinstance(instance, int) and not isinstance(instance, bool)
|
|
243
|
-
if expected == "boolean":
|
|
244
|
-
return isinstance(instance, bool)
|
|
245
|
-
if expected == "null":
|
|
246
|
-
return instance is None
|
|
247
|
-
return True
|
|
File without changes
|