@geravant/sinain 1.10.0 → 1.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,186 @@
1
+ #!/usr/bin/env python3
2
+ """Retrieval Quality Evaluator — Recall@k and NDCG@k for knowledge graph queries.
3
+
4
+ Inspired by mempalace's LongMemEval benchmark infrastructure. Measures whether the
5
+ right knowledge surfaces when the agent needs it, complementing sinain's existing
6
+ output quality evaluation (schemas + assertions + LLM judges).
7
+
8
+ Usage:
9
+ python3 eval/retrieval_evaluator.py \
10
+ --db memory/knowledge-graph.db \
11
+ --benchmark eval/retrieval_benchmark.jsonl \
12
+ [--k 1,3,5] [--format json|text]
13
+
14
+ Benchmark dataset format (JSONL):
15
+ {"query": "OCR pipeline stalls on macOS 14", "expected_entities": ["fact:sck-capture-fix"], "category": "error-resolution"}
16
+ """
17
+
18
+ import argparse
19
+ import json
20
+ import math
21
+ import sys
22
+ from collections import defaultdict
23
+ from pathlib import Path
24
+
25
+
26
def load_benchmark(path: str) -> list[dict]:
    """Load benchmark QA pairs from a JSONL file.

    Every non-blank line is parsed as one JSON document; blank and
    whitespace-only lines are skipped.

    Args:
        path: Filesystem path of the JSONL benchmark file.

    Returns:
        The parsed JSON values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Pin the encoding: without it open() falls back to the locale's
    # preferred encoding, which can mis-decode non-ASCII queries.
    with open(path, encoding="utf-8") as f:
        stripped_lines = (raw.strip() for raw in f)
        return [json.loads(line) for line in stripped_lines if line]
35
+
36
+
37
def extract_keywords(query: str) -> list[str]:
    """Pull searchable keywords out of a natural language query.

    The query is lowercased and tokenized into runs that start with a letter
    and continue with letters, digits or hyphens. Tokens of two characters or
    fewer and common English stopwords are discarded; the remaining tokens
    are returned in their original order (duplicates are kept).
    """
    import re

    ignored = {"the", "is", "in", "on", "for", "and", "or", "of", "to", "a", "an", "it", "was", "not", "how", "what", "when", "does"}
    keywords = []
    for token in re.findall(r"[a-zA-Z][a-zA-Z0-9-]+", query.lower()):
        if len(token) <= 2:
            continue
        if token in ignored:
            continue
        keywords.append(token)
    return keywords
43
+
44
+
45
def dcg_at_k(relevant_positions: list[int], k: int) -> float:
    """Return the Discounted Cumulative Gain of the hits inside the top k.

    Each position is a 0-indexed rank of a relevant result. A hit at rank r
    contributes 1 / log2(r + 2), so rank 0 is worth 1.0 and later ranks are
    worth progressively less. Positions at or beyond k contribute nothing.
    """
    in_window = [rank for rank in relevant_positions if rank < k]
    total = 0.0
    for rank in in_window:
        # Ranks are 0-indexed, hence the +2 (rank 0 -> log2(2) == 1).
        total += 1.0 / math.log2(rank + 2)
    return total
52
+
53
+
54
def ndcg_at_k(relevant_positions: list[int], num_relevant: int, k: int) -> float:
    """Return DCG@k normalized by the best DCG achievable at k.

    The ideal ranking places all `num_relevant` items (capped at k) at the
    top of the list. When that ideal score is zero (no relevant items, or a
    non-positive k) the result is 0.0 rather than a division error.
    """
    actual = dcg_at_k(relevant_positions, k)
    # Best case: every relevant item occupies ranks 0 .. min(num_relevant, k) - 1.
    best_case_ranks = list(range(min(num_relevant, k)))
    ideal = dcg_at_k(best_case_ranks, k)
    if ideal > 0:
        return actual / ideal
    return 0.0
61
+
62
+
63
def evaluate_retrieval(
    benchmark_path: str,
    db_path: str,
    k_values: list[int] | None = None,
) -> dict:
    """Run benchmark queries against graph_query.py, compute Recall@k and NDCG@k.

    For every benchmark item the query is reduced to keywords, the knowledge
    graph is queried for up to max(k_values) facts, and the returned
    ``entityId`` values are compared with the item's ``expected_entities``.

    Note that "recall@k" here is a hit rate: it is 1.0 when at least one
    expected entity appears in the top k and 0.0 otherwise. It is not the
    fraction of expected entities retrieved.

    Args:
        benchmark_path: Path to the JSONL benchmark file.
        db_path: Path to the knowledge graph database passed to
            ``query_facts_by_entities``.
        k_values: Cutoffs to evaluate. Defaults to ``[1, 3, 5]``.

    Returns:
        ``{"error": ...}`` when the benchmark is empty; otherwise a dict with
        ``summary`` (counts plus mean metric values), ``categories`` (the same
        means per category plus ``count``) and ``details`` (one entry per
        evaluated query). Items with no keywords or no expected entities are
        skipped and therefore not counted in ``evaluated``.

    Raises:
        ValueError: If ``k_values`` is empty.
    """
    # Import graph_query from parent dir
    sys.path.insert(0, str(Path(__file__).parent.parent))
    from graph_query import query_facts_by_entities

    # A None sentinel avoids sharing one mutable default list across calls.
    if k_values is None:
        k_values = [1, 3, 5]

    items = load_benchmark(benchmark_path)
    if not items:
        return {"error": "Empty benchmark dataset"}
    if not k_values:
        raise ValueError("k_values must contain at least one cutoff")

    max_k = max(k_values)
    metrics: dict[str, list[float]] = defaultdict(list)
    category_metrics: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
    details: list[dict] = []

    for item in items:
        query = item["query"]
        expected = set(item.get("expected_entities", []))
        category = item.get("category", "general")
        keywords = extract_keywords(query)

        # Nothing to search for, or nothing to score against.
        if not keywords or not expected:
            continue

        results = query_facts_by_entities(db_path, keywords, max_facts=max_k)
        result_ids = [r["entityId"] for r in results]

        # 0-indexed ranks at which an expected entity was returned.
        relevant_positions = [i for i, rid in enumerate(result_ids) if rid in expected]

        for k in k_values:
            hit = any(pos < k for pos in relevant_positions)
            recall = 1.0 if hit else 0.0
            ndcg = ndcg_at_k(relevant_positions, len(expected), k)

            metrics[f"recall@{k}"].append(recall)
            metrics[f"ndcg@{k}"].append(ndcg)
            category_metrics[category][f"recall@{k}"].append(recall)
            category_metrics[category][f"ndcg@{k}"].append(ndcg)

        details.append({
            "query": query,
            "category": category,
            "expected": list(expected),
            "retrieved": result_ids[:max_k],
            # Fixed cutoffs, independent of k_values; only max_k results were
            # requested, so hit@5 can only see that many.
            "hit@1": any(pos < 1 for pos in relevant_positions),
            "hit@5": any(pos < 5 for pos in relevant_positions),
        })

    # Aggregate
    summary = {
        "total_queries": len(items),
        "evaluated": len(details),
    }
    for metric_name, values in sorted(metrics.items()):
        summary[metric_name] = round(sum(values) / len(values), 4) if values else 0.0

    # Per-category breakdown
    categories = {}
    for cat, cat_metrics in sorted(category_metrics.items()):
        categories[cat] = {
            # Every metric list of a category has one entry per evaluated query.
            "count": len(next(iter(cat_metrics.values()))),
        }
        for metric_name, values in sorted(cat_metrics.items()):
            categories[cat][metric_name] = round(sum(values) / len(values), 4) if values else 0.0

    return {
        "summary": summary,
        "categories": categories,
        "details": details,
    }
141
+
142
+
143
def format_report_text(result: dict) -> str:
    """Format evaluation result as human-readable text for daily report injection.

    Args:
        result: The dict produced by ``evaluate_retrieval``: either the full
            ``summary``/``categories``/``details`` structure or the
            ``{"error": ...}`` form returned for an empty benchmark.

    Returns:
        A Markdown fragment headed "## Retrieval Quality". It lists every
        ``recall@``/``ndcg@`` summary metric, a per-category recall@5 line,
        and the weakest category when its recall@5 is below 80%. When the
        result has no summary, the heading is followed by the error message.
    """
    lines = ["## Retrieval Quality"]

    s = result.get("summary")
    if s is None:
        # evaluate_retrieval returns {"error": ...} without a summary for an
        # empty benchmark; report that instead of failing with KeyError.
        lines.append(f"- {result.get('error', 'no summary available')}")
        return "\n".join(lines)

    for key in sorted(s):
        if key.startswith(("recall@", "ndcg@")):
            lines.append(f"- {key}: {s[key]:.2%}")

    if result.get("categories"):
        lines.append("")
        lines.append("**By category:**")
        for cat, cm in sorted(result["categories"].items()):
            r5 = cm.get("recall@5", 0)
            lines.append(f"- {cat} (n={cm['count']}): recall@5={r5:.0%}")

    # Weakest category
    cats = result.get("categories", {})
    if cats:
        # Categories without a recall@5 value count as perfect (1.0) so they
        # are never reported as the weakest.
        weakest = min(cats.items(), key=lambda x: x[1].get("recall@5", 1.0))
        if weakest[1].get("recall@5", 1.0) < 0.8:
            lines.append(f"\n**Weakest**: {weakest[0]} ({weakest[1].get('recall@5', 0):.0%})")

    return "\n".join(lines)
166
+
167
+
168
def main() -> None:
    """Command-line entry point: evaluate a benchmark and print the result.

    Parses ``--db``, ``--benchmark``, ``--k`` and ``--format``, runs
    ``evaluate_retrieval`` and prints either the text report or the result
    as indented JSON. A malformed ``--k`` is reported through argparse
    (usage message, exit status 2) instead of an uncaught traceback.
    """
    parser = argparse.ArgumentParser(description="Retrieval Quality Evaluator")
    parser.add_argument("--db", required=True, help="Path to knowledge-graph.db")
    parser.add_argument("--benchmark", required=True, help="Path to retrieval_benchmark.jsonl")
    parser.add_argument("--k", default="1,3,5", help="Comma-separated k values for Recall@k")
    parser.add_argument("--format", choices=["json", "text"], default="json", help="Output format")
    args = parser.parse_args()

    try:
        # Empty segments (e.g. a trailing comma) are ignored.
        k_values = [int(part) for part in args.k.split(",") if part.strip()]
    except ValueError:
        parser.error(f"--k must be a comma-separated list of integers, got {args.k!r}")
    if not k_values:
        parser.error("--k must list at least one value")

    result = evaluate_retrieval(args.benchmark, args.db, k_values)

    if args.format == "text":
        print(format_report_text(result))
    else:
        print(json.dumps(result, indent=2, ensure_ascii=False))
183
+
184
+
185
+ if __name__ == "__main__":
186
+ main()
@@ -154,6 +154,37 @@ def format_facts_text(facts: list[dict], max_chars: int = 500) -> str:
154
154
  return "\n".join(lines)
155
155
 
156
156
 
157
def format_facts_compact(facts: list[dict], max_chars: int = 400) -> str:
    """Encode facts for efficient escalation context injection.

    Compact format: domain/entity: value (conf,Nx)
    Inspired by mempalace AAAK compression — fits 3-5x more facts per token budget.

    Args:
        facts: Fact dicts. The keys read are ``entityId``, ``value``,
            ``confidence``, ``reinforce_count`` and ``domain``; all are
            optional.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        The encoded facts joined by "; ", or "" when ``facts`` is empty or
        the first fact alone exceeds the budget. Encoding stops at the first
        fact that would push the output past ``max_chars``.
    """
    if not facts:
        return ""

    def _text(raw: object) -> str:
        # Tolerate missing/None and non-string values (e.g. numbers from JSON).
        return "" if raw is None else str(raw)

    separator = "; "
    lines: list[str] = []
    total = 0
    for f in facts:
        # Keep only the part after the last ":" (drops the "fact:" style prefix).
        entity = _text(f.get("entityId")).split(":")[-1][:20]
        value = _text(f.get("value"))[:60]
        conf = f.get("confidence", "?")
        count = f.get("reinforce_count", "1")
        domain = f.get("domain", "")

        if domain:
            line = f"{domain}/{entity}: {value} ({conf},{count}x)"
        else:
            line = f"{entity}: {value} ({conf},{count}x)"

        # A separator is only emitted between facts, so the first fact must
        # not be charged for one.
        cost = len(line) + (len(separator) if lines else 0)
        if total + cost > max_chars:
            break
        lines.append(line)
        total += cost

    return separator.join(lines)
186
+
187
+
157
188
  def domain_fact_counts(db_path: str) -> dict[str, int]:
158
189
  """Count facts per domain for module emergence detection."""
159
190
  if not Path(db_path).exists():
@@ -184,7 +215,7 @@ def main() -> None:
184
215
  parser.add_argument("--top", type=int, default=None, help="Query top-N facts by confidence")
185
216
  parser.add_argument("--domain-counts", action="store_true", help="Show fact counts per domain")
186
217
  parser.add_argument("--max-facts", type=int, default=5, help="Maximum facts to return")
187
- parser.add_argument("--format", choices=["text", "json"], default="json", help="Output format")
218
+ parser.add_argument("--format", choices=["text", "json", "compact"], default="json", help="Output format")
188
219
  args = parser.parse_args()
189
220
 
190
221
  if args.domain_counts:
@@ -202,6 +233,8 @@ def main() -> None:
202
233
 
203
234
  if args.format == "text":
204
235
  print(format_facts_text(facts))
236
+ elif args.format == "compact":
237
+ print(format_facts_compact(facts))
205
238
  else:
206
239
  print(json.dumps({"facts": facts, "count": len(facts)}, indent=2, ensure_ascii=False))
207
240
 
@@ -117,6 +117,55 @@ def _fact_id(entity: str, attribute: str, value: str) -> str:
117
117
  return f"fact:{slug}-{h}"
118
118
 
119
119
 
120
def _normalize_entity(name: str) -> str:
    """Return the canonical form of an entity name.

    The name is lowercased, spaces and underscores become hyphens, and every
    character other than a-z, 0-9 and "-" is removed.
    """
    hyphenated = name.lower().translate(str.maketrans(" _", "--"))
    return re.sub(r"[^a-z0-9-]", "", hyphenated)
123
+
124
+
125
def _canonicalize_ops(ops: list[dict], existing_entities: list[str]) -> list[dict]:
    """Map variant entity names to canonical forms before graph execution.

    Inspired by mempalace entity detection — uses a simple heuristic instead
    of rule-based signal detection: names are normalized, then merged on an
    exact or substring match. A duplicate assert is converted to a reinforce
    of the matching existing entity.

    Args:
        ops: Graph operations. Only ops whose ``op`` is "assert" are examined;
            all others pass through untouched.
        existing_entities: Identifiers already present in the graph. They are
            normalized as given. NOTE(review): callers appear to pass entity
            ids rather than bare entity names — confirm the matching quality.

    Returns:
        A new list in the original order. Each matched assert is replaced by
        ``{"op": "reinforce", "entityId": <existing id>}``.
    """
    # Minimum normalized length, on BOTH sides, for a substring match. This
    # stops very short or empty names from matching nearly everything.
    min_substring_len = 4

    # normalized → existing entity name (later duplicates win)
    canonical_map: dict[str, str] = {
        _normalize_entity(eid): eid for eid in existing_entities
    }

    result = []
    for op in ops:
        if op.get("op") != "assert":
            result.append(op)
            continue

        entity = op.get("entity", "")
        normalized = _normalize_entity(entity)

        # An exact match always wins, wherever it sits in the map; the
        # substring scan runs only when there is none.
        matched_id = canonical_map.get(normalized)
        if matched_id is None and len(normalized) >= min_substring_len:
            for existing_norm, existing_eid in canonical_map.items():
                if len(existing_norm) < min_substring_len:
                    continue
                # Substring match: "react-router" matches "react-router-dom"
                if normalized in existing_norm or existing_norm in normalized:
                    matched_id = existing_eid
                    break

        if matched_id:
            # Convert assert → reinforce (entity already exists under different name)
            result.append({"op": "reinforce", "entityId": matched_id})
            print(f"  [canon] merged '{entity}' → existing '{matched_id}'", file=sys.stderr)
        else:
            result.append(op)
            # Register the new canonical form so later ops in this batch merge into it
            canonical_map[normalized] = _fact_id(entity, op.get("attribute", ""), op.get("value", ""))

    return result
167
+
168
+
120
169
  def _load_graph_facts(db_path: str, entities: list[str] | None = None, limit: int = 50) -> list[dict]:
121
170
  """Load relevant facts from the knowledge graph for LLM context."""
122
171
  if not Path(db_path).exists():
@@ -180,6 +229,11 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str) -> dict:
180
229
  try:
181
230
  from triplestore import TripleStore
182
231
  store = TripleStore(db_path)
232
+
233
+ # Canonicalize entity names to prevent fragmentation
234
+ existing_ids = [r[0] for r in store.entities_with_attr("entity")]
235
+ ops = _canonicalize_ops(ops, existing_ids)
236
+
183
237
  stats = {"asserted": 0, "reinforced": 0, "retracted": 0}
184
238
 
185
239
  for op_data in ops:
@@ -21,6 +21,7 @@ Self-test:
21
21
  """
22
22
 
23
23
  import json
24
+ import math
24
25
  import os
25
26
  import sqlite3
26
27
  import sys
@@ -51,6 +52,7 @@ CREATE TABLE IF NOT EXISTS triples (
51
52
  value_type TEXT NOT NULL DEFAULT 'string',
52
53
  retracted INTEGER NOT NULL DEFAULT 0,
53
54
  retracted_tx INTEGER,
55
+ valid_to TEXT,
54
56
  created_at TEXT NOT NULL
55
57
  );
56
58
 
@@ -88,6 +90,24 @@ def _entity_type(entity_id: str) -> str:
88
90
  return entity_id[:colon] if colon > 0 else "unknown"
89
91
 
90
92
 
93
def decayed_confidence(
    confidence: float, created_at: str, half_life_days: int = 60
) -> float:
    """Apply exponential time-decay to a confidence score.

    Facts lose half their confidence every `half_life_days` without reinforcement.
    Inspired by mempalace's temporal validity model.

    Args:
        confidence: The stored confidence value.
        created_at: ISO-8601 timestamp; a trailing "Z" is accepted. A
            timestamp without an offset is interpreted as UTC.
        half_life_days: Days after which confidence is halved. A non-positive
            value disables decay.

    Returns:
        The decayed confidence. The input is returned unchanged when the
        timestamp cannot be parsed, when the fact is less than one whole day
        old (or dated in the future), or when decay is disabled.
    """
    if half_life_days <= 0:
        # Avoids division by zero, and confidence growing on a negative value.
        return confidence
    try:
        created = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
    except (ValueError, TypeError, AttributeError):
        # Unparseable or non-string timestamp: best effort, leave undecayed.
        return confidence
    if created.tzinfo is None:
        # Naive and aware datetimes cannot be subtracted; treat naive as UTC
        # so such facts still decay.
        created = created.replace(tzinfo=timezone.utc)
    # Age is counted in whole elapsed days.
    age_days = (datetime.now(timezone.utc) - created).days
    if age_days <= 0:
        return confidence
    # Exact halving at each half-life (no rounded ln 2 constant).
    return confidence * 0.5 ** (age_days / half_life_days)
109
+
110
+
91
111
  class TripleStore:
92
112
  """SQLite-backed EAV triple store with WAL mode and 4 covering indexes."""
93
113
 
@@ -99,8 +119,16 @@ class TripleStore:
99
119
  self._conn.execute("PRAGMA journal_mode=WAL")
100
120
  self._conn.execute("PRAGMA busy_timeout=10000")
101
121
  self._conn.executescript(_SCHEMA_SQL)
122
+ self._migrate()
102
123
  self._conn.commit()
103
124
 
125
def _migrate(self) -> None:
    """Bring an existing database up to the current schema.

    Adds the ``valid_to`` column to ``triples`` when the table does not
    have it yet; otherwise does nothing. Does not commit.
    """
    table_info = self._conn.execute("PRAGMA table_info(triples)").fetchall()
    # Index 1 of each PRAGMA table_info row is the column name.
    existing_columns = {column[1] for column in table_info}
    if "valid_to" in existing_columns:
        return
    self._conn.execute("ALTER TABLE triples ADD COLUMN valid_to TEXT")
131
+
104
132
  def close(self) -> None:
105
133
  self._conn.close()
106
134
 
@@ -173,21 +201,22 @@ class TripleStore:
173
201
  ) -> int:
174
202
  """Retract triples matching entity+attribute (and optionally value).
175
203
 
176
- Sets retracted=1 and retracted_tx to the retraction transaction.
204
+ Sets retracted=1, retracted_tx, and valid_to (temporal closure).
177
205
  The original tx_id is preserved for temporal (as_of_tx) queries.
178
206
  Returns the count of triples retracted.
179
207
  """
208
+ now = _now_iso()
180
209
  if value is not None:
181
210
  cur = self._conn.execute(
182
- "UPDATE triples SET retracted = 1, retracted_tx = ? "
211
+ "UPDATE triples SET retracted = 1, retracted_tx = ?, valid_to = ? "
183
212
  "WHERE entity_id = ? AND attribute = ? AND value = ? AND retracted = 0",
184
- (tx_id, entity_id, attribute, value),
213
+ (tx_id, now, entity_id, attribute, value),
185
214
  )
186
215
  else:
187
216
  cur = self._conn.execute(
188
- "UPDATE triples SET retracted = 1, retracted_tx = ? "
217
+ "UPDATE triples SET retracted = 1, retracted_tx = ?, valid_to = ? "
189
218
  "WHERE entity_id = ? AND attribute = ? AND retracted = 0",
190
- (tx_id, entity_id, attribute),
219
+ (tx_id, now, entity_id, attribute),
191
220
  )
192
221
  self._conn.commit()
193
222
  return cur.rowcount
@@ -220,6 +249,26 @@ class TripleStore:
220
249
  result.setdefault(row["attribute"], []).append(row["value"])
221
250
  return result
222
251
 
252
def entity_as_of(self, entity_id: str, date: datetime) -> dict[str, list[str]]:
    """Return entity attributes as they were on a specific date.

    Uses created_at and valid_to for date-based temporal queries
    (vs as_of_tx which uses transaction ordering).

    A triple is visible at ``date`` when it was created on or before that
    moment and either (a) its ``valid_to`` is later than ``date``, which
    includes triples retracted afterwards, or (b) it has no ``valid_to`` and
    is not retracted. Retracted triples with no ``valid_to`` have no known
    end of validity and are excluded.

    Args:
        entity_id: Entity whose triples are read.
        date: Point in time to query. Timezone-aware values are converted to
            UTC; naive values are taken to already be UTC.

    Returns:
        Mapping of attribute name to its values, ordered by attribute and
        then row id.
    """
    if date.tzinfo is not None:
        # The "Z" suffix below is only truthful for UTC.
        date = date.astimezone(timezone.utc)
    # Timestamps are compared as strings.
    # NOTE(review): assumes created_at/valid_to are stored in this same
    # fixed-width UTC format — confirm against _now_iso().
    date_iso = date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    rows = self._conn.execute(
        "SELECT attribute, value FROM triples "
        "WHERE entity_id = ? AND created_at <= ? "
        "AND (valid_to > ? OR (valid_to IS NULL AND retracted = 0)) "
        "ORDER BY attribute, id",
        (entity_id, date_iso, date_iso),
    ).fetchall()
    result: dict[str, list[str]] = {}
    for row in rows:
        result.setdefault(row["attribute"], []).append(row["value"])
    return result
271
+
223
272
  # ----- Query: AEVT (attribute scan) -----
224
273
 
225
274
  def entities_with_attr(
@@ -473,6 +522,28 @@ def _self_test() -> None:
473
522
  assert "priority" in ent_before, "as_of_tx should see pre-retraction state"
474
523
  print(" [OK] as_of_tx isolation")
475
524
 
525
+ # valid_to set on retraction
526
+ retracted_row = store._conn.execute(
527
+ "SELECT valid_to FROM triples WHERE entity_id = 'signal:2026-03-01' AND attribute = 'priority'"
528
+ ).fetchone()
529
+ assert retracted_row and retracted_row["valid_to"] is not None, "valid_to should be set on retraction"
530
+ print(" [OK] valid_to set on retraction")
531
+
532
+ # entity_as_of
533
+ future = datetime.now(timezone.utc) + timedelta(days=1)
534
+ ent_future = store.entity_as_of("signal:2026-03-01", future)
535
+ assert "description" in ent_future, "entity_as_of should find active triples"
536
+ assert "priority" not in ent_future, "entity_as_of should exclude retracted triples"
537
+ print(" [OK] entity_as_of temporal query")
538
+
539
+ # Confidence decay utility
540
+ fresh_conf = decayed_confidence(0.8, _now_iso())
541
+ assert abs(fresh_conf - 0.8) < 0.01, f"Fresh fact should keep confidence: {fresh_conf}"
542
+ old_date = (datetime.now(timezone.utc) - timedelta(days=60)).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
543
+ old_conf = decayed_confidence(0.8, old_date)
544
+ assert 0.35 < old_conf < 0.45, f"60-day-old fact should decay to ~0.4: {old_conf}"
545
+ print(f" [OK] Confidence decay: fresh=0.8→{fresh_conf:.2f}, 60d=0.8→{old_conf:.2f}")
546
+
476
547
  # GC (retracted triples are fresh, so gc with 0 days should get them)
477
548
  gc_count = store.gc(older_than_days=0)
478
549
  assert gc_count >= 1