@geravant/sinain 1.13.0 → 1.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +4 -2
- package/config-shared.js +1 -0
- package/package.json +4 -1
- package/sinain-agent/run.sh +36 -4
- package/sinain-core/src/buffers/feed-buffer.ts +6 -4
- package/sinain-core/src/index.ts +50 -19
- package/sinain-memory/graph_query.py +12 -3
- package/sinain-memory/knowledge_integrator.py +194 -10
- package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/eval/__init__.py +0 -0
- package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/assertions.py +0 -267
- package/sinain-memory/eval/benchmarks/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
- package/sinain-memory/eval/benchmarks/config.py +0 -23
- package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
- package/sinain-memory/eval/benchmarks/ingest.py +0 -152
- package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
- package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
- package/sinain-memory/eval/benchmarks/meeting_adapter.py +0 -81
- package/sinain-memory/eval/benchmarks/meeting_runner.py +0 -230
- package/sinain-memory/eval/benchmarks/query.py +0 -193
- package/sinain-memory/eval/benchmarks/report.py +0 -87
- package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +0 -318
- package/sinain-memory/eval/benchmarks/runner.py +0 -283
- package/sinain-memory/eval/judges/__init__.py +0 -0
- package/sinain-memory/eval/judges/base_judge.py +0 -61
- package/sinain-memory/eval/judges/curation_judge.py +0 -46
- package/sinain-memory/eval/judges/insight_judge.py +0 -48
- package/sinain-memory/eval/judges/mining_judge.py +0 -42
- package/sinain-memory/eval/judges/signal_judge.py +0 -45
- package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
- package/sinain-memory/eval/retrieval_evaluator.py +0 -186
- package/sinain-memory/eval/schemas.py +0 -247
- package/sinain-memory/tests/__init__.py +0 -0
- package/sinain-memory/tests/conftest.py +0 -189
- package/sinain-memory/tests/test_curator_helpers.py +0 -94
- package/sinain-memory/tests/test_embedder.py +0 -210
- package/sinain-memory/tests/test_extract_json.py +0 -124
- package/sinain-memory/tests/test_feedback_computation.py +0 -121
- package/sinain-memory/tests/test_miner_helpers.py +0 -71
- package/sinain-memory/tests/test_module_management.py +0 -458
- package/sinain-memory/tests/test_parsers.py +0 -96
- package/sinain-memory/tests/test_tick_evaluator.py +0 -430
- package/sinain-memory/tests/test_triple_extractor.py +0 -255
- package/sinain-memory/tests/test_triple_ingest.py +0 -191
- package/sinain-memory/tests/test_triple_migrate.py +0 -138
- package/sinain-memory/tests/test_triplestore.py +0 -248
package/.env.example
CHANGED
|
@@ -23,9 +23,11 @@ PRIVACY_MODE=standard # off | standard | strict | paranoid
|
|
|
23
23
|
# paranoid: almost nothing leaves your machine
|
|
24
24
|
|
|
25
25
|
# ── Agent ────────────────────────────────────────────────────────────────────
|
|
26
|
-
SINAIN_AGENT=claude # claude | codex | junie | goose | aider | <custom command>
|
|
27
|
-
# MCP agents (claude, codex, junie, goose) call sinain tools directly
|
|
26
|
+
SINAIN_AGENT=claude # claude | openclaude | codex | junie | goose | aider | <custom command>
|
|
27
|
+
# MCP agents (claude, openclaude, codex, junie, goose) call sinain tools directly
|
|
28
28
|
# Pipe agents (aider, custom) receive escalation text on stdin
|
|
29
|
+
# openclaude: set OPENAI_BASE_URL=http://localhost:11434/v1 + OPENAI_MODEL=<ollama-model>
|
|
30
|
+
# to route through local Ollama. Run.sh auto-warms the model on startup.
|
|
29
31
|
SINAIN_CORE_URL=http://localhost:9500
|
|
30
32
|
SINAIN_POLL_INTERVAL=5 # seconds between escalation polls
|
|
31
33
|
SINAIN_HEARTBEAT_INTERVAL=900 # seconds between heartbeat ticks (15 min)
|
package/config-shared.js
CHANGED
|
@@ -459,6 +459,7 @@ export async function stepAgent(existing, label = "Bare agent") {
|
|
|
459
459
|
message: label,
|
|
460
460
|
options: [
|
|
461
461
|
{ value: "claude", label: "Claude Code", hint: "Calls sinain tools directly — recommended" },
|
|
462
|
+
{ value: "openclaude", label: "OpenClaude", hint: "Claude Code clone, local-first (Ollama/OpenAI-compat)" },
|
|
462
463
|
{ value: "codex", label: "Codex", hint: "Calls sinain tools directly" },
|
|
463
464
|
{ value: "goose", label: "Goose", hint: "Calls sinain tools directly" },
|
|
464
465
|
{ value: "junie", label: "Junie", hint: "JetBrains IDE agent" },
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@geravant/sinain",
|
|
3
|
-
"version": "1.13.0",
|
|
3
|
+
"version": "1.14.0",
|
|
4
4
|
"description": "Ambient intelligence that sees what you see, hears what you hear, and acts on your behalf",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -25,6 +25,9 @@
|
|
|
25
25
|
"index.ts",
|
|
26
26
|
"openclaw.plugin.json",
|
|
27
27
|
"sinain-memory",
|
|
28
|
+
"!sinain-memory/eval",
|
|
29
|
+
"!sinain-memory/tests",
|
|
30
|
+
"!sinain-memory/**/__pycache__",
|
|
28
31
|
"sinain-knowledge",
|
|
29
32
|
"sinain-core/src",
|
|
30
33
|
"sinain-core/package.json",
|
package/sinain-agent/run.sh
CHANGED
|
@@ -55,7 +55,7 @@ fi
|
|
|
55
55
|
JUNIE_HAS_MCP=false # set during startup checks
|
|
56
56
|
agent_has_mcp() {
|
|
57
57
|
case "$AGENT" in
|
|
58
|
-
claude|codex|goose) return 0 ;;
|
|
58
|
+
claude|openclaude|codex|goose) return 0 ;;
|
|
59
59
|
junie) $JUNIE_HAS_MCP ;;
|
|
60
60
|
*) return 1 ;;
|
|
61
61
|
esac
|
|
@@ -67,11 +67,11 @@ agent_has_mcp() {
|
|
|
67
67
|
invoke_agent() {
|
|
68
68
|
local prompt="$1"
|
|
69
69
|
case "$AGENT" in
|
|
70
|
-
claude)
|
|
70
|
+
claude|openclaude)
|
|
71
71
|
local turns="${2:-$AGENT_MAX_TURNS}"
|
|
72
72
|
if [ -n "${SINAIN_SPAWN:-}" ]; then
|
|
73
73
|
# Spawn: PreToolUse hook routes permission prompts to overlay HUD
|
|
74
|
-
|
|
74
|
+
"$AGENT" \
|
|
75
75
|
--mcp-config "$MCP_CONFIG" \
|
|
76
76
|
--settings "$SCRIPT_DIR/.claude/settings.json" \
|
|
77
77
|
${ALLOWED_TOOLS:+--allowedTools $ALLOWED_TOOLS} \
|
|
@@ -79,7 +79,7 @@ invoke_agent() {
|
|
|
79
79
|
-p "$prompt"
|
|
80
80
|
else
|
|
81
81
|
# Escalation: auto-approve for speed (short-lived, read-heavy)
|
|
82
|
-
|
|
82
|
+
"$AGENT" --enable-auto-mode \
|
|
83
83
|
--mcp-config "$MCP_CONFIG" \
|
|
84
84
|
${ALLOWED_TOOLS:+--allowedTools $ALLOWED_TOOLS} \
|
|
85
85
|
--max-turns "$turns" --output-format text \
|
|
@@ -227,6 +227,38 @@ print(' sinain extension added to ' + config_path)
|
|
|
227
227
|
fi
|
|
228
228
|
fi
|
|
229
229
|
|
|
230
|
+
# Ollama warmup — pin the backing model so each agent invocation hits hot weights.
|
|
231
|
+
# openclaude + Ollama via the OpenAI-compat endpoint does NOT forward keep_alive,
|
|
232
|
+
# so we ping Ollama's native /api/generate once with keep_alive=-1 (persistent).
|
|
233
|
+
# Applies to any agent pointed at an Ollama-compatible endpoint via OPENAI_BASE_URL.
|
|
234
|
+
OLLAMA_WARMUP="${OLLAMA_WARMUP:-true}"
|
|
235
|
+
if [ "$OLLAMA_WARMUP" = "true" ] && [ -n "${OPENAI_BASE_URL:-}" ]; then
|
|
236
|
+
if [[ "$OPENAI_BASE_URL" == *"11434"* ]] || [[ "$OPENAI_BASE_URL" == *"ollama"* ]]; then
|
|
237
|
+
# Derive Ollama host by stripping /v1 suffix from OPENAI_BASE_URL
|
|
238
|
+
OLLAMA_HOST="${OLLAMA_HOST:-${OPENAI_BASE_URL%/v1*}}"
|
|
239
|
+
OLLAMA_MODEL="${OLLAMA_MODEL:-${OPENAI_MODEL:-}}"
|
|
240
|
+
OLLAMA_KEEP_ALIVE="${OLLAMA_KEEP_ALIVE:--1}" # -1 = persistent, or "24h", "30m", etc.
|
|
241
|
+
if [ -n "$OLLAMA_MODEL" ]; then
|
|
242
|
+
echo "Warming Ollama model $OLLAMA_MODEL at $OLLAMA_HOST (keep_alive=$OLLAMA_KEEP_ALIVE)..."
|
|
243
|
+
# Ollama accepts keep_alive as int (-1 = persistent) or duration string ("24h", "30m").
|
|
244
|
+
if [[ "$OLLAMA_KEEP_ALIVE" =~ ^-?[0-9]+$ ]]; then
|
|
245
|
+
WARMUP_PAYLOAD="{\"model\":\"$OLLAMA_MODEL\",\"prompt\":\"\",\"keep_alive\":$OLLAMA_KEEP_ALIVE,\"stream\":false}"
|
|
246
|
+
else
|
|
247
|
+
WARMUP_PAYLOAD="{\"model\":\"$OLLAMA_MODEL\",\"prompt\":\"\",\"keep_alive\":\"$OLLAMA_KEEP_ALIVE\",\"stream\":false}"
|
|
248
|
+
fi
|
|
249
|
+
if curl -sf -m 60 -X POST "$OLLAMA_HOST/api/generate" \
|
|
250
|
+
-H 'Content-Type: application/json' \
|
|
251
|
+
-d "$WARMUP_PAYLOAD" >/dev/null 2>&1; then
|
|
252
|
+
echo " ✓ Model pinned in memory"
|
|
253
|
+
else
|
|
254
|
+
echo " ⚠ Warmup failed — first request will cold-start the model"
|
|
255
|
+
fi
|
|
256
|
+
else
|
|
257
|
+
echo " ⚠ OLLAMA_WARMUP=true but OPENAI_MODEL not set — skipping warmup"
|
|
258
|
+
fi
|
|
259
|
+
fi
|
|
260
|
+
fi
|
|
261
|
+
|
|
230
262
|
# Agent mode label
|
|
231
263
|
if agent_has_mcp; then
|
|
232
264
|
AGENT_MODE="MCP"
|
|
package/sinain-core/src/buffers/feed-buffer.ts
CHANGED
|
@@ -48,12 +48,14 @@ export class FeedBuffer {
|
|
|
48
48
|
this.items.push(item);
|
|
49
49
|
if (this.items.length > this._hwm) this._hwm = this.items.length;
|
|
50
50
|
|
|
51
|
-
// Fire
|
|
52
|
-
//
|
|
51
|
+
// Fire when enough new items have arrived since last distillation.
|
|
52
|
+
// 20 items ≈ 1.7 min of audio at ~12 items/min transcription rate.
|
|
53
|
+
// Distillation takes ~7s, so 20-item threshold gives 100s gap — safe margin.
|
|
54
|
+
// This means ~35 passes/hour, leaving <20 items undistilled at shutdown.
|
|
53
55
|
const newSinceRearm = this._version - this._onFullVersion;
|
|
54
|
-
if (this.items.length >=
|
|
56
|
+
if (this.items.length >= 20
|
|
55
57
|
&& this._onFullCb && this._onFullArmed
|
|
56
|
-
&& newSinceRearm >=
|
|
58
|
+
&& newSinceRearm >= 20) {
|
|
57
59
|
this._onFullArmed = false;
|
|
58
60
|
const snapshot = [...this.items];
|
|
59
61
|
queueMicrotask(() => this._onFullCb!(snapshot));
|
package/sinain-core/src/index.ts
CHANGED
|
@@ -67,35 +67,66 @@ async function queryKnowledgeFactsMulti(entities: string[], maxFacts: number): P
|
|
|
67
67
|
];
|
|
68
68
|
const scriptPath = scriptCandidates.find(p => existsSync(p)) || scriptCandidates[0];
|
|
69
69
|
|
|
70
|
-
|
|
70
|
+
// Step 1: Get candidates from Python (RRF-ranked, no embedding — avoids deadlock)
|
|
71
|
+
// Request 2x candidates in JSON for re-ranking in Node.js
|
|
72
|
+
const candidateFacts: Array<Record<string, string>> = [];
|
|
71
73
|
for (const dbPath of dbPaths) {
|
|
72
74
|
if (!existsSync(dbPath)) continue;
|
|
73
75
|
try {
|
|
74
|
-
const args = [scriptPath, "--db", dbPath, "--max-facts", String(maxFacts), "--format", "
|
|
76
|
+
const args = [scriptPath, "--db", dbPath, "--max-facts", String(maxFacts * 2), "--format", "json"];
|
|
75
77
|
if (entities.length > 0) args.push("--entities", JSON.stringify(entities));
|
|
76
78
|
const out = execFileSync("python3", args, { timeout: 5000, encoding: "utf-8" }).trim();
|
|
77
|
-
if (out)
|
|
79
|
+
if (out) {
|
|
80
|
+
const parsed = JSON.parse(out);
|
|
81
|
+
const facts = parsed.facts || parsed;
|
|
82
|
+
if (Array.isArray(facts)) candidateFacts.push(...facts);
|
|
83
|
+
}
|
|
78
84
|
} catch { /* skip failed db */ }
|
|
79
85
|
}
|
|
80
86
|
|
|
81
|
-
if (
|
|
82
|
-
if (results.length === 1) return results[0];
|
|
87
|
+
if (candidateFacts.length === 0) return "";
|
|
83
88
|
|
|
84
|
-
//
|
|
85
|
-
const
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
const
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
89
|
+
// Step 2: Re-rank by embedding similarity in-process (no deadlock — model is in this process)
|
|
90
|
+
const queryText = entities.join(" ");
|
|
91
|
+
try {
|
|
92
|
+
if (embeddingService?.ready) {
|
|
93
|
+
const allTexts = [queryText, ...candidateFacts.map(f => f.value || "")];
|
|
94
|
+
const embeddings = await embeddingService.embed(allTexts);
|
|
95
|
+
const queryEmb = embeddings[0];
|
|
96
|
+
const scored = candidateFacts.map((f, i) => ({
|
|
97
|
+
fact: f,
|
|
98
|
+
sim: EmbeddingService.cosine(queryEmb, embeddings[i + 1]),
|
|
99
|
+
}));
|
|
100
|
+
scored.sort((a, b) => b.sim - a.sim);
|
|
101
|
+
candidateFacts.length = 0;
|
|
102
|
+
candidateFacts.push(...scored.slice(0, maxFacts).map(s => s.fact));
|
|
94
103
|
}
|
|
104
|
+
} catch { /* embedding unavailable — use RRF order */ }
|
|
105
|
+
|
|
106
|
+
// Step 3: Format as compact text
|
|
107
|
+
const seen = new Set<string>();
|
|
108
|
+
const lines: string[] = [];
|
|
109
|
+
let total = 0;
|
|
110
|
+
const maxChars = 1200;
|
|
111
|
+
for (const f of candidateFacts.slice(0, maxFacts)) {
|
|
112
|
+
const eid = ((f as any).entity_id || (f as any).entityId || "").split(":").pop()?.slice(0, 20) || "?";
|
|
113
|
+
const value = (f as any).value || "";
|
|
114
|
+
const conf = (f as any).confidence || "?";
|
|
115
|
+
const count = (f as any).reinforce_count || "1";
|
|
116
|
+
const line = `${eid}: ${value} (${conf},${count}x)`;
|
|
117
|
+
const key = value.slice(0, 60);
|
|
118
|
+
if (seen.has(key)) continue;
|
|
119
|
+
seen.add(key);
|
|
120
|
+
if (total + line.length + 2 > maxChars) break;
|
|
121
|
+
lines.push(line);
|
|
122
|
+
total += line.length + 2;
|
|
95
123
|
}
|
|
96
|
-
return
|
|
124
|
+
return lines.join("; ");
|
|
97
125
|
}
|
|
98
126
|
|
|
127
|
+
// Reference to embedding service — set during init
|
|
128
|
+
let embeddingService: import("./embedding/service.js").EmbeddingService | null = null;
|
|
129
|
+
|
|
99
130
|
/** List all entities from both local and workspace knowledge graphs. */
|
|
100
131
|
async function listKnowledgeEntitiesMulti(max: number): Promise<string> {
|
|
101
132
|
const { execFileSync } = await import("node:child_process");
|
|
@@ -340,7 +371,7 @@ async function main() {
|
|
|
340
371
|
: null;
|
|
341
372
|
|
|
342
373
|
// ── Initialize embedding service (non-blocking) ──
|
|
343
|
-
|
|
374
|
+
embeddingService = new EmbeddingService();
|
|
344
375
|
embeddingService.loadAsync(); // ~9s background load, server starts immediately
|
|
345
376
|
|
|
346
377
|
// ── Initialize local knowledge pipeline ──
|
|
@@ -683,8 +714,8 @@ async function main() {
|
|
|
683
714
|
},
|
|
684
715
|
getSpawnPending: () => escalator.getSpawnPending(),
|
|
685
716
|
respondSpawn: (id: string, result: string) => escalator.respondSpawn(id, result),
|
|
686
|
-
embedTexts: (texts: string[]) => embeddingService
|
|
687
|
-
isEmbeddingReady: () => embeddingService
|
|
717
|
+
embedTexts: (texts: string[]) => embeddingService!.embed(texts),
|
|
718
|
+
isEmbeddingReady: () => embeddingService?.ready ?? false,
|
|
688
719
|
});
|
|
689
720
|
|
|
690
721
|
// ── Wire overlay profiling ──
|
|
package/sinain-memory/graph_query.py
CHANGED
|
@@ -330,6 +330,10 @@ def query_facts_hybrid(
|
|
|
330
330
|
if eid and eid not in fact_map:
|
|
331
331
|
fact_map[eid] = f
|
|
332
332
|
|
|
333
|
+
# Return top RRF candidates. Embedding re-ranking is done by the caller
|
|
334
|
+
# (sinain-core Node.js) to avoid deadlock — the Python subprocess can't call
|
|
335
|
+
# back to sinain-core's /embed endpoint while sinain-core is blocked waiting
|
|
336
|
+
# for the subprocess.
|
|
333
337
|
results = [fact_map[eid] for eid in sorted_ids[:max_facts] if eid in fact_map]
|
|
334
338
|
|
|
335
339
|
# Expand top results with 1-hop graph neighbors
|
|
@@ -396,7 +400,7 @@ def format_facts_text(facts: list[dict], max_chars: int = 500) -> str:
|
|
|
396
400
|
return "\n".join(lines)
|
|
397
401
|
|
|
398
402
|
|
|
399
|
-
def format_facts_compact(facts: list[dict], max_chars: int = 400) -> str:
|
|
403
|
+
def format_facts_compact(facts: list[dict], max_chars: int = 1200) -> str:
|
|
400
404
|
"""Encode facts for efficient escalation context injection.
|
|
401
405
|
|
|
402
406
|
Compact format: domain/entity: value (conf, Nx)
|
|
@@ -409,7 +413,7 @@ def format_facts_compact(facts: list[dict], max_chars: int = 400) -> str:
|
|
|
409
413
|
total = 0
|
|
410
414
|
for f in facts:
|
|
411
415
|
entity = f.get("entityId", "").split(":")[-1][:20]
|
|
412
|
-
value = f.get("value", "")
|
|
416
|
+
value = f.get("value", "")
|
|
413
417
|
conf = f.get("confidence", "?")
|
|
414
418
|
count = f.get("reinforce_count", "1")
|
|
415
419
|
domain = f.get("domain", "")
|
|
@@ -469,7 +473,12 @@ def main() -> None:
|
|
|
469
473
|
facts = query_top_facts(args.db, limit=args.top)
|
|
470
474
|
elif args.entities:
|
|
471
475
|
entities = json.loads(args.entities)
|
|
472
|
-
|
|
476
|
+
# Use hybrid retrieval (FTS5 + tags + entity graph + RRF) for best results
|
|
477
|
+
query_text = " ".join(entities)
|
|
478
|
+
facts = query_facts_hybrid(args.db, query_text, max_facts=args.max_facts)
|
|
479
|
+
# Fallback to tag-only if hybrid returns nothing
|
|
480
|
+
if not facts:
|
|
481
|
+
facts = query_facts_by_entities(args.db, entities, max_facts=args.max_facts)
|
|
473
482
|
else:
|
|
474
483
|
facts = query_top_facts(args.db, limit=args.max_facts)
|
|
475
484
|
|
|
package/sinain-memory/knowledge_integrator.py
CHANGED
|
@@ -21,7 +21,9 @@ import json
|
|
|
21
21
|
import re
|
|
22
22
|
import shutil
|
|
23
23
|
import sys
|
|
24
|
+
import unicodedata
|
|
24
25
|
from datetime import datetime, timezone
|
|
26
|
+
from difflib import SequenceMatcher
|
|
25
27
|
from pathlib import Path
|
|
26
28
|
|
|
27
29
|
from common import (
|
|
@@ -121,9 +123,50 @@ def _fact_id(entity: str, attribute: str, value: str) -> str:
|
|
|
121
123
|
return f"fact:{slug}-{h}"
|
|
122
124
|
|
|
123
125
|
|
|
126
|
+
_UNICODE_PRE_MAP = str.maketrans({"ß": "ss", "ẞ": "SS"})
|
|
127
|
+
|
|
128
|
+
|
|
124
129
|
def _normalize_entity(name: str) -> str:
|
|
125
|
-
"""Normalize entity name to canonical form: lowercase, hyphenated,
|
|
126
|
-
|
|
130
|
+
"""Normalize entity name to canonical form: lowercase, hyphenated, ASCII-transliterated."""
|
|
131
|
+
s = name.translate(_UNICODE_PRE_MAP)
|
|
132
|
+
s = unicodedata.normalize("NFKD", s)
|
|
133
|
+
s = s.encode("ascii", "ignore").decode("ascii")
|
|
134
|
+
s = s.lower().replace(" ", "-").replace("_", "-")
|
|
135
|
+
s = re.sub(r"[^a-z0-9-]", "", s)
|
|
136
|
+
s = re.sub(r"-{2,}", "-", s)
|
|
137
|
+
return s.strip("-")
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _find_matching_entity(
|
|
141
|
+
name: str,
|
|
142
|
+
existing_names: dict[str, str],
|
|
143
|
+
) -> str | None:
|
|
144
|
+
"""Find an existing entity that fuzzy-matches `name`. Returns entity_node_id or None."""
|
|
145
|
+
if name in existing_names:
|
|
146
|
+
return existing_names[name]
|
|
147
|
+
|
|
148
|
+
# Hyphen-insensitive exact match (chatgpt == chat-gpt)
|
|
149
|
+
name_compact = name.replace("-", "")
|
|
150
|
+
for existing_name, node_id in existing_names.items():
|
|
151
|
+
if existing_name.replace("-", "") == name_compact:
|
|
152
|
+
return node_id
|
|
153
|
+
|
|
154
|
+
# Edit-distance fuzzy match
|
|
155
|
+
if len(name) < 3:
|
|
156
|
+
return None
|
|
157
|
+
threshold = 0.90
|
|
158
|
+
best_match = None
|
|
159
|
+
best_ratio = threshold
|
|
160
|
+
for existing_name, node_id in existing_names.items():
|
|
161
|
+
if len(existing_name) < 3:
|
|
162
|
+
continue
|
|
163
|
+
if frozenset({name, existing_name}) in _DEDUP_SKIP_PAIRS:
|
|
164
|
+
continue
|
|
165
|
+
ratio = SequenceMatcher(None, name, existing_name).ratio()
|
|
166
|
+
if ratio >= best_ratio:
|
|
167
|
+
best_ratio = ratio
|
|
168
|
+
best_match = node_id
|
|
169
|
+
return best_match
|
|
127
170
|
|
|
128
171
|
|
|
129
172
|
def _canonicalize_ops(ops: list[dict], existing_entities: list[str], existing_facts: list[dict]) -> list[dict]:
|
|
@@ -528,7 +571,14 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_ent
|
|
|
528
571
|
# --- Build entity graph layer (two-layer model) ---
|
|
529
572
|
if digest_entities and stats["asserted"] > 0:
|
|
530
573
|
try:
|
|
531
|
-
#
|
|
574
|
+
# Load existing entity names for fuzzy matching
|
|
575
|
+
all_entity_nodes: dict[str, str] = {} # {name: entity_node_id}
|
|
576
|
+
for r in store.entities_with_attr("name"):
|
|
577
|
+
if r[0].startswith("entity:"):
|
|
578
|
+
all_entity_nodes[r[1]] = r[0]
|
|
579
|
+
|
|
580
|
+
# Create entity:* nodes from digest entities (with fuzzy dedup)
|
|
581
|
+
entity_resolve: dict[str, str] = {} # {normalized_name: resolved_node_id}
|
|
532
582
|
for ent in (digest_entities or []):
|
|
533
583
|
if isinstance(ent, dict):
|
|
534
584
|
ename = _normalize_entity(ent.get("name", ""))
|
|
@@ -539,12 +589,22 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_ent
|
|
|
539
589
|
if not ename or len(ename) < 2:
|
|
540
590
|
continue
|
|
541
591
|
|
|
592
|
+
# Check for fuzzy match against existing entities
|
|
593
|
+
matched_id = _find_matching_entity(ename, all_entity_nodes)
|
|
594
|
+
if matched_id:
|
|
595
|
+
entity_resolve[ename] = matched_id
|
|
596
|
+
if matched_id != f"entity:{ename}":
|
|
597
|
+
print(f" [graph] alias: \"{ename}\" → {matched_id}", file=sys.stderr)
|
|
598
|
+
continue
|
|
599
|
+
|
|
542
600
|
entity_node_id = f"entity:{ename}"
|
|
543
601
|
existing = store.entity(entity_node_id)
|
|
544
602
|
if not existing:
|
|
545
603
|
tx = store.begin_tx("entity_graph")
|
|
546
604
|
store.assert_triple(tx, entity_node_id, "name", ename)
|
|
547
605
|
store.assert_triple(tx, entity_node_id, "type", etype)
|
|
606
|
+
all_entity_nodes[ename] = entity_node_id
|
|
607
|
+
entity_resolve[ename] = entity_node_id
|
|
548
608
|
|
|
549
609
|
# Link facts to their entity nodes via "about" ref edges
|
|
550
610
|
for op_data in ops:
|
|
@@ -554,18 +614,13 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_ent
|
|
|
554
614
|
value = op_data.get("value", "")
|
|
555
615
|
attribute = op_data.get("attribute", "")
|
|
556
616
|
fact_eid = _fact_id(entity, attribute, value)
|
|
557
|
-
|
|
617
|
+
norm_entity = _normalize_entity(entity)
|
|
618
|
+
entity_node_id = entity_resolve.get(norm_entity, f"entity:{norm_entity}")
|
|
558
619
|
# Only link if entity node exists
|
|
559
620
|
if store.entity(entity_node_id):
|
|
560
621
|
tx = store.begin_tx("entity_graph")
|
|
561
622
|
store.assert_triple(tx, fact_eid, "about", entity_node_id, value_type="ref")
|
|
562
623
|
|
|
563
|
-
# Infer cross-entity refs from fact content
|
|
564
|
-
all_entity_nodes = {}
|
|
565
|
-
for r in store.entities_with_attr("name"):
|
|
566
|
-
if r[0].startswith("entity:"):
|
|
567
|
-
all_entity_nodes[r[1]] = r[0] # {name: entity_id}
|
|
568
|
-
|
|
569
624
|
ref_count = 0
|
|
570
625
|
for fact_eid_row in store.entities_with_attr("value"):
|
|
571
626
|
fact_eid = fact_eid_row[0]
|
|
@@ -695,17 +750,146 @@ def _bootstrap_graph(memory_dir: str, db_path: str) -> dict:
|
|
|
695
750
|
return {"bootstrapped": stats.get("asserted", 0)}
|
|
696
751
|
|
|
697
752
|
|
|
753
|
+
# Pairs that fuzzy matching incorrectly clusters — reviewed and confirmed distinct.
|
|
754
|
+
_DEDUP_SKIP_PAIRS = {
|
|
755
|
+
frozenset({"ai-driven-development", "spac-driven-development"}),
|
|
756
|
+
frozenset({"german", "germany"}),
|
|
757
|
+
frozenset({"llama", "ollama"}),
|
|
758
|
+
frozenset({"gemma", "gemma4"}),
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
|
|
762
|
+
def merge_entity_duplicates(db_path: str, dry_run: bool = True) -> dict:
|
|
763
|
+
"""Merge fragmented entity nodes using fuzzy matching.
|
|
764
|
+
|
|
765
|
+
Idempotent: checks for migration:entity-dedup-v1 stamp.
|
|
766
|
+
"""
|
|
767
|
+
from triplestore import TripleStore
|
|
768
|
+
store = TripleStore(db_path)
|
|
769
|
+
|
|
770
|
+
# Idempotency check
|
|
771
|
+
stamp = store.entity("migration:entity-dedup-v1")
|
|
772
|
+
if stamp:
|
|
773
|
+
print("migration:entity-dedup-v1 already applied — skipping", file=sys.stderr)
|
|
774
|
+
return {"status": "already_applied"}
|
|
775
|
+
|
|
776
|
+
# Load all entity nodes
|
|
777
|
+
all_entities: dict[str, str] = {} # {name: entity_node_id}
|
|
778
|
+
for entity_id, name in store.entities_with_attr("name"):
|
|
779
|
+
if entity_id.startswith("entity:"):
|
|
780
|
+
all_entities[name] = entity_id
|
|
781
|
+
|
|
782
|
+
print(f"Total entity nodes: {len(all_entities)}", file=sys.stderr)
|
|
783
|
+
|
|
784
|
+
# Build clusters via greedy matching
|
|
785
|
+
remaining = dict(all_entities) # copy
|
|
786
|
+
clusters: list[list[tuple[str, str]]] = [] # [[( name, node_id ), ...], ...]
|
|
787
|
+
|
|
788
|
+
while remaining:
|
|
789
|
+
seed_name, seed_id = next(iter(remaining.items()))
|
|
790
|
+
cluster = [(seed_name, seed_id)]
|
|
791
|
+
del remaining[seed_name]
|
|
792
|
+
|
|
793
|
+
# Find all matches for this seed
|
|
794
|
+
to_remove = []
|
|
795
|
+
for other_name, other_id in remaining.items():
|
|
796
|
+
matched = _find_matching_entity(other_name, {seed_name: seed_id})
|
|
797
|
+
if matched:
|
|
798
|
+
cluster.append((other_name, other_id))
|
|
799
|
+
to_remove.append(other_name)
|
|
800
|
+
for name in to_remove:
|
|
801
|
+
del remaining[name]
|
|
802
|
+
|
|
803
|
+
if len(cluster) > 1:
|
|
804
|
+
# Filter out known false-positive pairs
|
|
805
|
+
names_set = {n for n, _ in cluster}
|
|
806
|
+
if any(pair <= names_set for pair in _DEDUP_SKIP_PAIRS):
|
|
807
|
+
continue
|
|
808
|
+
clusters.append(cluster)
|
|
809
|
+
|
|
810
|
+
print(f"Found {len(clusters)} duplicate clusters", file=sys.stderr)
|
|
811
|
+
|
|
812
|
+
merge_count = 0
|
|
813
|
+
repoint_count = 0
|
|
814
|
+
|
|
815
|
+
for cluster in clusters:
|
|
816
|
+
# Canonical selection: if any entity has significantly more backrefs (5+),
|
|
817
|
+
# use it. Otherwise prefer longest name (most complete spelling).
|
|
818
|
+
max_refs = max(len(store.backrefs(nid)) for _, nid in cluster)
|
|
819
|
+
if max_refs >= 5:
|
|
820
|
+
cluster.sort(key=lambda x: (-len(store.backrefs(x[1])), -len(x[0]), x[0]))
|
|
821
|
+
else:
|
|
822
|
+
cluster.sort(key=lambda x: (-len(x[0]), x[0]))
|
|
823
|
+
canonical_name, canonical_id = cluster[0]
|
|
824
|
+
duplicates = cluster[1:]
|
|
825
|
+
|
|
826
|
+
dup_names = [d[0] for d in duplicates]
|
|
827
|
+
print(f" cluster: {canonical_name} ← {dup_names}", file=sys.stderr)
|
|
828
|
+
|
|
829
|
+
if dry_run:
|
|
830
|
+
merge_count += len(duplicates)
|
|
831
|
+
continue
|
|
832
|
+
|
|
833
|
+
for dup_name, dup_id in duplicates:
|
|
834
|
+
# Re-point all refs pointing to this duplicate
|
|
835
|
+
refs = store.backrefs(dup_id)
|
|
836
|
+
for src_entity, attr in refs:
|
|
837
|
+
tx = store.begin_tx("entity_dedup")
|
|
838
|
+
store.retract_triple(tx, src_entity, attr, dup_id)
|
|
839
|
+
store.assert_triple(tx, src_entity, attr, canonical_id, value_type="ref")
|
|
840
|
+
repoint_count += 1
|
|
841
|
+
|
|
842
|
+
# Retract all triples of the duplicate entity itself
|
|
843
|
+
dup_attrs = store.entity(dup_id)
|
|
844
|
+
tx = store.begin_tx("entity_dedup")
|
|
845
|
+
for attr, values in dup_attrs.items():
|
|
846
|
+
if not isinstance(values, list):
|
|
847
|
+
values = [values]
|
|
848
|
+
for val in values:
|
|
849
|
+
store.retract_triple(tx, dup_id, attr, str(val))
|
|
850
|
+
|
|
851
|
+
merge_count += 1
|
|
852
|
+
|
|
853
|
+
# Stamp migration
|
|
854
|
+
if not dry_run and clusters:
|
|
855
|
+
tx = store.begin_tx("entity_dedup")
|
|
856
|
+
store.assert_triple(tx, "migration:entity-dedup-v1", "applied_at",
|
|
857
|
+
datetime.now(timezone.utc).isoformat())
|
|
858
|
+
store.assert_triple(tx, "migration:entity-dedup-v1", "clusters_merged",
|
|
859
|
+
str(len(clusters)))
|
|
860
|
+
|
|
861
|
+
result = {
|
|
862
|
+
"status": "dry_run" if dry_run else "applied",
|
|
863
|
+
"clusters": len(clusters),
|
|
864
|
+
"entities_merged": merge_count,
|
|
865
|
+
"refs_repointed": repoint_count,
|
|
866
|
+
}
|
|
867
|
+
print(json.dumps(result, indent=2), file=sys.stderr)
|
|
868
|
+
return result
|
|
869
|
+
|
|
870
|
+
|
|
698
871
|
def main() -> None:
|
|
699
872
|
parser = argparse.ArgumentParser(description="Knowledge Integrator")
|
|
700
873
|
parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
|
|
701
874
|
parser.add_argument("--digest", default=None, help="SessionDigest JSON string")
|
|
702
875
|
parser.add_argument("--bootstrap", action="store_true", help="One-time: seed graph from playbook")
|
|
703
876
|
parser.add_argument("--retag", action="store_true", help="Re-extract tags for all existing facts")
|
|
877
|
+
parser.add_argument("--dedup-entities", action="store_true", help="Merge fragmented entity nodes")
|
|
878
|
+
parser.add_argument("--dry-run", action="store_true", help="Preview changes without applying")
|
|
704
879
|
args = parser.parse_args()
|
|
705
880
|
|
|
706
881
|
memory_dir = args.memory_dir
|
|
707
882
|
db_path = str(Path(memory_dir) / "knowledge-graph.db")
|
|
708
883
|
|
|
884
|
+
# Entity dedup mode: merge fragmented entity nodes
|
|
885
|
+
if args.dedup_entities:
|
|
886
|
+
if not Path(db_path).exists():
|
|
887
|
+
output_json({"error": "knowledge-graph.db not found"})
|
|
888
|
+
return
|
|
889
|
+
result = merge_entity_duplicates(db_path, dry_run=args.dry_run)
|
|
890
|
+
output_json(result)
|
|
891
|
+
return
|
|
892
|
+
|
|
709
893
|
# Bootstrap mode: seed graph from current playbook
|
|
710
894
|
if args.bootstrap:
|
|
711
895
|
result = _bootstrap_graph(memory_dir, db_path)
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
File without changes
|
|
Binary file
|