@geravant/sinain 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +183 -0
- package/index.ts +2096 -0
- package/install.js +155 -0
- package/openclaw.plugin.json +59 -0
- package/package.json +21 -0
- package/sinain-memory/common.py +403 -0
- package/sinain-memory/demo_knowledge_transfer.sh +85 -0
- package/sinain-memory/embedder.py +268 -0
- package/sinain-memory/eval/__init__.py +0 -0
- package/sinain-memory/eval/assertions.py +288 -0
- package/sinain-memory/eval/judges/__init__.py +0 -0
- package/sinain-memory/eval/judges/base_judge.py +61 -0
- package/sinain-memory/eval/judges/curation_judge.py +46 -0
- package/sinain-memory/eval/judges/insight_judge.py +48 -0
- package/sinain-memory/eval/judges/mining_judge.py +42 -0
- package/sinain-memory/eval/judges/signal_judge.py +45 -0
- package/sinain-memory/eval/schemas.py +247 -0
- package/sinain-memory/eval_delta.py +109 -0
- package/sinain-memory/eval_reporter.py +642 -0
- package/sinain-memory/feedback_analyzer.py +221 -0
- package/sinain-memory/git_backup.sh +19 -0
- package/sinain-memory/insight_synthesizer.py +181 -0
- package/sinain-memory/memory/2026-03-01.md +11 -0
- package/sinain-memory/memory/playbook-archive/sinain-playbook-2026-03-01-1418.md +15 -0
- package/sinain-memory/memory/playbook-logs/2026-03-01.jsonl +1 -0
- package/sinain-memory/memory/sinain-playbook.md +21 -0
- package/sinain-memory/memory-config.json +39 -0
- package/sinain-memory/memory_miner.py +183 -0
- package/sinain-memory/module_manager.py +695 -0
- package/sinain-memory/playbook_curator.py +225 -0
- package/sinain-memory/requirements.txt +3 -0
- package/sinain-memory/signal_analyzer.py +141 -0
- package/sinain-memory/test_local.py +402 -0
- package/sinain-memory/tests/__init__.py +0 -0
- package/sinain-memory/tests/conftest.py +189 -0
- package/sinain-memory/tests/test_curator_helpers.py +94 -0
- package/sinain-memory/tests/test_embedder.py +210 -0
- package/sinain-memory/tests/test_extract_json.py +124 -0
- package/sinain-memory/tests/test_feedback_computation.py +121 -0
- package/sinain-memory/tests/test_miner_helpers.py +71 -0
- package/sinain-memory/tests/test_module_management.py +458 -0
- package/sinain-memory/tests/test_parsers.py +96 -0
- package/sinain-memory/tests/test_tick_evaluator.py +430 -0
- package/sinain-memory/tests/test_triple_extractor.py +255 -0
- package/sinain-memory/tests/test_triple_ingest.py +191 -0
- package/sinain-memory/tests/test_triple_migrate.py +138 -0
- package/sinain-memory/tests/test_triplestore.py +248 -0
- package/sinain-memory/tick_evaluator.py +392 -0
- package/sinain-memory/triple_extractor.py +402 -0
- package/sinain-memory/triple_ingest.py +290 -0
- package/sinain-memory/triple_migrate.py +275 -0
- package/sinain-memory/triple_query.py +184 -0
- package/sinain-memory/triplestore.py +498 -0
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Triple Extractor — 3-tier extraction from sinain data into EAV triples.
|
|
3
|
+
|
|
4
|
+
Tier 1: JSON direct (~70%) — structured data maps directly to triples.
|
|
5
|
+
Tier 2: Regex + validate (~20%) — semi-structured text (playbooks, patterns.md).
|
|
6
|
+
Tier 3: LLM fallback (~10%) — free-form text where regex fails.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
from triple_extractor import TripleExtractor
|
|
10
|
+
extractor = TripleExtractor(store)
|
|
11
|
+
triples = extractor.extract_signal(signal_data, tick_ts)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import re
|
|
16
|
+
import sys
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import TYPE_CHECKING
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from triplestore import TripleStore
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class Triple:
    """A single entity-attribute-value triple to be asserted."""
    entity_id: str  # namespaced id, e.g. "pattern:frame-batching", "concept:ocr", "signal:<tick_ts>"
    attribute: str  # attribute name, e.g. "text", "source", "related_to"
    value: str  # attribute value; for refs this is the target entity_id
    value_type: str = "string"  # "string" by default; "ref" marks an entity reference
|
33
|
+
|
|
34
|
+
def _make_slug(text: str) -> str:
    """Convert text to a lowercase hyphen-separated slug.

    >>> _make_slug("Frame Batching Improves OCR")
    'frame-batching-improves-ocr'
    """
    # Collapse every run of non-alphanumeric characters into a single hyphen.
    normalized = text.lower().strip()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", normalized)
    # Drop edge hyphens and cap the slug length.
    return hyphenated.strip("-")[:80]
|
43
|
+
|
|
44
|
+
class TripleExtractor:
    """Extracts EAV triples from various sinain data sources.

    Three extraction tiers:
      Tier 1: JSON direct — structured dicts map straight to triples.
      Tier 2: Regex — semi-structured markdown (playbooks, patterns.md).
      Tier 3: LLM fallback — used only when tiers 1+2 yield too little
              from substantial text (best-effort; failures are logged
              to stderr and return no triples).
    """

    def __init__(self, store: "TripleStore") -> None:
        self.store = store
        # Lazily-built (lowercased name, entity_id) pairs for concept: entities.
        self._vocab_cache: list[tuple[str, str]] | None = None

    # ----- Tier 1: JSON direct extraction -----

    def extract_signal(self, signal_data: dict, tick_ts: str) -> list[Triple]:
        """Extract triples from a signal analysis result (Tier 1).

        Creates a signal:{tick_ts} entity plus concept refs for each
        signal description.
        """
        triples: list[Triple] = []
        entity_id = f"signal:{tick_ts}"

        # Core signal attributes
        if "sessionSummary" in signal_data:
            triples.append(Triple(entity_id, "summary", str(signal_data["sessionSummary"])))
        if "idle" in signal_data:
            triples.append(Triple(entity_id, "idle", str(signal_data["idle"]).lower()))

        # Individual signals (the index was unused, so we iterate directly).
        for sig in signal_data.get("signals", []):
            desc = sig.get("description", "")
            priority = sig.get("priority", "medium")
            if desc:
                triples.append(Triple(entity_id, "description", desc))
                triples.append(Triple(entity_id, "priority", priority))
                # Extract concepts from the signal description and link them.
                for concept_triple in self.extract_concepts(desc):
                    triples.append(concept_triple)
                    if concept_triple.entity_id.startswith("concept:"):
                        triples.append(Triple(
                            entity_id, "related_to", concept_triple.entity_id, "ref"
                        ))

        # Recommended action
        action = signal_data.get("recommendedAction")
        if action and isinstance(action, dict):
            triples.append(Triple(entity_id, "action", action.get("action", "")))
            if "task" in action:
                triples.append(Triple(entity_id, "action_task", action["task"]))
            if "confidence" in action:
                triples.append(Triple(entity_id, "action_confidence", str(action["confidence"])))

        # Playbook changes: each added entry becomes a pattern entity.
        changes = (signal_data.get("playbookChanges") or {}).get("changes", {})
        for added in changes.get("added", []):
            slug = _make_slug(added)
            if slug:
                pattern_id = f"pattern:{slug}"
                triples.append(Triple(pattern_id, "text", added))
                triples.append(Triple(pattern_id, "source", "signal_analyzer"))
                triples.append(Triple(entity_id, "added_pattern", pattern_id, "ref"))

        # Output suggestions/insights
        output = signal_data.get("output", {})
        if isinstance(output, dict):
            if output.get("suggestion"):
                triples.append(Triple(entity_id, "suggestion", output["suggestion"]))
            if output.get("insight"):
                triples.append(Triple(entity_id, "insight", output["insight"]))

        return triples

    def extract_session(self, session_data: dict) -> list[Triple]:
        """Extract triples from a session summary (Tier 1).

        Creates a session:{ts} entity with summary, tool refs, duration,
        and concept refs mined from the summary text.
        """
        triples: list[Triple] = []
        ts = session_data.get("ts", session_data.get("timestamp", "unknown"))
        entity_id = f"session:{ts}"

        # Either key may carry the summary; both are stored under "summary".
        if "summary" in session_data:
            triples.append(Triple(entity_id, "summary", session_data["summary"]))
        if "sessionSummary" in session_data:
            triples.append(Triple(entity_id, "summary", session_data["sessionSummary"]))

        # Tool usage: entries may be plain strings or {"name": ...} dicts.
        for tool in session_data.get("toolsUsed", []):
            tool_name = tool if isinstance(tool, str) else tool.get("name", "")
            if tool_name:
                tool_id = f"tool:{_make_slug(tool_name)}"
                triples.append(Triple(tool_id, "name", tool_name))
                triples.append(Triple(entity_id, "used_tool", tool_id, "ref"))

        # Duration
        if "durationMs" in session_data:
            triples.append(Triple(entity_id, "duration_ms", str(session_data["durationMs"])))

        # Extract concepts from the summary text and link them.
        summary_text = session_data.get("summary", session_data.get("sessionSummary", ""))
        if summary_text:
            for concept_triple in self.extract_concepts(summary_text):
                triples.append(concept_triple)
                if concept_triple.entity_id.startswith("concept:"):
                    triples.append(Triple(
                        entity_id, "related_to", concept_triple.entity_id, "ref"
                    ))

        return triples

    def extract_mining(self, mining_data: dict) -> list[Triple]:
        """Extract triples from memory mining results (Tier 1).

        New patterns, preferences, and contradictions all become
        pattern:{slug} entities; only new patterns also get concept
        extraction (matching the original pipeline behavior).
        """
        triples: list[Triple] = []

        for pattern_text in mining_data.get("newPatterns", []):
            slug = _make_slug(pattern_text)
            if not slug:
                continue
            pattern_id = f"pattern:{slug}"
            triples.append(Triple(pattern_id, "text", pattern_text))
            triples.append(Triple(pattern_id, "source", "memory_miner"))
            # Extract and link concepts from the pattern text.
            for ct in self.extract_concepts(pattern_text):
                triples.append(ct)
                if ct.entity_id.startswith("concept:"):
                    triples.append(Triple(pattern_id, "related_to", ct.entity_id, "ref"))

        for pref in mining_data.get("preferences", []):
            slug = _make_slug(pref)
            if slug:
                pref_id = f"pattern:{slug}"
                triples.append(Triple(pref_id, "text", pref))
                triples.append(Triple(pref_id, "source", "memory_miner"))
                triples.append(Triple(pref_id, "pattern_type", "preference"))

        for contradiction in mining_data.get("contradictions", []):
            slug = _make_slug(contradiction)
            if slug:
                c_id = f"pattern:{slug}"
                triples.append(Triple(c_id, "text", contradiction))
                triples.append(Triple(c_id, "source", "memory_miner"))
                triples.append(Triple(c_id, "pattern_type", "contradiction"))

        return triples

    # ----- Tier 2: Regex extraction -----

    def extract_playbook(self, playbook_text: str) -> list[Triple]:
        """Extract triples from playbook markdown (Tier 2: regex).

        Bullet pattern: ``- text (score: N.N)?`` (score optional).
        Falls back to Tier 3 (LLM) if fewer than 3 patterns were extracted
        from non-trivial input (>100 chars after stripping HTML comments).
        """
        triples: list[Triple] = []
        pattern_re = re.compile(r"^-\s+(.+?)(?:\s*\(score:\s*([\d.]+)\))?\s*$", re.MULTILINE)

        for match in pattern_re.finditer(playbook_text):
            text = match.group(1).strip()
            score = match.group(2)

            # Skip HTML comments and metadata bullets.
            if text.startswith(("<!--", "[since:")):
                continue

            slug = _make_slug(text)
            if not slug:
                continue

            pattern_id = f"pattern:{slug}"
            triples.append(Triple(pattern_id, "text", text))
            triples.append(Triple(pattern_id, "source", "playbook"))
            if score:
                triples.append(Triple(pattern_id, "score", score))

            # Extract and link concepts from the pattern text.
            for ct in self.extract_concepts(text):
                triples.append(ct)
                if ct.entity_id.startswith("concept:"):
                    triples.append(Triple(pattern_id, "related_to", ct.entity_id, "ref"))

        # Tier 3 fallback: regex found too little in non-trivial input.
        non_comment = re.sub(r"<!--.*?-->", "", playbook_text, flags=re.DOTALL).strip()
        if len(non_comment) > 100 and sum(1 for t in triples if t.attribute == "text") < 3:
            triples.extend(self._extract_patterns_llm(playbook_text))

        return triples

    def extract_module(
        self, module_id: str, manifest: dict, patterns_text: str,
        guidance_text: str = "",
    ) -> list[Triple]:
        """Extract triples from a module's manifest (Tier 1) + patterns.md + guidance.md (Tier 2).

        Creates a module:{id} entity, pattern entities from patterns.md,
        and guidance entities (remapped from pattern:) from guidance.md,
        all linked back to the module via belongs_to refs.
        """
        triples: list[Triple] = []
        entity_id = f"module:{module_id}"

        # Tier 1: manifest fields
        triples.append(Triple(entity_id, "name", manifest.get("name", module_id)))
        if "description" in manifest:
            triples.append(Triple(entity_id, "description", manifest["description"]))
        if "version" in manifest:
            triples.append(Triple(entity_id, "version", manifest["version"]))

        # Tier 2: extract patterns from patterns.md
        if patterns_text:
            for pt in self.extract_playbook(patterns_text):
                triples.append(pt)
                # Link each pattern entity to its owning module.
                if pt.attribute == "text" and pt.entity_id.startswith("pattern:"):
                    triples.append(Triple(pt.entity_id, "belongs_to", entity_id, "ref"))

        # Tier 2b: extract guidance items from guidance.md
        if guidance_text:
            for gt in self.extract_playbook(guidance_text):
                # Remap pattern: → guidance: entity prefix.
                if gt.entity_id.startswith("pattern:"):
                    gt = Triple(
                        gt.entity_id.replace("pattern:", "guidance:", 1),
                        gt.attribute, gt.value, gt.value_type,
                    )
                triples.append(gt)
                if gt.attribute == "text" and gt.entity_id.startswith("guidance:"):
                    triples.append(Triple(gt.entity_id, "type", "guidance"))
                    triples.append(Triple(gt.entity_id, "belongs_to", entity_id, "ref"))

        return triples

    # ----- Concept extraction (3-tier) -----

    def extract_concepts(self, text: str) -> list[Triple]:
        """Extract concept entities from text using a 3-tier strategy.

        Tier 1: substring match against the vocabulary cache from the store.
        Tier 2: regex noun-phrase extraction (capitalized phrases,
                hyphenated technical terms, ALL-CAPS acronyms).
        Tier 3: LLM fallback (only when tiers 1+2 yield nothing from
                substantial text, i.e. >100 chars).
        """
        concepts: set[str] = set()

        # Tier 1: vocabulary cache matching (known concept names as substrings).
        vocab = self._get_vocab_cache()
        text_lower = text.lower()
        for concept_name, concept_id in vocab:
            if concept_name in text_lower:
                concepts.add(concept_id)

        # Tier 2: regex noun-phrase extraction.
        noun_phrases = set()
        # Capitalized phrases (2+ words), e.g. "Frame Batching".
        for m in re.finditer(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b", text):
            noun_phrases.add(m.group(1))
        # Hyphenated technical terms, e.g. "frame-batching", "OCR-pipeline".
        for m in re.finditer(r"\b([a-zA-Z]+-[a-zA-Z]+(?:-[a-zA-Z]+)*)\b", text):
            term = m.group(1)
            if len(term) > 4:  # skip short terms like "is-a"
                noun_phrases.add(term)
        # ALL-CAPS acronyms (2+ chars), e.g. "OCR".
        for m in re.finditer(r"\b([A-Z]{2,})\b", text):
            noun_phrases.add(m.group(1))

        # Convert new noun phrases to concept triples. Vocab hits already
        # exist in the store; they get name triples in the pass below.
        triples: list[Triple] = []
        for phrase in noun_phrases:
            slug = _make_slug(phrase)
            if not slug or len(slug) < 2:
                continue
            concept_id = f"concept:{slug}"
            if concept_id not in concepts:
                concepts.add(concept_id)
                triples.append(Triple(concept_id, "name", phrase))

        # Ensure vocab-cache concepts also appear in the output so callers
        # can create refs to them (name falls back to the slug portion).
        for cid in concepts:
            if not any(t.entity_id == cid for t in triples):
                triples.append(Triple(cid, "name", cid.split(":", 1)[1] if ":" in cid else cid))

        # Tier 3: LLM fallback only if we found nothing from substantial text.
        if not concepts and len(text) > 100:
            triples.extend(self._extract_concepts_llm(text))

        return triples

    def _get_vocab_cache(self) -> list[tuple[str, str]]:
        """Load vocabulary from the store: (lowercased name, entity_id) for concept: entities.

        Cached for the extractor's lifetime; any store error yields an
        empty vocabulary (best-effort).
        """
        if self._vocab_cache is not None:
            return self._vocab_cache
        try:
            results = self.store.entities_with_attr("name")
            self._vocab_cache = [
                (val.lower(), eid)
                for eid, val in results
                if eid.startswith("concept:")
            ]
        except Exception:
            self._vocab_cache = []
        return self._vocab_cache

    # ----- Tier 3: LLM fallback -----

    def _extract_patterns_llm(self, text: str) -> list[Triple]:
        """Use an LLM to extract patterns from unstructured text (best-effort).

        Returns [] when the LLM helpers are unavailable or the call fails.
        """
        try:
            from common import call_llm_with_fallback, extract_json
        except ImportError:
            return []

        system = (
            "Extract actionable patterns from this text. Return JSON: "
            '{"patterns": ["pattern 1", "pattern 2", ...]}'
        )
        try:
            # Cap prompt size at 4000 chars.
            raw = call_llm_with_fallback(system, text[:4000], script="triple_extractor")
            data = extract_json(raw)
            triples: list[Triple] = []
            for p in data.get("patterns", []):
                slug = _make_slug(p)
                if slug:
                    pid = f"pattern:{slug}"
                    triples.append(Triple(pid, "text", p))
                    triples.append(Triple(pid, "source", "llm_extraction"))
            return triples
        except Exception as e:
            print(f"[warn] Tier 3 pattern extraction failed: {e}", file=sys.stderr)
            return []

    def _extract_concepts_llm(self, text: str) -> list[Triple]:
        """Use an LLM to extract concepts from text (best-effort).

        Returns [] when the LLM helpers are unavailable or the call fails.
        """
        try:
            from common import call_llm_with_fallback, extract_json
        except ImportError:
            return []

        system = (
            "Extract key concepts/entities from this text. Return JSON: "
            '{"concepts": ["concept 1", "concept 2", ...]}'
        )
        try:
            # Cap prompt size at 4000 chars.
            raw = call_llm_with_fallback(system, text[:4000], script="triple_extractor")
            data = extract_json(raw)
            triples: list[Triple] = []
            for c in data.get("concepts", []):
                slug = _make_slug(c)
                if slug:
                    cid = f"concept:{slug}"
                    triples.append(Triple(cid, "name", c))
            return triples
        except Exception as e:
            print(f"[warn] Tier 3 concept extraction failed: {e}", file=sys.stderr)
            return []
|
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Triple Ingest — CLI entry point for ingesting data into the triple store.
|
|
3
|
+
|
|
4
|
+
Called by the sinain-hud plugin via runScript() for fire-and-forget ingestion.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
python3 triple_ingest.py --memory-dir memory/ --signal-result '{"signals":[...]}' --tick-ts 2026-03-01T10:00:00Z
|
|
8
|
+
python3 triple_ingest.py --memory-dir memory/ --ingest-playbook
|
|
9
|
+
python3 triple_ingest.py --memory-dir memory/ --ingest-session '{"ts":"...","summary":"..."}'
|
|
10
|
+
python3 triple_ingest.py --memory-dir memory/ --ingest-mining '{"newPatterns":[...]}'
|
|
11
|
+
python3 triple_ingest.py --memory-dir memory/ --ingest-module react-native-dev --modules-dir modules/
|
|
12
|
+
python3 triple_ingest.py --memory-dir memory/ --retract-module react-native-dev
|
|
13
|
+
python3 triple_ingest.py --memory-dir memory/ --embed (add --embed to any mode to trigger embedding)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import os
|
|
19
|
+
import sys
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
# Ensure sinain-koog is on path for local imports
|
|
23
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
|
24
|
+
|
|
25
|
+
from triplestore import TripleStore
|
|
26
|
+
from triple_extractor import TripleExtractor
|
|
27
|
+
from common import output_json, read_effective_playbook, read_file_safe
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ── Privacy matrix helpers ────────────────────────────────────────────────────
|
|
31
|
+
|
|
32
|
+
def _privacy_level(data_type: str, dest: str) -> str:
    """Read PRIVACY_<DATA_TYPE>_<DEST> env var, default 'full'.

    Any value outside the known privacy levels is treated as 'full'.
    """
    env_key = f"PRIVACY_{data_type.upper()}_{dest.upper()}"
    level = os.environ.get(env_key, "full")
    return level if level in ("full", "redacted", "summary", "none") else "full"
|
40
|
+
|
|
41
|
+
def _should_ingest(data_type: str) -> bool:
    """Return True if the data type is allowed to reach the triple store."""
    # Only the explicit 'none' level blocks ingestion entirely.
    return _privacy_level(data_type, "TRIPLE_STORE") != "none"
|
46
|
+
|
|
47
|
+
def _db_path(memory_dir: str) -> str:
    """Path of the triple store SQLite file inside *memory_dir*."""
    return str(Path(memory_dir, "triplestore.db"))
|
50
|
+
|
|
51
|
+
def _assert_triples(store: TripleStore, tx_id: int, triples: list) -> int:
    """Assert all triples in a transaction. Returns count."""
    for triple in triples:
        store.assert_triple(
            tx_id,
            triple.entity_id,
            triple.attribute,
            triple.value,
            triple.value_type,
        )
    # All triples were asserted; the count is simply the list length.
    return len(triples)
|
59
|
+
|
|
60
|
+
def _run_embeddings(store: TripleStore, memory_dir: str) -> None:
    """Run Phase 2 embeddings on recent entities (best-effort).

    Embeds up to 50 entities touched by the latest transaction; any
    failure is reported on stderr and otherwise ignored.
    """
    try:
        from embedder import Embedder

        embedder = Embedder(_db_path(memory_dir))

        # Nothing to embed before the first transaction.
        latest = store.latest_tx()
        if latest == 0:
            return

        # Entities touched since the previous transaction.
        novelties = store.novelty(max(0, latest - 1))
        entity_ids = list({n["entity_id"] for n in novelties})
        if not entity_ids:
            return

        # Build embedding source text per entity (cap at 50 per batch).
        entity_texts = {
            eid: text
            for eid in entity_ids[:50]
            if (text := _build_embed_text(eid, store.entity(eid)))
        }

        if entity_texts:
            embedder.store_embeddings(entity_texts)
            print(f"[embed] Embedded {len(entity_texts)} entities", file=sys.stderr)
    except ImportError:
        print("[embed] embedder not available, skipping", file=sys.stderr)
    except Exception as e:
        print(f"[embed] Error: {e}", file=sys.stderr)
|
|
91
|
+
def _build_embed_text(entity_id: str, attrs: dict[str, list[str]]) -> str:
    """Build embedding source text from entity attributes.

    Templates per entity type (from design doc §5.3). Returns "" for
    unknown entity types or entities missing their primary attribute.
    """
    etype = entity_id.partition(":")[0] if ":" in entity_id else "unknown"

    def first(attr: str, default: str = "") -> str:
        # First value of a multi-valued attribute, or the default.
        return attrs.get(attr, [default])[0]

    if etype == "pattern":
        text = first("text")
        if not text:
            return ""
        concepts = ", ".join(attrs.get("related_to", []))
        return f"pattern: {text} (concepts: {concepts})"

    if etype == "concept":
        name = first("name")
        return f"concept: {name}" if name else ""

    if etype == "session":
        summary = first("summary")
        return f"session: {summary}" if summary else ""

    if etype == "signal":
        desc = first("description")
        if not desc:
            return ""
        priority = first("priority", "medium")
        return f"signal: {desc} (priority: {priority})"

    if etype == "guidance":
        text = first("text")
        return f"guidance: {text}" if text else ""

    if etype == "module":
        name = first("name")
        if not name:
            return ""
        description = first("description")
        return f"module: {name} — {description}"

    return ""
|
127
|
+
|
|
128
|
+
def cmd_signal(args: argparse.Namespace) -> None:
    """Ingest signal analysis result."""
    # Privacy gate: skip only when both audio and OCR data are blocked
    # from the triple store.
    if not (_should_ingest("AUDIO") or _should_ingest("OCR")):
        output_json({"ingested": 0, "source": "signal", "skipped": "privacy_gate"})
        return

    signal_data = json.loads(args.signal_result)
    store = TripleStore(_db_path(args.memory_dir))
    try:
        triples = TripleExtractor(store).extract_signal(signal_data, args.tick_ts)
        tx = store.begin_tx("signal_analyzer", metadata={"tick_ts": args.tick_ts})
        count = _assert_triples(store, tx, triples)
        if args.embed:
            _run_embeddings(store, args.memory_dir)
        output_json({
            "ingested": count,
            "entities": len({t.entity_id for t in triples}),
            "source": "signal",
            "txId": tx,
        })
    finally:
        store.close()
|
147
|
+
|
|
148
|
+
def cmd_playbook(args: argparse.Namespace) -> None:
    """Ingest the current playbook."""
    playbook = read_effective_playbook(args.memory_dir)
    if not playbook:
        output_json({"ingested": 0, "source": "playbook", "error": "empty"})
        return

    store = TripleStore(_db_path(args.memory_dir))
    try:
        triples = TripleExtractor(store).extract_playbook(playbook)
        tx = store.begin_tx("playbook_curator")
        count = _assert_triples(store, tx, triples)
        if args.embed:
            _run_embeddings(store, args.memory_dir)
        output_json({
            "ingested": count,
            "entities": len({t.entity_id for t in triples}),
            "source": "playbook",
            "txId": tx,
        })
    finally:
        store.close()
|
166
|
+
|
|
167
|
+
def cmd_session(args: argparse.Namespace) -> None:
    """Ingest a session summary."""
    session_data = json.loads(args.ingest_session)
    store = TripleStore(_db_path(args.memory_dir))
    try:
        triples = TripleExtractor(store).extract_session(session_data)
        tx = store.begin_tx("agent_end", metadata={"session": session_data.get("ts")})
        count = _assert_triples(store, tx, triples)
        if args.embed:
            _run_embeddings(store, args.memory_dir)
        output_json({
            "ingested": count,
            "entities": len({t.entity_id for t in triples}),
            "source": "session",
            "txId": tx,
        })
    finally:
        store.close()
|
182
|
+
|
|
183
|
+
def cmd_mining(args: argparse.Namespace) -> None:
    """Ingest memory mining results."""
    mining_data = json.loads(args.ingest_mining)
    store = TripleStore(_db_path(args.memory_dir))
    try:
        triples = TripleExtractor(store).extract_mining(mining_data)
        tx = store.begin_tx("memory_miner")
        count = _assert_triples(store, tx, triples)
        if args.embed:
            _run_embeddings(store, args.memory_dir)
        output_json({
            "ingested": count,
            "entities": len({t.entity_id for t in triples}),
            "source": "mining",
            "txId": tx,
        })
    finally:
        store.close()
|
198
|
+
|
|
199
|
+
def cmd_module(args: argparse.Namespace) -> None:
    """Ingest a module's patterns and guidance into the triple store."""
    module_id = args.ingest_module
    module_dir = Path(args.modules_dir) / module_id
    manifest_path = module_dir / "manifest.json"

    if not manifest_path.exists():
        output_json({"ingested": 0, "source": "module", "error": f"manifest not found: {manifest_path}"})
        return

    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    patterns_text = read_file_safe(str(module_dir / "patterns.md"))
    guidance_text = read_file_safe(str(module_dir / "guidance.md"))

    store = TripleStore(_db_path(args.memory_dir))
    try:
        triples = TripleExtractor(store).extract_module(
            module_id, manifest, patterns_text, guidance_text
        )
        tx = store.begin_tx("module_ingest", metadata={"module_id": module_id})
        count = _assert_triples(store, tx, triples)
        if args.embed:
            _run_embeddings(store, args.memory_dir)
        output_json({
            "ingested": count,
            "entities": len({t.entity_id for t in triples}),
            "source": "module",
            "module": module_id,
            "txId": tx,
        })
    finally:
        store.close()
|
227
|
+
|
|
228
|
+
def cmd_retract_module(args: argparse.Namespace) -> None:
    """Retract a module's triples from the store."""
    module_id = args.retract_module
    entity_id = f"module:{module_id}"
    store = TripleStore(_db_path(args.memory_dir))
    try:
        tx = store.begin_tx("module_retract", metadata={"module_id": module_id})
        count = 0

        # Retract every attribute on the module entity itself.
        for attr in store.entity(entity_id):
            count += store.retract_triple(tx, entity_id, attr)

        # Then retract every entity that belongs_to this module.
        for child_eid, _ in store.backrefs(entity_id, attribute="belongs_to"):
            for attr in store.entity(child_eid):
                count += store.retract_triple(tx, child_eid, attr)

        output_json({"retracted": count, "source": "module", "module": module_id, "txId": tx})
    finally:
        store.close()
|
|
250
|
+
|
|
251
|
+
def main() -> None:
    """Parse CLI arguments and dispatch to the matching ingestion command."""
    parser = argparse.ArgumentParser(description="Triple Store Ingestion CLI")
    parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
    parser.add_argument("--embed", action="store_true", help="Trigger embedding after ingestion")

    # Exactly one ingestion mode must be chosen.
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument("--signal-result", help="JSON: signal analysis result")
    mode.add_argument("--ingest-playbook", action="store_true", help="Ingest current playbook")
    mode.add_argument("--ingest-session", help="JSON: session summary")
    mode.add_argument("--ingest-mining", help="JSON: mining results")
    mode.add_argument("--ingest-module", help="Module ID to ingest")
    mode.add_argument("--retract-module", help="Module ID to retract")

    # Arguments required only by specific modes (validated below).
    parser.add_argument("--tick-ts", help="Tick timestamp (required with --signal-result)")
    parser.add_argument("--modules-dir", help="Path to modules/ directory (required with --ingest-module)")

    args = parser.parse_args()

    if args.signal_result:
        if not args.tick_ts:
            parser.error("--tick-ts required with --signal-result")
        cmd_signal(args)
    elif args.ingest_playbook:
        cmd_playbook(args)
    elif args.ingest_session:
        cmd_session(args)
    elif args.ingest_mining:
        cmd_mining(args)
    elif args.ingest_module:
        if not args.modules_dir:
            parser.error("--modules-dir required with --ingest-module")
        cmd_module(args)
    elif args.retract_module:
        cmd_retract_module(args)
|
289
|
+
# Script entry point: delegate to the CLI driver.
if __name__ == "__main__":
    main()
|