lorekeep 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lorekeep/__init__.py +3 -0
- lorekeep/cli.py +229 -0
- lorekeep/compile/__init__.py +0 -0
- lorekeep/compile/extract.py +150 -0
- lorekeep/compile/ingest.py +55 -0
- lorekeep/compile/providers.py +49 -0
- lorekeep/compile/resolve.py +111 -0
- lorekeep/compile/writer.py +63 -0
- lorekeep/config.py +39 -0
- lorekeep/defaults.py +44 -0
- lorekeep/eval/__init__.py +0 -0
- lorekeep/eval/construction.py +97 -0
- lorekeep/eval/gold.py +31 -0
- lorekeep/eval/retrieval.py +46 -0
- lorekeep/facts_io.py +22 -0
- lorekeep/integrations/__init__.py +0 -0
- lorekeep/integrations/claude_code.py +19 -0
- lorekeep/integrations/codex.py +56 -0
- lorekeep/integrations/common.py +23 -0
- lorekeep/integrations/cursor.py +21 -0
- lorekeep/mcp_server.py +120 -0
- lorekeep/models.py +130 -0
- lorekeep/paths.py +58 -0
- lorekeep/perm/__init__.py +0 -0
- lorekeep/perm/ns.py +113 -0
- lorekeep/pipeline.py +67 -0
- lorekeep/schema_io.py +12 -0
- lorekeep/store/__init__.py +0 -0
- lorekeep/store/fts.py +54 -0
- lorekeep/store/graph.py +137 -0
- lorekeep-0.1.0.dist-info/METADATA +246 -0
- lorekeep-0.1.0.dist-info/RECORD +35 -0
- lorekeep-0.1.0.dist-info/WHEEL +4 -0
- lorekeep-0.1.0.dist-info/entry_points.txt +2 -0
- lorekeep-0.1.0.dist-info/licenses/LICENSE +21 -0
lorekeep/__init__.py
ADDED
lorekeep/cli.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""Lorekeep CLI."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
|
|
10
|
+
from lorekeep import __version__
|
|
11
|
+
from lorekeep.compile.providers import FakeProvider, LiteLLMProvider
|
|
12
|
+
from lorekeep.config import load_config
|
|
13
|
+
from lorekeep.pipeline import compile_graph
|
|
14
|
+
from lorekeep.paths import resolve_paths
|
|
15
|
+
from lorekeep.defaults import DEFAULT_CONFIG_YAML, DEFAULT_SCHEMA
|
|
16
|
+
from lorekeep.schema_io import load_schema
|
|
17
|
+
|
|
18
|
+
app = typer.Typer(help="Lorekeep — compile team docs into a temporal knowledge graph.")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Empty callback forces multi-command mode so subcommands are not auto-promoted.
|
|
22
|
+
@app.callback()
def _main() -> None:
    """Lorekeep — compile team docs into a temporal knowledge graph."""
    # Deliberately a no-op: registering a callback is what keeps Typer in
    # multi-command mode, so a single subcommand is never promoted to the root.
    return None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@app.command()
def version() -> None:
    """Print the Lorekeep version."""
    # __version__ is re-exported by the package's top-level __init__.
    banner = f"lorekeep {__version__}"
    typer.echo(banner)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@app.command()
def compile() -> None:
    """Compile raw/ into graph/facts.jsonl."""
    p = resolve_paths()
    schema = load_schema(p["schema"])
    config = load_config(p["config"])

    use_fake = os.environ.get("LOREKEEP_PROVIDER") == "fake"
    if use_fake:
        # Offline mode: a single canned extraction, no network access.
        fake_nodes = [
            {"id": "svc:payments-api", "type": "service", "name": "payments-api",
             "props": {"lang": "go"}, "valid_from": "2024-01-15"},
            {"id": "svc:auth", "type": "service", "name": "auth"},
            {"id": "team:backend", "type": "team", "name": "team-backend"},
            {"id": "dec:adr-007", "type": "decision",
             "props": {"title": "payments-api adopts internal signing"}},
        ]
        fake_edges = [
            {"type": "depends_on", "from": "svc:payments-api", "to": "svc:auth",
             "valid_from": "2024-01-15", "valid_to": "2025-03-01"},
            {"type": "decided_by", "from": "dec:adr-007", "to": "team:backend"},
        ]
        canned = json.dumps({"nodes": fake_nodes, "edges": fake_edges, "aliases": {}})
        provider = FakeProvider(responses=[canned])
    else:
        # Key lookup order: the env var named in config first, then the
        # inline config value as a fallback.
        env_name = config.provider.api_key_env
        api_key = os.environ.get(env_name) if env_name else None
        if not api_key:
            api_key = config.provider.api_key
        # Warn only when the key that will be used is the inline one.
        if api_key is config.provider.api_key and api_key:
            typer.echo(
                "warning: using inline api_key from config.yaml — prefer api_key_env "
                "(env var). config.yaml is gitignored but env is safer."
            )
        provider = LiteLLMProvider(
            model=config.provider.model,
            api_base=config.provider.api_base,
            temperature=config.provider.temperature,
            api_key=api_key,
        )

    manifest = compile_graph(
        raw_root=p["raw"],
        out_dir=p["out"],
        schema=schema,
        provider=provider,
        cache_path=p["cache"],
        chunk_lines=config.compile.chunk_lines,
    )
    summary = (
        f"compiled: {manifest.node_count} nodes, {manifest.edge_count} edges, "
        f"run_id={manifest.run_id}, facts_hash={manifest.facts_hash}"
    )
    typer.echo(summary)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@app.command(name="eval")
def eval_cmd() -> None:
    """Run Tier-1 construction-quality evaluation vs the gold corpus."""
    p = resolve_paths()
    gold_dir = Path(os.environ.get("LOREKEEP_GOLD", "tests/fixtures/gold"))
    # Imported lazily so the other commands do not pay for the eval module.
    from lorekeep.eval.construction import extraction_report, structure_report

    extraction = extraction_report(p["out"], gold_dir)
    structure = structure_report(p["out"])
    report = {"extraction": extraction, "structure": structure}

    # The same rendering is written to disk and echoed to stdout.
    rendered = json.dumps(report, indent=2, sort_keys=True)
    default_results = ".lorekeep/eval/results.json"
    results_path = Path(os.environ.get("LOREKEEP_EVAL_RESULTS", default_results))
    results_path.parent.mkdir(parents=True, exist_ok=True)
    results_path.write_text(rendered)
    typer.echo(rendered)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@app.command()
def check() -> None:
    """Validate the compiled graph: loads, no dangling edges."""
    p = resolve_paths()
    from lorekeep.eval.construction import structure_report

    struct = structure_report(p["out"])
    dangling = struct["dangling_edge_rate"]
    if not (dangling > 0):
        # Healthy graph: report its size and finish with exit code 0.
        nodes, edges = struct["node_count"], struct["edge_count"]
        typer.echo(f"check: ok — {nodes} nodes, {edges} edges, 0 dangling")
        return
    typer.echo(f"check: FAIL — {dangling} dangling edges")
    raise typer.Exit(code=1)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@app.command()
def serve(
    transport: str = typer.Option("stdio", "--transport", help="stdio (default) | http"),
) -> None:
    """Serve the scoped graph over MCP."""
    p = resolve_paths()
    raw_ns = os.environ.get("LOREKEEP_NS")
    if not raw_ns:
        # No env override: use the namespaces configured as default.
        allowed = load_config(p["config"]).ns.default
    else:
        # Comma-separated override; blank entries are dropped.
        trimmed = (piece.strip() for piece in raw_ns.split(","))
        allowed = [piece for piece in trimmed if piece]
    from lorekeep.mcp_server import configure, mcp

    configure(graph_dir=p["out"], allowed_ns=allowed, schema_path=p["schema"])
    mcp.run(transport=transport)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
mcp_app = typer.Typer(help="Coding-agent integration.")
|
|
131
|
+
app.add_typer(mcp_app, name="mcp")
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@mcp_app.command("add")
def mcp_add(
    agent: str = typer.Option(..., "--agent", help="claude | cursor | codex"),
    scope: str = typer.Option("project", "--scope", help="project | user"),
    ns: str = typer.Option(None, "--ns", help="namespace to scope the agent to"),
) -> None:
    """Write the agent's MCP config + print an agent-memory snippet."""
    from lorekeep.integrations import claude_code, codex, cursor
    from lorekeep.integrations.common import agent_memory_snippet, resolve_command

    # Validate both options before touching config or the filesystem, so a
    # typo is reported as such instead of surfacing as an unrelated error.
    writers = {"claude": claude_code, "cursor": cursor, "codex": codex}
    if agent not in writers:
        typer.echo(f"unknown agent: {agent} (choose claude|cursor|codex)")
        raise typer.Exit(code=1)
    # Previously any value other than "project" (including typos) silently
    # wrote the config into the user's home directory.
    target_for_scope = {"project": Path.cwd, "user": Path.home}
    if scope not in target_for_scope:
        typer.echo(f"unknown scope: {scope} (choose project|user)")
        raise typer.Exit(code=1)

    p = resolve_paths()
    config = load_config(p["config"])
    command, args = resolve_command(config.install_source)

    target = target_for_scope[scope]()
    written = writers[agent].write_config(target, command, args, ns)
    typer.echo(f"wrote {agent} config -> {written}")
    typer.echo("\n" + agent_memory_snippet())
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@app.command()
def doctor() -> None:
    """Verify install: graph loads, schema valid, ns resolves, a tool responds."""
    p = resolve_paths()
    problems = []

    # A missing or unloadable graph is fatal: nothing else can be checked.
    facts_path = p["out"] / "facts.jsonl"
    if not facts_path.exists():
        typer.echo(f"FAIL: facts.jsonl not found at {facts_path}")
        raise typer.Exit(code=1)

    try:
        from lorekeep.store.graph import GraphStore
        store = GraphStore.from_jsonl(facts_path)
    except Exception as exc:
        typer.echo(f"FAIL: cannot load graph: {exc}")
        raise typer.Exit(code=1) from exc

    # Schema problems are collected rather than fatal so every issue is
    # reported in a single run.
    if not p["schema"].exists():
        problems.append("schema.json missing")
    else:
        try:
            load_schema(p["schema"])
        except Exception as exc:
            problems.append(f"schema invalid: {exc}")

    # Resolve namespaces the same way `serve` does: blank entries from a
    # stray or trailing comma in LOREKEEP_NS are dropped. A broken config is
    # recorded as a problem instead of escaping as a traceback.
    raw_ns = os.environ.get("LOREKEEP_NS")
    allowed = None
    ns_resolved = False
    try:
        if raw_ns:
            allowed = [x.strip() for x in raw_ns.split(",") if x.strip()]
        else:
            allowed = load_config(p["config"]).ns.default
        ns_resolved = True
    except Exception as exc:
        problems.append(f"ns resolve failed: {exc}")

    ns = []
    if ns_resolved:
        try:
            from lorekeep.mcp_server import configure, list_namespaces
            configure(graph_dir=p["out"], allowed_ns=allowed, schema_path=p["schema"])
            ns = list_namespaces()
        except Exception as exc:
            problems.append(f"mcp configure/tool failed: {exc}")
            ns = []

    if problems:
        typer.echo("FAIL: " + "; ".join(problems))
        raise typer.Exit(code=1)

    typer.echo(
        f"all checks passed: {len(store.node_ids())} nodes, "
        f"{len(store.all_edges())} edges, namespaces={ns}"
    )
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
@app.command()
def init() -> None:
    """Bootstrap the data home: config + schema + raw/graph dirs."""
    p = resolve_paths()
    created = []

    # Default files are rendered lazily and written only when absent, so an
    # existing config or schema is never overwritten.
    defaults = (
        ("config", lambda: DEFAULT_CONFIG_YAML),
        ("schema", lambda: json.dumps(DEFAULT_SCHEMA, indent=2)),
    )
    for key, render in defaults:
        target = p[key]
        target.parent.mkdir(parents=True, exist_ok=True)
        if target.exists():
            continue
        target.write_text(render())
        created.append(str(target))

    for key in ("raw", "out"):
        p[key].mkdir(parents=True, exist_ok=True)

    typer.echo(f"home ready: config={p['config']}")
    typer.echo(f" schema={p['schema']} raw={p['raw']} graph={p['out']}")
    if not created:
        typer.echo(" (existing config/schema preserved)")
    else:
        typer.echo(f" wrote defaults: {created}")
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
if __name__ == "__main__":
|
|
229
|
+
app()
|
|
File without changes
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Extract: turn a DocChunk into candidate facts via an LLM. Pure helpers first."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
from datetime import date
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from lorekeep.models import DocChunk, Edge, Node, Schema
|
|
10
|
+
|
|
11
|
+
SYSTEM_PROMPT = (
|
|
12
|
+
"You are a knowledge-graph extractor. Read the document chunk and emit a JSON "
|
|
13
|
+
'object {"nodes":[...], "edges":[...], "aliases":{...}}. '
|
|
14
|
+
"Only use node_types and edge_types listed in the provided schema. "
|
|
15
|
+
"For every node give id (stable slug prefixed by type, e.g. svc:payments-api), "
|
|
16
|
+
"type, name, optional props, optional valid_from/valid_to (ISO dates, null = unknown). "
|
|
17
|
+
"For every edge give type, from (node id), to (node id), optional valid_from/valid_to. "
|
|
18
|
+
"aliases maps a canonical name to surface variants. Emit NO text outside the JSON."
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def build_prompt(chunk: DocChunk, schema: Schema) -> str:
    """Render the user prompt for one chunk.

    The prompt lists the schema's node type names and edge types (each shown
    as ``name(from->to)``), followed by the chunk's source, namespace and text.
    """
    allowed_nodes = ", ".join(schema.node_types.keys())
    edge_signatures = []
    for edge_name, spec in schema.edge_types.items():
        edge_signatures.append(f"{edge_name}({spec.from_}->{spec.to})")
    allowed_edges = ", ".join(edge_signatures)
    sections = [
        f"Allowed node_types: {allowed_nodes}",
        f"Allowed edge_types: {allowed_edges}",
        "",
        f"Source: {chunk.src}",
        f"Namespace: {chunk.namespace}",
        "",
        "Document chunk:",
        f"{chunk.text}",
    ]
    return "\n".join(sections) + "\n"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _parse_date(v: Any) -> date | None:
    """Convert an extracted ``valid_from``/``valid_to`` value to a ``date``.

    Falsy values (``None``, ``""``) mean "unknown" and yield ``None``.
    Besides plain ISO dates this also accepts surrounding whitespace and full
    ISO timestamps (the time part is discarded), because models frequently
    emit those even when asked for a date.

    Raises:
        TypeError: if ``v`` is neither a string nor a date object.
        ValueError: if the string is not a recognisable ISO date/timestamp.
    """
    from datetime import datetime

    if not v:
        return None
    # datetime is a subclass of date, so it must be tested first.
    if isinstance(v, datetime):
        return v.date()
    if isinstance(v, date):
        return v
    if not isinstance(v, str):
        raise TypeError(f"expected an ISO date string, got {type(v).__name__}")
    text = v.strip()
    try:
        return date.fromisoformat(text)
    except ValueError:
        pass
    try:
        return datetime.fromisoformat(text).date()
    except ValueError:
        raise ValueError(f"invalid ISO date: {v!r}") from None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _extract_json(raw: str, chunk: DocChunk) -> str:
    """Best-effort recover a JSON object from LLM output.

    response_format=json_object usually yields clean JSON, but some models
    wrap output in ```json fences, prepend prose, or append trailing text.
    Parsing starts at the first ``{`` and stops where that object ends, so
    anything before or after it is ignored, including trailing text that
    itself contains braces.

    Returns:
        The exact substring of ``raw`` holding the JSON object.

    Raises:
        ValueError: naming ``chunk.src`` when no JSON object can be decoded,
            so the pipeline reports which chunk failed.
    """
    text = raw.strip()
    start = text.find("{")
    if start == -1:
        raise ValueError(f"LLM returned non-JSON for {chunk.src}")
    candidate = text[start:]
    try:
        # raw_decode parses one value and reports where it ended, unlike
        # json.loads, which rejects any trailing characters.
        _, consumed = json.JSONDecoder().raw_decode(candidate)
    except json.JSONDecodeError as exc:
        raise ValueError(f"LLM returned non-JSON for {chunk.src}") from exc
    return candidate[:consumed]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def parse_response(
    raw: str, chunk: DocChunk, schema: Schema | None = None,
) -> tuple[list[Node], list[Edge], dict[str, list[str]]]:
    """Turn one raw LLM reply into nodes, edges and name aliases.

    Every fact is stamped with the chunk's namespace and source. When a
    schema is given, nodes and edges whose type it rejects are dropped.

    A key that is present with an explicit ``null`` value (``nodes``,
    ``edges``, ``aliases``, a node's ``props``) is treated the same as a
    missing key, and an alias given as a single string becomes a one-element
    list instead of being split into characters.

    Raises:
        ValueError: if no JSON object can be recovered from ``raw``.
        KeyError: if a node lacks ``id`` or an edge lacks ``from``/``to``.
    """
    data = json.loads(_extract_json(raw, chunk))

    nodes: list[Node] = []
    for n in data.get("nodes") or []:
        ntype = n.get("type")
        if schema is not None and not schema.is_valid_node_type(ntype):
            continue
        props = dict(n.get("props") or {})
        # Keep the display name alongside the other props, without
        # overriding a name the model already put there.
        if "name" in n and "name" not in props:
            props["name"] = n["name"]
        nodes.append(Node(
            id=n["id"],
            type=ntype,
            ns=(chunk.namespace,),
            valid_from=_parse_date(n.get("valid_from")),
            valid_to=_parse_date(n.get("valid_to")),
            props=props,
            src=(chunk.src,),
        ))

    edges: list[Edge] = []
    for e in data.get("edges") or []:
        etype = e.get("type")
        if schema is not None and not schema.is_valid_edge_type(etype):
            continue
        edges.append(Edge(
            id="",  # assigned deterministically in resolve
            type=etype,
            # "from" is a Python keyword, so it is passed via unpacking.
            **{"from": e["from"]},
            to=e["to"],
            ns=(chunk.namespace,),
            valid_from=_parse_date(e.get("valid_from")),
            valid_to=_parse_date(e.get("valid_to")),
            src=(chunk.src,),
        ))

    aliases: dict[str, list[str]] = {}
    for canonical, variants in (data.get("aliases") or {}).items():
        if isinstance(variants, str):
            aliases[canonical] = [variants]
        else:
            aliases[canonical] = list(variants or ())
    return nodes, edges, aliases
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
import hashlib
|
|
108
|
+
from pathlib import Path
|
|
109
|
+
|
|
110
|
+
from lorekeep.compile.providers import LLMProvider
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class ExtractionCache:
    """Maps (chunk_hash, schema_version) -> raw LLM response. Local only.

    The cache is one JSON object on disk, loaded eagerly in the constructor
    and written back only when ``save`` is called.
    """

    def __init__(self, path: Path) -> None:
        """Load the cache at ``path`` if the file exists, else start empty."""
        self.path = Path(path)
        self._data: dict[str, str] = {}
        if self.path.exists():
            self._data = json.loads(self.path.read_text(encoding="utf-8"))

    def key(self, chunk: DocChunk, schema_version: int) -> str:
        """Return the SHA-256 hex digest of the schema version and chunk hash."""
        h = hashlib.sha256()
        h.update(str(schema_version).encode("utf-8"))
        h.update(b"\n")  # separator between the two components
        h.update(chunk.hash.encode("utf-8"))
        return h.hexdigest()

    def get(self, key: str) -> str | None:
        """Return the cached raw response for ``key``, or None on a miss."""
        return self._data.get(key)

    def set(self, key: str, raw: str) -> None:
        """Remember ``raw`` under ``key`` in memory; call ``save`` to persist."""
        self._data[key] = raw

    def save(self) -> None:
        """Persist the cache atomically.

        The JSON is staged in a sibling temp file and moved into place with
        os.replace. An interrupted write therefore cannot leave a truncated
        cache file, which would make the next constructor call fail to parse.
        """
        import os
        import tempfile

        self.path.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(self._data, sort_keys=True, indent=2)
        fd, tmp = tempfile.mkstemp(
            dir=self.path.parent, prefix=self.path.name + ".", suffix=".tmp"
        )
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as fh:
                fh.write(payload)
            os.replace(tmp, self.path)
        except BaseException:
            # Never leave the staging file behind, even on KeyboardInterrupt.
            try:
                os.unlink(tmp)
            except OSError:
                pass
            raise
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def extract_chunk(
    chunk: DocChunk, schema: Schema, provider: LLMProvider, cache: ExtractionCache,
) -> tuple[list[Node], list[Edge], dict[str, list[str]]]:
    """Extract facts from one chunk, calling the provider only on a cache miss.

    The raw response is stored in the in-memory cache before it is parsed;
    this function does not save the cache to disk.
    """
    cache_key = cache.key(chunk, schema.version)
    cached = cache.get(cache_key)
    if cached is not None:
        return parse_response(cached, chunk, schema)
    fresh = provider.extract_json(SYSTEM_PROMPT, build_prompt(chunk, schema))
    cache.set(cache_key, fresh)
    return parse_response(fresh, chunk, schema)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Ingest: raw markdown files -> DocChunks with provenance."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from lorekeep.models import DocChunk
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def namespace_for(raw_root: Path, path: Path) -> str:
    """Return the namespace of ``path``: its first directory under ``raw_root``.

    Files sitting directly in ``raw_root`` belong to the "public" namespace.
    """
    segments = path.relative_to(raw_root).parts
    # One segment means a bare file name, i.e. no enclosing directory.
    return segments[0] if len(segments) > 1 else "public"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def ingest_file(raw_root: Path, path: Path, chunk_lines: int) -> list[DocChunk]:
    """Split one file into consecutive chunks of at most ``chunk_lines`` lines.

    Blocks containing only blank lines are skipped. Line numbers on the
    resulting chunks are 1-based and inclusive.

    Raises:
        ValueError: if ``chunk_lines`` is less than 1. A negative value used
            to produce no chunks at all, silently dropping the file.
    """
    if chunk_lines < 1:
        raise ValueError(f"chunk_lines must be >= 1, got {chunk_lines}")
    ns = namespace_for(raw_root, path)
    rel = str(path.relative_to(raw_root))
    lines = path.read_text(encoding="utf-8").splitlines()
    chunks: list[DocChunk] = []
    for start in range(0, len(lines), chunk_lines):
        block = lines[start:start + chunk_lines]
        if not any(line.strip() for line in block):
            continue
        chunks.append(DocChunk(
            path=rel,
            start_line=start + 1,
            end_line=start + len(block),
            text="\n".join(block),
            namespace=ns,
        ))
    return chunks
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def ingest(raw_root: Path, glob: str = "**/*.md", chunk_lines: int = 60) -> list[DocChunk]:
    """Collect DocChunks for every file under raw_root matching ``glob``.

    Everything under raw/ is sent to the LLM provider at compile, so a path
    whose resolved target lies outside raw_root (e.g. a planted symlink
    raw/x/leak.md -> ~/.ssh/id_rsa) is skipped with a stderr warning rather
    than read — fail closed.
    """
    root = raw_root.resolve()
    collected: list[DocChunk] = []
    candidates = sorted(raw_root.glob(glob))
    for candidate in candidates:
        if not candidate.is_file():
            continue
        if candidate.resolve().is_relative_to(root):
            collected.extend(ingest_file(raw_root, candidate, chunk_lines))
            continue
        print(
            f"lorekeep: skip path outside raw_root (possible symlink): {candidate}",
            file=sys.stderr,
        )
    return collected
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""LLM provider abstraction. litellm is the only hard dependency on a vendor."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Protocol, runtime_checkable
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@runtime_checkable
class LLMProvider(Protocol):
    """Structural interface for objects that can answer an extraction prompt."""

    def extract_json(self, system: str, user: str) -> str:
        """Return the model's raw reply to the system and user prompts."""
        ...
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class FakeProvider:
    """Test double that replays canned responses in order, with no network."""

    def __init__(self, responses: list[str]) -> None:
        # Copy, so consuming responses never mutates the caller's list.
        self._responses = [*responses]
        self.calls: list[tuple[str, str]] = []

    def extract_json(self, system: str, user: str) -> str:
        """Record the call, then return the next canned response.

        Raises:
            RuntimeError: once every canned response has been consumed.
        """
        self.calls.append((system, user))
        if self._responses:
            return self._responses.pop(0)
        raise RuntimeError("FakeProvider: no canned response left")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class LiteLLMProvider:
    """Real provider backed by litellm. Supports openai/anthropic/ollama."""

    def __init__(self, model: str, api_base: str | None = None,
                 temperature: float = 0.0, api_key: str | None = None) -> None:
        """Store the connection settings; no request is made here."""
        self.model = model
        self.api_base = api_base
        self.temperature = temperature
        self.api_key = api_key

    def extract_json(self, system: str, user: str) -> str:
        """Send one JSON-mode completion request and return the reply text.

        Raises:
            RuntimeError: if the reply carries no text content. Returning
                None here would break the ``-> str`` contract and fail later
                in parsing with an unrelated error.
        """
        import litellm  # imported lazily so tests need not install it

        resp = litellm.completion(
            model=self.model,
            api_base=self.api_base,
            api_key=self.api_key,
            temperature=self.temperature,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            response_format={"type": "json_object"},
        )
        content = resp.choices[0].message.content
        if content is None:
            raise RuntimeError(
                f"LiteLLMProvider: model {self.model!r} returned no content"
            )
        return content
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Resolve: dedup entities, validate edges, enforce ns, quarantine bad facts.
|
|
2
|
+
|
|
3
|
+
Extraction may emit the same entity under several ids (aliases). This stage
|
|
4
|
+
collapses them onto one canonical id, rewrites edge endpoints, drops edges whose
|
|
5
|
+
endpoints disappeared, and quarantines malformed facts for review.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
|
|
11
|
+
from lorekeep.models import Edge, Node
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class ResolveResult:
    """Everything the resolve stage produces for one compile run."""

    # Surviving nodes after alias collapsing.
    nodes: list[Node] = field(default_factory=list)
    # Surviving edges, with endpoints rewritten to canonical ids.
    edges: list[Edge] = field(default_factory=list)
    # alias_id -> canonical_id
    aliases: dict[str, str] = field(default_factory=dict)
    # (serialized fact, reason) pairs for facts that were rejected.
    quarantined: list[tuple[dict, str]] = field(default_factory=list)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _build_alias_map(
    nodes: list[Node],
    name_aliases: dict[str, list[str]] | None,
    explicit_map: dict[str, str] | None,
) -> dict[str, str]:
    """Return alias_id -> canonical_id.

    Nodes are grouped by ``props["name"]``. A node belongs to an alias group
    when its name is the group's canonical name or one of its variants. The
    first node id seen for a group becomes canonical and every later member
    maps to it. Entries in ``explicit_map`` are applied last, so they win.

    Matching the canonical name itself matters: without it, a node named
    exactly like the group key was never merged with its variants.
    """
    alias_map: dict[str, str] = {}
    # 1) by name: group nodes whose props.name belongs to an alias group
    if name_aliases:
        first_id_for_group: dict[str, str] = {}
        for nd in nodes:
            nm = nd.props.get("name")
            if not nm:
                continue
            for canonical_name, variants in name_aliases.items():
                if nm != canonical_name and nm not in variants:
                    continue
                canon = first_id_for_group.setdefault(canonical_name, nd.id)
                if nd.id != canon:
                    alias_map[nd.id] = canon
    # 2) explicit id->id overrides win
    if explicit_map:
        alias_map.update(explicit_map)
    return alias_map
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _canonical(node_id: str, alias_map: dict[str, str]) -> str:
    """Follow alias links from ``node_id`` until an id with no alias is reached.

    A cycle in the map stops the walk at the first id visited twice, so the
    function always terminates.
    """
    visited: set[str] = set()
    current = node_id
    while True:
        if current in visited or current not in alias_map:
            return current
        visited.add(current)
        current = alias_map[current]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def resolve(
    nodes: list[Node],
    edges: list[Edge],
    name_aliases: dict[str, list[str]] | None = None,
    aliases_map: dict[str, str] | None = None,
) -> ResolveResult:
    """Collapse aliased nodes, rewrite edge endpoints and quarantine bad edges.

    Nodes that share a canonical id are merged: props are overlaid (later
    wins), and src and ns are unioned in first-seen order. Edges are
    quarantined when an endpoint is unknown after rewriting or when both
    endpoints collapse onto the same node. Surviving edges are numbered in
    input order.
    """
    alias_map = _build_alias_map(nodes, name_aliases, aliases_map)

    # canonical id -> merged node, in first-seen order
    merged: dict[str, Node] = {}
    for node in nodes:
        key = _canonical(node.id, alias_map)
        if key not in merged:
            # Store the node under its canonical id so the node's identity
            # and its dict key can never diverge (this covers an explicit
            # mapping onto an id no node carries).
            merged[key] = node if node.id == key else node.model_copy(update={"id": key})
            continue
        existing = merged[key]
        combined_props = {**existing.props, **node.props}
        combined_src = tuple(dict.fromkeys(existing.src + node.src))
        combined_ns = tuple(dict.fromkeys(existing.ns + node.ns))
        merged[key] = existing.model_copy(
            update={"props": combined_props, "src": combined_src, "ns": combined_ns}
        )

    known_ids = set(merged)

    kept: list[Edge] = []
    rejected: list[tuple[dict, str]] = []
    for edge in edges:
        source_id = _canonical(edge.from_, alias_map)
        target_id = _canonical(edge.to, alias_map)
        if not (source_id in known_ids and target_id in known_ids):
            reason = f"dangling endpoint ({source_id}->{target_id})"
        elif source_id == target_id:
            reason = "self-loop"
        else:
            reason = None
        if reason is not None:
            rejected.append((edge.model_dump(mode="json", by_alias=True), reason))
            continue
        # Sequence numbers count surviving edges only.
        sequence = len(kept) + 1
        kept.append(edge.model_copy(update={
            "id": f"e_{edge.type}_{sequence:04d}",
            "from_": source_id,
            "to": target_id,
        }))

    return ResolveResult(
        nodes=list(merged.values()),
        edges=kept,
        aliases=alias_map,
        quarantined=rejected,
    )
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Writer: emit deterministic facts.jsonl + manifest.json.
|
|
2
|
+
|
|
3
|
+
Determinism = facts sorted by (kind, type, id), JSON keys sorted, stable
|
|
4
|
+
separators. Re-compiling unchanged input yields byte-identical output.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import hashlib
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
import tempfile
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from lorekeep.models import DocChunk, Edge, Manifest, Node
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _sort_key(fact: Node | Edge) -> tuple[str, str, str]:
    """Return the ordering key for a fact: its kind, then type, then id."""
    kind, type_name, ident = fact.kind, fact.type, fact.id
    return kind, type_name, ident
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _atomic_write(path: Path, data: str) -> None:
    """Replace ``path`` with ``data`` so readers never see a partial file.

    The text is staged in a temp file beside the target and moved into place
    with os.replace, which is atomic when both live on one filesystem. This
    matters because the MCP server may lazy-reload facts.jsonl while a
    compile is rewriting it. The staging file is removed if anything fails.
    """
    directory = path.parent
    directory.mkdir(parents=True, exist_ok=True)
    handle, staging = tempfile.mkstemp(
        suffix=".tmp", prefix=path.name + ".", dir=directory
    )
    try:
        with os.fdopen(handle, "w", encoding="utf-8") as out:
            out.write(data)
        os.replace(staging, path)
    except BaseException:
        # Best-effort cleanup; the original error is what gets re-raised.
        try:
            os.unlink(staging)
        except OSError:
            pass
        raise
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def write_graph(
    out_dir: Path, nodes: list[Node], edges: list[Edge], manifest: Manifest,
) -> None:
    """Write facts.jsonl and manifest.json into ``out_dir``.

    Facts are emitted one JSON line each, sorted by (kind, type, id). An
    empty graph produces an empty facts file.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    ordered = sorted(nodes + edges, key=_sort_key)
    # Every line, including the last, ends with a newline.
    body = "".join(fact.to_json_line() + "\n" for fact in ordered)
    _atomic_write(out_dir / "facts.jsonl", body)
    _atomic_write(out_dir / "manifest.json", manifest.to_json())
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def run_id(chunks: list[DocChunk], schema_version: int) -> str:
    """Return a 16-hex-char id derived from the schema version and chunk hashes.

    Chunks are hashed in (path, start_line) order, so the id does not depend
    on the order in which they are passed.
    """
    digest = hashlib.sha256(str(schema_version).encode("utf-8"))
    ordered = sorted(chunks, key=lambda c: (c.path, c.start_line))
    for chunk in ordered:
        digest.update(chunk.hash.encode("utf-8"))
    return digest.hexdigest()[:16]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def facts_hash(out_dir: Path) -> str:
    """Return the first 16 hex chars of the SHA-256 of out_dir/facts.jsonl."""
    facts_file = out_dir / "facts.jsonl"
    digest = hashlib.sha256(facts_file.read_bytes())
    return digest.hexdigest()[:16]
|