lorekeep 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lorekeep/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """Lorekeep — temporal knowledge graph for AI agents via MCP."""
2
+
3
# Package version string.
__version__ = "0.1.0"
lorekeep/cli.py ADDED
@@ -0,0 +1,229 @@
1
+ """Lorekeep CLI."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+
8
+ import typer
9
+
10
+ from lorekeep import __version__
11
+ from lorekeep.compile.providers import FakeProvider, LiteLLMProvider
12
+ from lorekeep.config import load_config
13
+ from lorekeep.pipeline import compile_graph
14
+ from lorekeep.paths import resolve_paths
15
+ from lorekeep.defaults import DEFAULT_CONFIG_YAML, DEFAULT_SCHEMA
16
+ from lorekeep.schema_io import load_schema
17
+
18
# Root Typer application; the command functions in this module register on it
# through the @app.command() / @app.callback() decorators.
app = typer.Typer(help="Lorekeep — compile team docs into a temporal knowledge graph.")
19
+
20
+
21
# Empty callback forces multi-command mode so subcommands are not auto-promoted.
@app.callback()
def _main() -> None:
    """Lorekeep — compile team docs into a temporal knowledge graph."""
    # Deliberately has no body: the callback exists only for its registration
    # side effect described in the comment above.
25
+
26
+
27
+
28
@app.command()
def version() -> None:
    """Print the Lorekeep version."""
    # Build the banner first, then hand it to Typer for output.
    banner = f"lorekeep {__version__}"
    typer.echo(banner)
32
+
33
+
34
@app.command()
def compile() -> None:
    """Compile raw/ into graph/facts.jsonl."""
    # NOTE: this function shadows the builtin ``compile`` within the module. The
    # name is kept because it is the module's public entry point for the command.
    p = resolve_paths()
    schema = load_schema(p["schema"])
    config = load_config(p["config"])

    if os.environ.get("LOREKEEP_PROVIDER") == "fake":
        # Offline mode: a single canned extraction result, no network access.
        canned = json.dumps({
            "nodes": [
                {"id": "svc:payments-api", "type": "service", "name": "payments-api",
                 "props": {"lang": "go"}, "valid_from": "2024-01-15"},
                {"id": "svc:auth", "type": "service", "name": "auth"},
                {"id": "team:backend", "type": "team", "name": "team-backend"},
                {"id": "dec:adr-007", "type": "decision",
                 "props": {"title": "payments-api adopts internal signing"}},
            ],
            "edges": [
                {"type": "depends_on", "from": "svc:payments-api", "to": "svc:auth",
                 "valid_from": "2024-01-15", "valid_to": "2025-03-01"},
                {"type": "decided_by", "from": "dec:adr-007", "to": "team:backend"},
            ],
            "aliases": {},
        })
        provider = FakeProvider(responses=[canned])
    else:
        # Key lookup order: the environment variable named by api_key_env first,
        # then the inline api_key from the config file.
        api_key = None
        if config.provider.api_key_env:
            api_key = os.environ.get(config.provider.api_key_env)
        if not api_key:
            api_key = config.provider.api_key
            # Warn exactly when the inline key is the one actually used. Doing it
            # in this branch avoids relying on object identity between strings.
            if api_key:
                typer.echo(
                    "warning: using inline api_key from config.yaml — prefer api_key_env "
                    "(env var). config.yaml is gitignored but env is safer."
                )
        provider = LiteLLMProvider(
            model=config.provider.model,
            api_base=config.provider.api_base,
            temperature=config.provider.temperature,
            api_key=api_key,
        )

    manifest = compile_graph(
        raw_root=p["raw"], out_dir=p["out"], schema=schema,
        provider=provider, cache_path=p["cache"], chunk_lines=config.compile.chunk_lines,
    )
    typer.echo(f"compiled: {manifest.node_count} nodes, {manifest.edge_count} edges, "
               f"run_id={manifest.run_id}, facts_hash={manifest.facts_hash}")
83
+
84
+
85
@app.command(name="eval")
def eval_cmd() -> None:
    """Run Tier-1 construction-quality evaluation vs the gold corpus."""
    from lorekeep.eval.construction import extraction_report, structure_report

    paths = resolve_paths()
    graph_dir = paths["out"]
    gold_dir = Path(os.environ.get("LOREKEEP_GOLD", "tests/fixtures/gold"))

    report = {
        "extraction": extraction_report(graph_dir, gold_dir),
        "structure": structure_report(graph_dir),
    }
    # Serialise once; the same text is written to disk and echoed.
    rendered = json.dumps(report, indent=2, sort_keys=True)

    results_path = Path(
        os.environ.get("LOREKEEP_EVAL_RESULTS", ".lorekeep/eval/results.json")
    )
    results_path.parent.mkdir(parents=True, exist_ok=True)
    results_path.write_text(rendered)
    typer.echo(rendered)
100
+
101
+
102
@app.command()
def check() -> None:
    """Validate the compiled graph: loads, no dangling edges."""
    p = resolve_paths()
    from lorekeep.eval.construction import structure_report
    struct = structure_report(p["out"])
    rate = struct["dangling_edge_rate"]
    if rate > 0:
        # The value read here is keyed as a rate, so report it as one rather
        # than labelling it as a number of edges.
        typer.echo(f"check: FAIL — dangling edge rate {rate}")
        raise typer.Exit(code=1)
    typer.echo(f"check: ok — {struct['node_count']} nodes, {struct['edge_count']} edges, 0 dangling")
112
+
113
+
114
@app.command()
def serve(
    transport: str = typer.Option("stdio", "--transport", help="stdio (default) | http"),
) -> None:
    """Serve the scoped graph over MCP."""
    from lorekeep.mcp_server import configure, mcp

    paths = resolve_paths()
    env_ns = os.environ.get("LOREKEEP_NS")
    if not env_ns:
        # No override in the environment: use the config's default namespaces.
        allowed = load_config(paths["config"]).ns.default
    else:
        # Comma-separated override; blank entries are dropped.
        trimmed = (piece.strip() for piece in env_ns.split(","))
        allowed = [piece for piece in trimmed if piece]

    configure(graph_dir=paths["out"], allowed_ns=allowed, schema_path=paths["schema"])
    mcp.run(transport=transport)
128
+
129
+
130
# Nested command group mounted on the root app under the name "mcp".
mcp_app = typer.Typer(help="Coding-agent integration.")
app.add_typer(mcp_app, name="mcp")
132
+
133
+
134
@mcp_app.command("add")
def mcp_add(
    agent: str = typer.Option(..., "--agent", help="claude | cursor | codex"),
    scope: str = typer.Option("project", "--scope", help="project | user"),
    ns: str = typer.Option(None, "--ns", help="namespace to scope the agent to"),
) -> None:
    """Write the agent's MCP config + print an agent-memory snippet."""
    from lorekeep.integrations import claude_code, codex, cursor
    from lorekeep.integrations.common import agent_memory_snippet, resolve_command

    # Validate the user-supplied choices before touching the config or disk.
    writers = {"claude": claude_code, "cursor": cursor, "codex": codex}
    if agent not in writers:
        typer.echo(f"unknown agent: {agent} (choose claude|cursor|codex)")
        raise typer.Exit(code=1)
    # Reject unknown scopes explicitly: otherwise any mistyped value would fall
    # through to the home-directory branch below.
    if scope not in ("project", "user"):
        typer.echo(f"unknown scope: {scope} (choose project|user)")
        raise typer.Exit(code=1)

    p = resolve_paths()
    config = load_config(p["config"])
    command, args = resolve_command(config.install_source)

    # "project" writes relative to the current directory, "user" to the home dir.
    target = Path.cwd() if scope == "project" else Path.home()
    written = writers[agent].write_config(target, command, args, ns)
    typer.echo(f"wrote {agent} config -> {written}")
    typer.echo("\n" + agent_memory_snippet())
156
+
157
+
158
@app.command()
def doctor() -> None:
    """Verify install: graph loads, schema valid, ns resolves, a tool responds."""
    p = resolve_paths()
    problems = []

    # A missing or unloadable graph is fatal: nothing else can be checked.
    facts_path = p["out"] / "facts.jsonl"
    if not facts_path.exists():
        typer.echo(f"FAIL: facts.jsonl not found at {facts_path}")
        raise typer.Exit(code=1)

    try:
        from lorekeep.store.graph import GraphStore
        store = GraphStore.from_jsonl(facts_path)
    except Exception as exc:
        typer.echo(f"FAIL: cannot load graph: {exc}")
        raise typer.Exit(code=1)

    if not p["schema"].exists():
        problems.append("schema.json missing")
    else:
        try:
            load_schema(p["schema"])
        except Exception as exc:
            problems.append(f"schema invalid: {exc}")

    # Resolve namespaces the same way the serve command does (blank entries
    # dropped), so doctor checks the list serve would actually use.
    raw_ns = os.environ.get("LOREKEEP_NS")
    allowed = None
    if raw_ns:
        allowed = [x.strip() for x in raw_ns.split(",") if x.strip()]
    else:
        try:
            allowed = load_config(p["config"]).ns.default
        except Exception as exc:
            # Report a broken config as a finding instead of a traceback.
            problems.append(f"config invalid: {exc}")

    ns = []
    if allowed is not None:
        try:
            from lorekeep.mcp_server import configure, list_namespaces
            configure(graph_dir=p["out"], allowed_ns=allowed, schema_path=p["schema"])
            ns = list_namespaces()
        except Exception as exc:
            problems.append(f"mcp configure/tool failed: {exc}")
            ns = []

    if problems:
        typer.echo("FAIL: " + "; ".join(problems))
        raise typer.Exit(code=1)

    typer.echo(
        f"all checks passed: {len(store.node_ids())} nodes, "
        f"{len(store.all_edges())} edges, namespaces={ns}"
    )
203
+
204
+
205
@app.command()
def init() -> None:
    """Bootstrap the data home: config + schema + raw/graph dirs."""
    p = resolve_paths()
    created = []

    # Existing files are never overwritten; only missing defaults are written.
    # Files are written as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    p["config"].parent.mkdir(parents=True, exist_ok=True)
    if not p["config"].exists():
        p["config"].write_text(DEFAULT_CONFIG_YAML, encoding="utf-8")
        created.append(str(p["config"]))

    p["schema"].parent.mkdir(parents=True, exist_ok=True)
    if not p["schema"].exists():
        p["schema"].write_text(json.dumps(DEFAULT_SCHEMA, indent=2), encoding="utf-8")
        created.append(str(p["schema"]))

    p["raw"].mkdir(parents=True, exist_ok=True)
    p["out"].mkdir(parents=True, exist_ok=True)

    typer.echo(f"home ready: config={p['config']}")
    typer.echo(f"  schema={p['schema']}  raw={p['raw']}  graph={p['out']}")
    if created:
        typer.echo(f"  wrote defaults: {created}")
    else:
        typer.echo("  (existing config/schema preserved)")
226
+
227
+
228
# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()
File without changes
@@ -0,0 +1,150 @@
1
+ """Extract: turn a DocChunk into candidate facts via an LLM. Pure helpers first."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import re
6
+ from datetime import date
7
+ from typing import Any
8
+
9
+ from lorekeep.models import DocChunk, Edge, Node, Schema
10
+
11
# System message passed to the provider on every extraction call (see
# extract_chunk). It fixes the JSON shape that parse_response reads back:
# top-level "nodes", "edges" and "aliases" keys.
SYSTEM_PROMPT = (
    "You are a knowledge-graph extractor. Read the document chunk and emit a JSON "
    'object {"nodes":[...], "edges":[...], "aliases":{...}}. '
    "Only use node_types and edge_types listed in the provided schema. "
    "For every node give id (stable slug prefixed by type, e.g. svc:payments-api), "
    "type, name, optional props, optional valid_from/valid_to (ISO dates, null = unknown). "
    "For every edge give type, from (node id), to (node id), optional valid_from/valid_to. "
    "aliases maps a canonical name to surface variants. Emit NO text outside the JSON."
)
20
+
21
+
22
def build_prompt(chunk: DocChunk, schema: Schema) -> str:
    """Render the user prompt for one chunk.

    The prompt lists the schema's node type names and edge types (each shown as
    ``name(from->to)``), followed by the chunk's source, namespace and text.
    """
    node_list = ", ".join(schema.node_types)
    edge_specs = [
        f"{name}({spec.from_}->{spec.to})"
        for name, spec in schema.edge_types.items()
    ]
    edge_list = ", ".join(edge_specs)
    pieces = (
        f"Allowed node_types: {node_list}\n",
        f"Allowed edge_types: {edge_list}\n\n",
        f"Source: {chunk.src}\n",
        f"Namespace: {chunk.namespace}\n\n",
        f"Document chunk:\n{chunk.text}\n",
    )
    return "".join(pieces)
34
+
35
+
36
def _parse_date(v: Any) -> date | None:
    """Convert an ISO date string to a ``date``; any falsy value yields ``None``."""
    return date.fromisoformat(v) if v else None
40
+
41
+
42
def _extract_json(raw: str, chunk: DocChunk) -> str:
    """Best-effort recover a JSON object from LLM output.

    response_format=json_object usually yields clean JSON, but some models wrap
    output in ```json fences or add prose around it. The first ``{`` is located
    and exactly one JSON object is decoded from that position, so a leading
    fence or language tag and any trailing text are ignored.

    Args:
        raw: The raw text returned by the provider.
        chunk: The chunk being processed; only ``chunk.src`` is read, for the
            error message.

    Returns:
        The substring of ``raw`` holding the first decodable JSON object.

    Raises:
        ValueError: If ``raw`` contains no ``{`` or the text starting at the
            first ``{`` is not a valid JSON object. The message names the chunk
            source, and the decoder error is chained as the cause.
    """
    start = raw.find("{")
    if start == -1:
        raise ValueError(f"LLM returned non-JSON for {chunk.src}")
    try:
        # raw_decode stops at the end of the first complete value, so trailing
        # fences or commentary after the object do not break parsing.
        _, end = json.JSONDecoder().raw_decode(raw, start)
    except json.JSONDecodeError as exc:
        raise ValueError(f"LLM returned non-JSON for {chunk.src}") from exc
    return raw[start:end]
65
+
66
+
67
def parse_response(
    raw: str, chunk: DocChunk, schema: Schema | None = None,
) -> tuple[list[Node], list[Edge], dict[str, list[str]]]:
    """Turn one raw LLM response into nodes, edges and name aliases.

    Args:
        raw: Raw provider output for ``chunk``.
        chunk: Source chunk; its namespace and src are stamped on every fact.
        schema: When given, nodes/edges whose type the schema rejects are
            skipped. When ``None`` no type filtering is applied.

    Returns:
        ``(nodes, edges, aliases)``. Edges are returned with an empty id.
        ``aliases`` maps each key of the response's "aliases" object to a list.

    Raises:
        ValueError: Propagated from ``_extract_json`` / ``_parse_date`` when the
            response is not JSON or carries a malformed date.
        KeyError: If a node lacks "id" or an edge lacks "from"/"to".
    """
    data = json.loads(_extract_json(raw, chunk))

    # ``or`` fallbacks: a key that is present but null is treated like a
    # missing key, so it does not crash iteration or dict().
    nodes: list[Node] = []
    for n in data.get("nodes") or []:
        ntype = n.get("type")
        if schema is not None and not schema.is_valid_node_type(ntype):
            continue
        props = dict(n.get("props") or {})
        # The top-level "name" is stored inside props unless props already has one.
        if "name" in n and "name" not in props:
            props["name"] = n["name"]
        nodes.append(Node(
            id=n["id"],
            type=ntype,
            ns=(chunk.namespace,),
            valid_from=_parse_date(n.get("valid_from")),
            valid_to=_parse_date(n.get("valid_to")),
            props=props,
            src=(chunk.src,),
        ))

    edges: list[Edge] = []
    for e in data.get("edges") or []:
        etype = e.get("type")
        if schema is not None and not schema.is_valid_edge_type(etype):
            continue
        edges.append(Edge(
            id="",  # assigned deterministically in resolve
            type=etype,
            **{"from": e["from"]},  # "from" is a Python keyword, hence the dict splat
            to=e["to"],
            ns=(chunk.namespace,),
            valid_from=_parse_date(e.get("valid_from")),
            valid_to=_parse_date(e.get("valid_to")),
            src=(chunk.src,),
        ))

    aliases: dict[str, list[str]] = {}
    for canonical, variants in (data.get("aliases") or {}).items():
        if isinstance(variants, str):
            # A lone string is one variant, not a sequence of characters.
            aliases[canonical] = [variants]
        else:
            aliases[canonical] = list(variants or [])
    return nodes, edges, aliases
105
+
106
+
107
+ import hashlib
108
+ from pathlib import Path
109
+
110
+ from lorekeep.compile.providers import LLMProvider
111
+
112
+
113
class ExtractionCache:
    """On-disk JSON store of raw LLM responses, keyed by chunk hash + schema version."""

    def __init__(self, path: Path) -> None:
        """Remember ``path`` and load any entries already stored there."""
        self.path = Path(path)
        if self.path.exists():
            loaded = json.loads(self.path.read_text(encoding="utf-8"))
        else:
            loaded = {}
        self._data: dict[str, str] = loaded

    def key(self, chunk: DocChunk, schema_version: int) -> str:
        """Return the SHA-256 hex digest of ``"<schema_version>\\n<chunk.hash>"``."""
        material = str(schema_version).encode("utf-8") + b"\n" + chunk.hash.encode("utf-8")
        return hashlib.sha256(material).hexdigest()

    def get(self, key: str) -> str | None:
        """Return the cached response for ``key``, or ``None`` when absent."""
        return self._data.get(key)

    def set(self, key: str, raw: str) -> None:
        """Store ``raw`` under ``key`` in memory; call ``save`` to persist."""
        self._data[key] = raw

    def save(self) -> None:
        """Write all entries to ``self.path`` as sorted, indented JSON."""
        self.path.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(self._data, sort_keys=True, indent=2)
        self.path.write_text(serialized, encoding="utf-8")
140
+
141
+
142
def extract_chunk(
    chunk: DocChunk, schema: Schema, provider: LLMProvider, cache: ExtractionCache,
) -> tuple[list[Node], list[Edge], dict[str, list[str]]]:
    """Extract facts for one chunk, calling the provider only on a cache miss.

    A fresh provider response is stored in ``cache`` (in memory) before it is
    parsed; this function does not call ``cache.save()`` itself.
    """
    cache_key = cache.key(chunk, schema.version)
    cached = cache.get(cache_key)
    if cached is not None:
        return parse_response(cached, chunk, schema)

    fresh = provider.extract_json(SYSTEM_PROMPT, build_prompt(chunk, schema))
    cache.set(cache_key, fresh)
    return parse_response(fresh, chunk, schema)
@@ -0,0 +1,55 @@
1
+ """Ingest: raw markdown files -> DocChunks with provenance."""
2
+ from __future__ import annotations
3
+
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from lorekeep.models import DocChunk
8
+
9
+
10
def namespace_for(raw_root: Path, path: Path) -> str:
    """Return the namespace of ``path``: its first directory under ``raw_root``.

    Files sitting directly in ``raw_root`` belong to the ``"public"`` namespace.
    """
    segments = path.relative_to(raw_root).parts
    # More than one segment means <dir>/.../<file>; the first one is the namespace.
    return segments[0] if len(segments) > 1 else "public"
16
+
17
+
18
def ingest_file(raw_root: Path, path: Path, chunk_lines: int) -> list[DocChunk]:
    """Split one UTF-8 file into chunks of at most ``chunk_lines`` lines.

    Windows that contain only blank lines are skipped. ``start_line`` and
    ``end_line`` are 1-based and inclusive; ``path`` is stored relative to
    ``raw_root``.
    """
    namespace = namespace_for(raw_root, path)
    relative = str(path.relative_to(raw_root))
    all_lines = path.read_text(encoding="utf-8").splitlines()

    result: list[DocChunk] = []
    for offset in range(0, len(all_lines), chunk_lines):
        window = all_lines[offset:offset + chunk_lines]
        if all(not text.strip() for text in window):
            continue  # nothing but whitespace in this window
        result.append(
            DocChunk(
                path=relative,
                start_line=offset + 1,
                end_line=offset + len(window),
                text="\n".join(window),
                namespace=namespace,
            )
        )
    return result
35
+
36
+
37
def ingest(raw_root: Path, glob: str = "**/*.md", chunk_lines: int = 60) -> list[DocChunk]:
    """Collect DocChunks for every file under ``raw_root`` that matches ``glob``.

    Matches are visited in sorted order. A match whose resolved location lies
    outside the resolved ``raw_root`` (for instance a symlink such as
    raw/x/leak.md -> ~/.ssh/id_rsa) is reported on stderr and skipped, so that
    content outside raw_root is never chunked — fail closed.
    """
    resolved_root = raw_root.resolve()
    collected: list[DocChunk] = []
    for candidate in sorted(raw_root.glob(glob)):
        if not candidate.is_file():
            continue
        if candidate.resolve().is_relative_to(resolved_root):
            collected.extend(ingest_file(raw_root, candidate, chunk_lines))
        else:
            print(f"lorekeep: skip path outside raw_root (possible symlink): {candidate}",
                  file=sys.stderr)
    return collected
@@ -0,0 +1,49 @@
1
+ """LLM provider abstraction. litellm is the only hard dependency on a vendor."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Protocol, runtime_checkable
5
+
6
+
7
@runtime_checkable
class LLMProvider(Protocol):
    """Structural interface for providers: anything with a matching ``extract_json``."""

    def extract_json(self, system: str, user: str) -> str:
        """Return the provider's raw text response for a system/user prompt pair."""
        ...
10
+
11
+
12
class FakeProvider:
    """Replays a fixed queue of responses, in order, without any network access."""

    def __init__(self, responses: list[str]) -> None:
        """Copy ``responses`` into an internal queue and start an empty call log."""
        self.calls: list[tuple[str, str]] = []
        self._responses = [*responses]

    def extract_json(self, system: str, user: str) -> str:
        """Log the call and return the next queued response.

        Raises:
            RuntimeError: When the queue is exhausted. The call is still logged.
        """
        self.calls.append((system, user))
        if self._responses:
            return self._responses.pop(0)
        raise RuntimeError("FakeProvider: no canned response left")
24
+
25
+
26
class LiteLLMProvider:
    """Real provider backed by litellm. Supports openai/anthropic/ollama."""

    def __init__(self, model: str, api_base: str | None = None,
                 temperature: float = 0.0, api_key: str | None = None) -> None:
        """Store the connection settings; no request is made until ``extract_json``."""
        self.model = model
        self.api_base = api_base
        self.temperature = temperature
        self.api_key = api_key

    def extract_json(self, system: str, user: str) -> str:
        """Send one system+user exchange and return the first choice's text.

        The request asks for ``response_format={"type": "json_object"}``.

        Raises:
            RuntimeError: If the response's message content is ``None``, so the
                ``str`` return contract is never silently broken.
        """
        import litellm  # imported lazily so tests need not install it
        resp = litellm.completion(
            model=self.model,
            api_base=self.api_base,
            api_key=self.api_key,
            temperature=self.temperature,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            response_format={"type": "json_object"},
        )
        content = resp.choices[0].message.content
        if content is None:
            # Fail here with a clear message rather than handing None to callers
            # that expect text.
            raise RuntimeError(
                f"LiteLLMProvider: model {self.model!r} returned no message content"
            )
        return content
@@ -0,0 +1,111 @@
1
+ """Resolve: dedup entities, validate edges, enforce ns, quarantine bad facts.
2
+
3
+ Extraction may emit the same entity under several ids (aliases). This stage
4
+ collapses them onto one canonical id, rewrites edge endpoints, drops edges whose
5
+ endpoints disappeared, and quarantines malformed facts for review.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+
11
+ from lorekeep.models import Edge, Node
12
+
13
+
14
@dataclass
class ResolveResult:
    """Output of ``resolve``: surviving facts, the alias map used, and rejects."""

    # Nodes after alias collapsing, one per canonical id.
    nodes: list[Node] = field(default_factory=list)
    # Edges that passed validation, with rewritten endpoints and assigned ids.
    edges: list[Edge] = field(default_factory=list)
    aliases: dict[str, str] = field(default_factory=dict)  # alias_id -> canonical_id
    # (serialised edge, reason) pairs for edges that were rejected.
    quarantined: list[tuple[dict, str]] = field(default_factory=list)
20
+
21
+
22
def _build_alias_map(
    nodes: list[Node],
    name_aliases: dict[str, list[str]] | None,
    explicit_map: dict[str, str] | None,
) -> dict[str, str]:
    """Return alias_id -> canonical_id. Canonical = first node id seen for a name.

    Args:
        nodes: Candidate nodes; only ``id`` and ``props["name"]`` are read.
        name_aliases: Canonical name -> surface variants. A node belongs to a
            group when its name equals the canonical name or is one of the
            variants.
        explicit_map: Direct id -> id overrides, applied last so they win.

    Returns:
        A mapping from every non-canonical node id to its group's canonical id,
        updated with ``explicit_map``.
    """
    alias_map: dict[str, str] = {}
    # 1) by name: group nodes whose props.name is the group's canonical name or
    #    one of its variants. The canonical name itself must match too, otherwise
    #    the node carrying it would stay separate from its own variants.
    if name_aliases:
        name_to_canonical: dict[str, str] = {}
        for nd in nodes:
            nm = nd.props.get("name")
            if not nm:
                continue
            for canonical_name, variants in name_aliases.items():
                if nm == canonical_name or nm in variants:
                    # First node seen for the group becomes its canonical id.
                    canon = name_to_canonical.setdefault(canonical_name, nd.id)
                    if nd.id != canon:
                        alias_map[nd.id] = canon
    # 2) explicit id->id overrides win
    if explicit_map:
        alias_map.update(explicit_map)
    return alias_map
45
+
46
+
47
def _canonical(node_id: str, alias_map: dict[str, str]) -> str:
    """Follow ``alias_map`` from ``node_id`` until reaching an unmapped id.

    Ids already visited are tracked so a cyclic mapping terminates; in that
    case the id at which the cycle was detected is returned.
    """
    visited: set[str] = set()
    current = node_id
    while True:
        if current in visited or current not in alias_map:
            return current
        visited.add(current)
        current = alias_map[current]
54
+
55
+
56
def resolve(
    nodes: list[Node],
    edges: list[Edge],
    name_aliases: dict[str, list[str]] | None = None,
    aliases_map: dict[str, str] | None = None,
) -> ResolveResult:
    """Collapse aliased nodes, rewrite edge endpoints and quarantine bad edges.

    Nodes sharing a canonical id are merged: later props override earlier ones,
    while ``src`` and ``ns`` are unioned in first-seen order. An edge is
    quarantined when either rewritten endpoint is not a known node, or when both
    endpoints collapse to the same node. Surviving edges get sequential ids of
    the form ``e_<type>_<NNNN>`` in input order.
    """
    mapping = _build_alias_map(nodes, name_aliases, aliases_map)

    # --- nodes: one entry per canonical id, in first-seen order
    merged: dict[str, Node] = {}
    for node in nodes:
        key = _canonical(node.id, mapping)
        existing = merged.get(key)
        if existing is None:
            # Keep the stored id equal to the dict key, even when the canonical
            # id came from an explicit mapping to an id no node carries.
            merged[key] = node if node.id == key else node.model_copy(update={"id": key})
            continue
        merged[key] = existing.model_copy(update={
            "props": {**existing.props, **node.props},
            "src": tuple(dict.fromkeys(existing.src + node.src)),
            "ns": tuple(dict.fromkeys(existing.ns + node.ns)),
        })

    # --- edges: rewrite endpoints, then keep or quarantine
    kept: list[Edge] = []
    rejected: list[tuple[dict, str]] = []
    for edge in edges:
        source_id = _canonical(edge.from_, mapping)
        target_id = _canonical(edge.to, mapping)

        if source_id not in merged or target_id not in merged:
            reason = f"dangling endpoint ({source_id}->{target_id})"
        elif source_id == target_id:
            reason = "self-loop"
        else:
            reason = None

        if reason is not None:
            rejected.append((edge.model_dump(mode="json", by_alias=True), reason))
            continue

        # The sequence number counts kept edges only, across all edge types.
        sequence = len(kept) + 1
        kept.append(edge.model_copy(update={
            "id": f"e_{edge.type}_{sequence:04d}",
            "from_": source_id,
            "to": target_id,
        }))

    return ResolveResult(
        nodes=list(merged.values()),
        edges=kept,
        aliases=mapping,
        quarantined=rejected,
    )
@@ -0,0 +1,63 @@
1
+ """Writer: emit deterministic facts.jsonl + manifest.json.
2
+
3
+ Determinism = facts sorted by (kind, type, id), JSON keys sorted, stable
4
+ separators. Re-compiling unchanged input yields byte-identical output.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import hashlib
9
+ import json
10
+ import os
11
+ import tempfile
12
+ from pathlib import Path
13
+
14
+ from lorekeep.models import DocChunk, Edge, Manifest, Node
15
+
16
+
17
def _sort_key(fact: Node | Edge) -> tuple[str, str, str]:
    """Ordering key for facts: kind first, then type, then id."""
    kind, fact_type, fact_id = fact.kind, fact.type, fact.id
    return kind, fact_type, fact_id
19
+
20
+
21
def _atomic_write(path: Path, data: str) -> None:
    """Write data to path atomically: stage a temp file then os.replace onto it.

    Prevents a torn read when the MCP server lazy-reloads facts.jsonl mid-write
    (compile truncating the file while a query reads it). os.replace is atomic
    when src and dst share a filesystem, which holds for a sibling temp file.

    Args:
        path: Destination file; missing parent directories are created.
        data: Text to write, encoded as UTF-8 with newlines written verbatim.

    Raises:
        Any exception from writing or replacing is re-raised after the staged
        temp file has been removed (best effort).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp = tempfile.mkstemp(dir=path.parent, prefix=path.name + ".", suffix=".tmp")
    try:
        # newline="\n" disables platform newline translation, so the bytes on
        # disk are the same on every OS.
        with os.fdopen(fd, "w", encoding="utf-8", newline="\n") as f:
            f.write(data)
        os.replace(tmp, path)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a stray temp file behind.
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise
40
+
41
+
42
def write_graph(
    out_dir: Path, nodes: list[Node], edges: list[Edge], manifest: Manifest,
) -> None:
    """Write ``facts.jsonl`` and ``manifest.json`` into ``out_dir``.

    Facts are emitted one JSON line each, sorted by ``_sort_key``; every line
    ends with a newline and an empty graph produces an empty file.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    ordered = sorted([*nodes, *edges], key=_sort_key)
    payload = "".join(fact.to_json_line() + "\n" for fact in ordered)
    _atomic_write(out_dir / "facts.jsonl", payload)
    _atomic_write(out_dir / "manifest.json", manifest.to_json())
51
+
52
+
53
def run_id(chunks: list[DocChunk], schema_version: int) -> str:
    """Return a 16-hex-character id derived from the schema version and chunk hashes.

    Chunks are hashed in (path, start_line) order, so the order of ``chunks``
    itself does not affect the result.
    """
    ordered = sorted(chunks, key=lambda chunk: (chunk.path, chunk.start_line))
    digest = hashlib.sha256(str(schema_version).encode("utf-8"))
    for chunk in ordered:
        digest.update(chunk.hash.encode("utf-8"))
    return digest.hexdigest()[:16]
59
+
60
+
61
def facts_hash(out_dir: Path) -> str:
    """Return the first 16 hex characters of the SHA-256 of ``out_dir/facts.jsonl``."""
    facts_file = out_dir / "facts.jsonl"
    digest = hashlib.sha256(facts_file.read_bytes())
    return digest.hexdigest()[:16]