temporal-reasoning 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcp_server.py ADDED
@@ -0,0 +1,2734 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Temporal Reasoning MCP Server.
4
+
5
+ Persistent stdio MCP server providing bi-temporal graph memory for AI coding agents.
6
+ Sole interface to the minigraf .graph file via the MiniGrafDb Python binding.
7
+ """
8
+ import asyncio
9
+ import datetime
10
+ import json
11
+ import os
12
+ import re
13
+ import subprocess as _subprocess
14
+ import sys
15
+ import threading
16
+ from pathlib import Path
17
+ from typing import Any, Dict, List, Optional
18
+
19
+ from mcp.server import Server
20
+ from mcp.server.stdio import stdio_server
21
+ from minigraf import MiniGrafDb, MiniGrafError
22
+
23
+ try:
24
+ from rank_bm25 import BM25Okapi as _BM25Okapi
25
+ _BM25_AVAILABLE = True
26
+ except ImportError:
27
+ _BM25Okapi = None # type: ignore[assignment,misc]
28
+ _BM25_AVAILABLE = False
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Session-scoped rules — registered once at startup, cached in RuleRegistry
32
+ # ---------------------------------------------------------------------------
33
SESSION_RULES = [
    # Single-hop rules over :depends-on and :calls edges.
    "(rule [(linked ?a ?b) [?a :depends-on ?b]])",
    "(rule [(linked ?a ?b) [?a :calls ?b]])",
    "(rule [(reachable ?a ?b) [?a :depends-on ?b]])",
    "(rule [(reachable ?a ?b) [?a :calls ?b]])",
    # The same two heads over :contains edges.
    "(rule [(linked ?a ?b) [?a :contains ?b]])",
    "(rule [(reachable ?a ?b) [?a :contains ?b]])",
    # Commit-graph traversal over :parent edges: (ancestor ?child ?anc) holds
    # when ?anc is a direct or transitive parent of ?child. The first rule is
    # the base case, the second recurses through an intermediate commit.
    "(rule [(ancestor ?child ?anc) [?child :parent ?anc]])",
    "(rule [(ancestor ?child ?anc) [?child :parent ?mid] (ancestor ?mid ?anc)])",
]
46
+
47
# Rules added at runtime through minigraf_rule. _open_db_at replays them
# (after SESSION_RULES) every time the database handle is (re)opened.
_user_rules: List[str] = []

# The single shared database handle; None until the first open.
_db: Optional[MiniGrafDb] = None

# Path of the graph file behind _db and the file mtime recorded when it was
# last opened or checkpointed. minigraf's Drop impl writes to the file even
# for read-only handles, which invalidates any other open handle's in-memory
# page table; comparing mtimes and reopening on change is the workaround
# until the upstream bug is fixed.
_graph_path: str = ""
_db_mtime: float = 0.0

# Server instance, stored after server creation so MCP sampling can reach it.
_server_ref: Optional[Server] = None

# Background ingestion: the running task (if any) and its progress record.
_ingest_task: Optional[asyncio.Task] = None
_ingest_progress: Dict[str, Any] = dict(
    status="idle",
    processed=0,
    total=0,
    current_commit="",
    error=None,
)
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # Language detection and grammar caching
73
+ # ---------------------------------------------------------------------------
74
+
75
# Lower-cased file suffix -> grammar name used for parser lookup.
_EXT_TO_LANG: Dict[str, str] = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".tsx": "tsx",
    ".jsx": "javascript",
    ".rs": "rust",
    ".go": "go",
    ".java": "java",
    ".c": "c",
    ".cpp": "cpp",
    ".cs": "c_sharp",
    ".rb": "ruby",
    ".php": "php",
    ".kt": "kotlin",
    ".swift": "swift",
    ".scala": "scala",
    ".hs": "haskell",
    ".lua": "lua",
    ".ex": "elixir",
    ".exs": "elixir",
}

# Grammar name -> parser instance, or None when no backend could supply one.
_grammar_cache: Dict[str, Any] = {}
85
+
86
+
87
def _get_parser(file_path: str) -> Optional[Any]:
    """Return a parser for the language implied by ``file_path``'s suffix.

    The suffix is lower-cased and looked up in ``_EXT_TO_LANG``; an unmapped
    suffix yields None. The outcome for a language (a parser, or None when
    both backends failed) is stored in ``_grammar_cache`` and reused on later
    calls, so a failed lookup is not retried.

    Backends, tried in order:
      1. ``tree_sitter_languages`` (bundled grammars, ``set_language`` API).
      2. A per-language ``tree_sitter_<lang>`` package wrapped with
         ``tree_sitter.Language`` / ``tree_sitter.Parser``.
    """
    lang_name = _EXT_TO_LANG.get(Path(file_path).suffix.lower())
    if not lang_name:
        return None
    if lang_name in _grammar_cache:
        return _grammar_cache[lang_name]

    def _from_bundle() -> Optional[Any]:
        # Any failure (missing package, unknown grammar, API mismatch) just
        # means this backend is unavailable.
        try:
            import tree_sitter_languages  # type: ignore
            import tree_sitter  # type: ignore
            language = tree_sitter_languages.get_language(lang_name)
            bundled = tree_sitter.Parser()
            bundled.set_language(language)
            return bundled
        except Exception:
            return None

    def _from_language_package() -> Optional[Any]:
        try:
            module = __import__(f"tree_sitter_{lang_name}", fromlist=["language"])
            from tree_sitter import Language, Parser  # type: ignore
            # Prefer a language_<lang>() factory when the module has one
            # (PHP exposes language_php()), otherwise use language().
            factory = getattr(module, f"language_{lang_name}", None) or module.language
            return Parser(Language(factory()))
        except Exception:
            return None

    parser = _from_bundle()
    if parser is None:
        parser = _from_language_package()

    _grammar_cache[lang_name] = parser
    return parser
129
+
130
+ # ---------------------------------------------------------------------------
131
+ # AST extraction
132
+ # ---------------------------------------------------------------------------
133
+
134
# Per-language tree-sitter node types that _walk_ast classifies as function
# definitions, class-like definitions, import statements and call sites.
# An empty set means nothing is extracted for that category.
# "tsx" mirrors "typescript" so that files mapped to the "tsx" grammar by
# _EXT_TO_LANG are not silently skipped by _walk_ast.
_LANG_NODE_TYPES: Dict[str, Dict[str, set]] = {
    "python": {
        "functions": {"function_definition", "async_function_definition"},
        "classes": {"class_definition"},
        "imports": {"import_statement", "import_from_statement"},
        "calls": {"call"},
    },
    "javascript": {
        "functions": {"function_declaration", "function_expression", "method_definition"},
        "classes": {"class_declaration"},
        "imports": {"import_statement"},
        "calls": {"call_expression"},
    },
    "typescript": {
        "functions": {"function_declaration", "function_expression", "method_definition"},
        "classes": {"class_declaration"},
        "imports": {"import_statement"},
        "calls": {"call_expression"},
    },
    "tsx": {
        "functions": {"function_declaration", "function_expression", "method_definition"},
        "classes": {"class_declaration"},
        "imports": {"import_statement"},
        "calls": {"call_expression"},
    },
    "rust": {
        "functions": {"function_item"},
        "classes": {"struct_item", "impl_item"},
        "imports": {"use_declaration"},
        "calls": {"call_expression"},
    },
    "go": {
        "functions": {"function_declaration", "method_declaration"},
        "classes": {"type_declaration"},
        "imports": {"import_declaration"},
        "calls": {"call_expression"},
    },
    "java": {
        "functions": {"method_declaration"},
        "classes": {"class_declaration"},
        "imports": {"import_declaration"},
        "calls": {"method_invocation"},
    },
    "c": {
        "functions": {"function_definition"},
        "classes": {"struct_specifier"},
        "imports": {"preproc_include"},
        "calls": {"call_expression"},
    },
    "cpp": {
        "functions": {"function_definition"},
        "classes": {"class_specifier", "struct_specifier"},
        "imports": {"preproc_include"},
        "calls": {"call_expression"},
    },
    "c_sharp": {
        "functions": {"method_declaration"},
        "classes": {"class_declaration"},
        "imports": {"using_directive"},
        "calls": {"invocation_expression"},
    },
    "ruby": {
        "functions": {"method"},
        "classes": {"class"},
        # Imports are ordinary calls; _ruby_require_name filters for require.
        "imports": {"call"},
        "calls": set(),
    },
    "php": {
        "functions": {"function_definition", "method_declaration"},
        "classes": {"class_declaration"},
        "imports": {"require_expression", "include_expression",
                    "require_once_expression", "include_once_expression"},
        "calls": {"function_call_expression"},
    },
    "kotlin": {
        "functions": {"function_declaration"},
        "classes": {"class_declaration"},
        "imports": {"import"},
        "calls": {"call_expression"},
    },
    "swift": {
        "functions": {"function_declaration"},
        "classes": {"class_declaration"},
        "imports": {"import_declaration"},
        "calls": {"call_expression"},
    },
    "scala": {
        "functions": {"function_definition"},
        "classes": {"class_definition"},
        "imports": {"import_declaration"},
        "calls": {"call_expression"},
    },
    "haskell": {
        "functions": {"function"},
        "classes": {"data_type"},
        "imports": {"import"},
        "calls": {"apply"},
    },
    "lua": {
        "functions": {"function_definition"},
        "classes": set(),
        # Imports are function calls; _lua_require_name filters for require.
        "imports": {"function_call"},
        "calls": set(),
    },
    "elixir": {
        "functions": {"def", "defp"},
        "classes": {"defmodule"},
        # Imports are calls; _elixir_module_name filters alias/import/use/require.
        "imports": {"call"},
        "calls": set(),
    },
}
239
+
240
+
241
def _rust_use_root(node) -> Optional[str]:
    """Return the root crate/module name named by a Rust ``use_declaration``.

    Each named child of ``node`` is examined in order and the first non-empty
    result wins. Shapes handled:

      identifier         -> its own text                    (``use foo;``)
      scoped_identifier  -> root of its first named child   (``std::x::Y``)
      scoped_use_list    -> root of its first named child   (``a::{b, c}``)
      use_as_clause      -> root of its ``path`` field      (``use foo as bar;``)

    When a scoped path starts with ``crate``, ``super`` or ``self``, the
    second named child is returned if it is an identifier, so the result
    names the local module rather than the keyword. Anything else yields None.

    Args:
        node: A node exposing ``type``, ``text`` (bytes), ``named_children``
            and ``child_by_field_name``.

    Returns:
        The root name, or None when none can be determined.
    """
    path_keywords = ("crate", "super", "self")

    def root_from_path(n) -> Optional[str]:
        """Extract the root module name from one path-like node."""
        if n.type == "identifier":
            return n.text.decode("utf-8")
        if n.type in ("scoped_identifier", "scoped_use_list"):
            children = n.named_children
            if not children:
                return None
            first = children[0]
            if first.type in path_keywords:
                # Intra-project path: report the segment after the keyword.
                if len(children) > 1 and children[1].type == "identifier":
                    return children[1].text.decode("utf-8")
                return None
            return root_from_path(first)
        if n.type == "use_as_clause":
            path_node = n.child_by_field_name("path")
            return root_from_path(path_node) if path_node else None
        # Bare crate/super/self keywords and unknown shapes carry no root.
        return None

    for child in node.named_children:
        root = root_from_path(child)
        if root:
            return root
    return None
299
+
300
+
301
def _c_include_name(node) -> Optional[str]:
    """Return the bare header name from a C/C++ ``preproc_include`` node.

    The first child of type ``system_lib_string`` or ``string_literal`` is
    used; its surrounding ``<>`` or quote characters, any directory part and
    the final extension are removed:

        #include <stdio.h>       -> "stdio"
        #include "myheader.h"    -> "myheader"

    Returns None when the node has no such child.
    """
    header = next(
        (c for c in node.children if c.type in ("system_lib_string", "string_literal")),
        None,
    )
    if header is None:
        return None
    target = header.text.decode("utf-8").strip("<>\"'")
    stem, _extension = os.path.splitext(os.path.basename(target))
    return stem
314
+
315
+
316
def _csharp_using_name(node) -> Optional[str]:
    """Return the leftmost identifier under a C# ``using_directive`` node.

        using System;                      -> "System"
        using System.Collections.Generic;  -> "System"

    The search is depth-first over named children, in order. Returns None
    when no identifier is found.
    """
    def _leftmost(current) -> Optional[str]:
        if current.type == "identifier":
            return current.text.decode("utf-8")
        # map() is lazy, so subtrees after the first hit are never visited.
        return next(
            (hit for hit in map(_leftmost, current.named_children) if hit),
            None,
        )

    return _leftmost(node)
332
+
333
+
334
def _ruby_require_name(node) -> Optional[str]:
    """Return the module named by a Ruby ``require`` / ``require_relative`` call.

        require 'rails'            -> "rails"
        require_relative 'my_mod'  -> "my_mod"

    Only the first ``string`` argument is considered. Its ``string_content``
    child supplies the value when present; otherwise the string's own text
    with quotes stripped is used. Directory components and the final
    extension are dropped. Returns None for any other call, or when there is
    no string argument.
    """
    callee = node.child_by_field_name("method")
    if callee is None:
        return None
    if callee.text.decode("utf-8") not in ("require", "require_relative"):
        return None

    arguments = node.child_by_field_name("arguments")
    if arguments is None:
        return None

    literal = next((a for a in arguments.named_children if a.type == "string"), None)
    if literal is None:
        return None

    content = next(
        (part for part in literal.named_children if part.type == "string_content"),
        None,
    )
    value = (
        content.text.decode("utf-8")
        if content
        else literal.text.decode("utf-8").strip("'\"")
    )
    return os.path.splitext(os.path.basename(value))[0]
361
+
362
+
363
def _lua_require_name(node) -> Optional[str]:
    """Return the module name from a Lua ``require(...)`` call node.

        require("socket")  -> "socket"

    The callee is the node's first ``identifier`` child and must read
    ``require``. The result is the text of the first ``string`` found inside
    any ``arguments`` child, with surrounding quote characters stripped.
    Returns None for other calls or when no string argument exists.

    Expected AST shape:
        function_call
          identifier  b'require'
          arguments
            (       b'('
            string  b'"socket"'
            )       b')'
    """
    callee = next((c for c in node.children if c.type == "identifier"), None)
    if callee is None or callee.text.decode("utf-8") != "require":
        return None

    literal = next(
        (
            arg
            for group in node.children
            if group.type == "arguments"
            for arg in group.children
            if arg.type == "string"
        ),
        None,
    )
    if literal is None:
        return None
    return literal.text.decode("utf-8").strip("'\"")
390
+
391
+
392
def _elixir_module_name(node) -> Optional[str]:
    """Return the root module named by an Elixir alias/import/use/require call.

        alias MyApp.Router      -> "MyApp"
        import Ecto.Query       -> "Ecto"
        use Phoenix.Controller  -> "Phoenix"
        require Logger          -> "Logger"

    The call's ``target`` field must be an ``identifier`` whose text is one
    of those four keywords; other targets (for example a dot node, as in
    ``IO.puts``) yield None. The module is read from the first ``alias`` node
    found inside an ``arguments`` child, keeping only the text before the
    first dot.
    """
    module_keywords = {"alias", "import", "use", "require"}

    target = node.child_by_field_name("target")
    if target is None or target.type != "identifier":
        return None
    if target.text.decode("utf-8") not in module_keywords:
        return None

    module_node = next(
        (
            arg
            for group in node.children
            if group.type == "arguments"
            for arg in group.children
            if arg.type == "alias"
        ),
        None,
    )
    if module_node is None:
        return None
    return module_node.text.decode("utf-8").split(".")[0]
418
+
419
+
420
def _first_descendant_text(node, type_names) -> Optional[str]:
    """Return the text of the first node (depth-first over named children,
    starting with ``node`` itself) whose type is in ``type_names``."""
    if node.type in type_names:
        return node.text.decode("utf-8")
    for child in node.named_children:
        found = _first_descendant_text(child, type_names)
        if found:
            return found
    return None


def _extract_import_name(node, lang_name: str) -> List[str]:
    """Extract top-level module names from an import node.

    Args:
        node: An import-like node for ``lang_name`` (one of the types listed
            under ``"imports"`` in ``_LANG_NODE_TYPES``).
        lang_name: Grammar name selecting the extraction strategy.

    Returns:
        Zero or more names; only Python ``import a, b`` and Go import lists
        can produce more than one. Unknown languages yield an empty list.
    """
    names: List[str] = []

    if lang_name == "python":
        dotted_paths: List[str] = []
        if node.type == "import_from_statement":
            module = node.child_by_field_name("module_name")
            if module:
                dotted_paths.append(module.text.decode("utf-8"))
        else:
            # import_statement: one entry per imported module.
            for child in node.named_children:
                if child.type == "aliased_import":
                    target = child.child_by_field_name("name")
                    if target:
                        dotted_paths.append(target.text.decode("utf-8"))
                elif child.type == "dotted_name":
                    dotted_paths.append(child.text.decode("utf-8"))
        for dotted in dotted_paths:
            # Relative imports start with dots (".pkg.mod"); strip them so the
            # root is "pkg" rather than "". A bare "from . import x" has no
            # module root and contributes nothing.
            root = dotted.lstrip(".").split(".")[0]
            if root:
                names.append(root)

    elif lang_name in ("javascript", "typescript", "tsx"):
        src = node.child_by_field_name("source")
        if src:
            names.append(src.text.decode("utf-8").strip("'\""))

    elif lang_name == "rust":
        name = _rust_use_root(node)
        if name:
            names.append(name)

    elif lang_name == "go":
        def _go_spec(spec_node) -> None:
            # Keep only the last path segment of the import path.
            path = spec_node.child_by_field_name("path")
            if path:
                names.append(path.text.decode("utf-8").strip('"').split("/")[-1])

        for child in node.named_children:
            if child.type == "import_spec":
                _go_spec(child)
            elif child.type == "import_spec_list":
                for spec in child.named_children:
                    if spec.type == "import_spec":
                        _go_spec(spec)

    elif lang_name == "java":
        name = _first_descendant_text(node, ("identifier",))
        if name:
            names.append(name)

    elif lang_name in ("c", "cpp"):
        name = _c_include_name(node)
        if name:
            names.append(name)

    elif lang_name == "c_sharp":
        name = _csharp_using_name(node)
        if name:
            names.append(name)

    elif lang_name == "ruby":
        name = _ruby_require_name(node)
        if name:
            names.append(name)

    elif lang_name == "php":
        for child in node.children:
            if child.type in ("string", "encapsed_string", "string_literal"):
                val = child.text.decode("utf-8").strip("'\"")
                names.append(os.path.splitext(os.path.basename(val))[0])
                break

    elif lang_name == "kotlin":
        name = _first_descendant_text(node, ("simple_identifier", "identifier"))
        if name:
            names.append(name)

    elif lang_name == "swift":
        for child in node.named_children:
            if child.type in ("identifier", "simple_identifier"):
                names.append(child.text.decode("utf-8"))
                break

    elif lang_name == "scala":
        # Only the first named child is inspected.
        for child in node.named_children:
            names.append(child.text.decode("utf-8").split(".")[0])
            break

    elif lang_name == "haskell":
        for child in node.named_children:
            if child.type in ("module", "qualified_module", "constructor"):
                names.append(child.text.decode("utf-8").split(".")[0])
                break

    elif lang_name == "lua":
        name = _lua_require_name(node)
        if name:
            names.append(name)

    elif lang_name == "elixir":
        name = _elixir_module_name(node)
        if name:
            names.append(name)

    return names
529
+
530
+
531
def _extract_call_name(node, lang_name: str) -> Optional[str]:
    """Extract the callee name from a call node (best-effort, bare identifiers only).

    Most grammars expose the callee under the ``function`` field. Java's
    ``method_invocation`` has no such field: the method is under ``name`` and
    an optional receiver under ``object``. For Java, only receiver-less
    invocations are reported, matching the other languages where qualified
    calls such as ``obj.method()`` are skipped.

    Args:
        node: A call-site node for ``lang_name``.
        lang_name: Grammar name; selects which field holds the callee.

    Returns:
        The identifier text, or None when the callee is missing or is not a
        plain ``identifier`` node.
    """
    if lang_name == "java":
        if node.child_by_field_name("object") is not None:
            return None  # qualified call: receiver.method(...)
        callee = node.child_by_field_name("name")
    else:
        callee = node.child_by_field_name("function")

    if callee and callee.type == "identifier":
        return callee.text.decode("utf-8")
    return None
537
+
538
+
539
def _walk_ast(node, results: Dict[str, List[str]], lang_name: str) -> None:
    """Collect code entities from the subtree rooted at ``node`` into ``results``.

    Nodes are visited in pre-order (a node before its children, children left
    to right). Each node is matched against the ``_LANG_NODE_TYPES`` entry
    for ``lang_name``, checking functions, then classes, then imports, then
    calls; the first matching category wins. Function and class names come
    from the node's ``name`` field and are skipped when that field is absent.

    The walk uses an explicit stack instead of recursion, so very deep trees
    cannot exhaust the interpreter's recursion limit part-way through.

    Args:
        node: Root of the subtree to scan.
        results: Mutated in place; must already hold list values under
            ``"functions"``, ``"classes"``, ``"imports"`` and ``"calls"``.
        lang_name: Grammar name; unknown names leave ``results`` untouched.
    """
    node_types = _LANG_NODE_TYPES.get(lang_name)
    if node_types is None:
        return

    # Resolve the per-category sets once rather than once per visited node.
    function_types = node_types.get("functions", set())
    class_types = node_types.get("classes", set())
    import_types = node_types.get("imports", set())
    call_types = node_types.get("calls", set())

    pending = [node]
    while pending:
        current = pending.pop()
        kind = current.type

        if kind in function_types:
            name_node = current.child_by_field_name("name")
            if name_node:
                results["functions"].append(name_node.text.decode("utf-8"))
        elif kind in class_types:
            name_node = current.child_by_field_name("name")
            if name_node:
                results["classes"].append(name_node.text.decode("utf-8"))
        elif kind in import_types:
            results["imports"].extend(_extract_import_name(current, lang_name))
        elif kind in call_types:
            callee = _extract_call_name(current, lang_name)
            if callee:
                results["calls"].append(callee)

        # Reversed so the leftmost child is popped (visited) first.
        pending.extend(reversed(current.children))
566
+
567
+
568
def _extract_from_source(
    source: bytes, parser: Any, file_path: str
) -> Dict[str, List[str]]:
    """Parse ``source`` and return the functions, classes, imports and calls found.

    The language is chosen from ``file_path``'s lower-cased suffix via
    ``_EXT_TO_LANG``. Extraction is best-effort: any exception raised while
    parsing or walking is swallowed and whatever was collected so far
    (possibly nothing) is returned.

    Returns:
        A dict with list values under ``"functions"``, ``"classes"``,
        ``"imports"`` and ``"calls"``.
    """
    extracted: Dict[str, List[str]] = {
        category: [] for category in ("functions", "classes", "imports", "calls")
    }
    try:
        root = parser.parse(source).root_node
        suffix = Path(file_path).suffix.lower()
        _walk_ast(root, extracted, _EXT_TO_LANG.get(suffix, ""))
    except Exception:
        # Parse failures are non-fatal by design.
        pass
    return extracted
582
+
583
+ # ---------------------------------------------------------------------------
584
+ # DB lifecycle
585
+ # ---------------------------------------------------------------------------
586
+
587
+
588
def _get_graph_path() -> str:
    """Return the graph file path: ``$MINIGRAF_GRAPH_PATH`` when set,
    otherwise ``memory.graph`` in the current working directory."""
    fallback = str(Path.cwd() / "memory.graph")
    return os.environ.get("MINIGRAF_GRAPH_PATH", fallback)
590
+
591
+
592
def _open_db_at(path: str) -> MiniGrafDb:
    """Open the database at ``path`` and make it the module-wide handle.

    After opening, every rule in ``SESSION_RULES`` and then every rule in
    ``_user_rules`` is executed against the new handle. ``_graph_path`` is
    set to ``path`` and ``_db_mtime`` to the file's current mtime (0.0 when
    the file cannot be stat'ed).

    Returns:
        The newly opened handle (also stored in ``_db``).
    """
    global _db, _graph_path, _db_mtime

    _db = MiniGrafDb.open(path)
    for rule_expr in (*SESSION_RULES, *_user_rules):
        _db.execute(rule_expr)

    _graph_path = path
    try:
        opened_mtime = os.path.getmtime(path)
    except OSError:
        opened_mtime = 0.0
    _db_mtime = opened_mtime
    return _db
606
+
607
+
608
def open_db(graph_path: Optional[str] = None) -> MiniGrafDb:
    """Open the database and register the session rules.

    Uses ``graph_path`` when it is non-empty, otherwise the path from
    ``_get_graph_path()``. The actual work is done by ``_open_db_at``.
    """
    target = graph_path if graph_path else _get_graph_path()
    return _open_db_at(target)
611
+
612
+
613
def _update_mtime() -> None:
    """Re-record the graph file's mtime in ``_db_mtime``.

    Called after this process checkpoints, so that its own write is not
    mistaken for an external modification by ``_refresh_if_stale``. Does
    nothing when no graph path is set or the file cannot be stat'ed.
    """
    global _db_mtime
    if _graph_path:
        try:
            _db_mtime = os.path.getmtime(_graph_path)
        except OSError:
            pass
623
+
624
+
625
def _refresh_if_stale() -> None:
    """Reopen the database when the graph file's mtime differs from the
    recorded ``_db_mtime``.

    Background (from the upstream issue this works around): minigraf's Drop
    impl writes to the file even for read-only handles, so any other process
    that opens the same file, including the prepare/finalize hooks, changes
    the mtime and invalidates this process's in-memory page table.

    Does nothing when no graph path is set or the file cannot be stat'ed.
    """
    if not _graph_path:
        return
    try:
        on_disk_mtime = os.path.getmtime(_graph_path)
    except OSError:
        return
    if on_disk_mtime == _db_mtime:
        return
    _open_db_at(_graph_path)
642
+
643
+
644
def get_db() -> MiniGrafDb:
    """Return the module-wide database handle, opening it first if needed.

    When no handle is held, the database is opened at ``_graph_path`` if one
    is recorded, otherwise at the path from ``_get_graph_path()``.
    """
    if _db is not None:
        return _db
    _open_db_at(_graph_path or _get_graph_path())
    return _db
653
+
654
+
655
+ # ---------------------------------------------------------------------------
656
+ # Result parsing
657
+ # ---------------------------------------------------------------------------
658
+
659
def _parse_query_result(raw_json: str) -> Dict[str, Any]:
    """Parse the JSON text returned by ``MiniGrafDb.execute()`` for a query.

    Args:
        raw_json: JSON text expected to encode an object.

    Returns:
        ``{"ok": True, "results": [...]}`` on success (``results`` defaults
        to an empty list when the key is absent), or
        ``{"ok": False, "error": ...}`` when the text is not valid JSON or
        does not decode to a JSON object. The error includes up to the first
        200 characters of the raw text.
    """
    try:
        data = json.loads(raw_json)
    except json.JSONDecodeError as e:
        return {"ok": False, "error": f"Unexpected result format: {e} — raw: {raw_json[:200]}"}
    # Valid JSON that is not an object (list, null, number, ...) has no
    # .get(); report it instead of raising AttributeError.
    if not isinstance(data, dict):
        return {
            "ok": False,
            "error": (
                f"Unexpected result format: expected a JSON object, got "
                f"{type(data).__name__} — raw: {raw_json[:200]}"
            ),
        }
    return {"ok": True, "results": data.get("results", [])}
666
+
667
+
668
def _parse_tx_result(raw_json: str) -> Dict[str, Any]:
    """Parse the JSON text returned by ``MiniGrafDb.execute()`` for a
    transact or retract command.

    Args:
        raw_json: JSON text expected to encode an object.

    Returns:
        ``{"ok": True, "tx": "<id>"}`` on success, where the id is the
        stringified ``tx`` value or ``"unknown"`` when the key is absent; or
        ``{"ok": False, "error": ...}`` when the text is not valid JSON or
        does not decode to a JSON object. The error includes up to the first
        200 characters of the raw text.
    """
    try:
        data = json.loads(raw_json)
    except json.JSONDecodeError as e:
        return {"ok": False, "error": f"Unexpected result format: {e} — raw: {raw_json[:200]}"}
    # Valid JSON that is not an object (list, null, number, ...) has no
    # .get(); report it instead of raising AttributeError.
    if not isinstance(data, dict):
        return {
            "ok": False,
            "error": (
                f"Unexpected result format: expected a JSON object, got "
                f"{type(data).__name__} — raw: {raw_json[:200]}"
            ),
        }
    return {"ok": True, "tx": str(data.get("tx", "unknown"))}
675
+
676
+
677
+ # ---------------------------------------------------------------------------
678
+ # Explicit agent tool handlers
679
+ # ---------------------------------------------------------------------------
680
+
681
def handle_minigraf_query(datalog: str) -> Dict[str, Any]:
    """Run ``datalog`` wrapped in ``(query ...)`` against the graph.

    Returns:
        The parsed result from ``_parse_query_result``, or
        ``{"ok": False, "error": ...}`` when the database raises
        ``MiniGrafError``.
    """
    db = get_db()
    try:
        return _parse_query_result(db.execute(f"(query {datalog})"))
    except MiniGrafError as exc:
        return {"ok": False, "error": str(exc)}
689
+
690
+
691
def handle_minigraf_transact(facts: str, reason: str) -> Dict[str, Any]:
    """Transact ``facts`` into the graph; a non-blank ``reason`` is required.

    Steps, in order:
      1. Reject a missing or blank ``reason``.
      2. Parse ``facts`` with ``_parse_transact_facts``; if anything was
         parsed, validate it with ``_validate_facts`` and reject on
         violations. Per the schema design, only string-valued triples are
         validated; keyword-valued triples (relationship edges such as
         ``[:service/auth :calls :component/jwt]``) pass through unchecked.
      3. Refresh a stale handle, then execute the transact command with
         ``:valid-from`` set to the value of ``_now_utc_ms()``, checkpoint,
         and record the new file mtime.
      4. On a successfully parsed result, attach ``reason`` and invalidate
         ``_index_cache``.

    Returns:
        ``{"ok": True, "tx": ..., "reason": ...}`` on success, otherwise
        ``{"ok": False, "error": ...}``.
    """
    if not (reason and reason.strip()):
        return {"ok": False, "error": "reason is required for all writes"}

    parsed = _parse_transact_facts(facts)
    if parsed:
        violations = _validate_facts(parsed)
        if violations:
            return {"ok": False, "error": f"schema violations: {'; '.join(violations)}"}

    _refresh_if_stale()
    db = get_db()
    try:
        command = f'(transact {facts} {{:valid-from "{_now_utc_ms()}"}})'
        raw = db.execute(command)
        db.checkpoint()
        _update_mtime()
        outcome = _parse_tx_result(raw)
        if outcome["ok"]:
            outcome["reason"] = reason
            _index_cache.invalidate()
    except MiniGrafError as exc:
        return {"ok": False, "error": str(exc)}
    return outcome
721
+
722
+
723
def handle_minigraf_retract(facts: str, reason: str) -> Dict[str, Any]:
    """Retract ``facts`` from the graph; a non-blank ``reason`` is required.

    After refreshing a stale handle, the retract command is executed, the
    database is checkpointed and the new file mtime recorded. On a
    successfully parsed result, ``reason`` is attached and ``_index_cache``
    is invalidated.

    Returns:
        ``{"ok": True, "tx": ..., "reason": ...}`` on success, otherwise
        ``{"ok": False, "error": ...}``.
    """
    if not (reason and reason.strip()):
        return {"ok": False, "error": "reason is required for retract"}

    _refresh_if_stale()
    db = get_db()
    try:
        raw = db.execute(f"(retract {facts})")
        db.checkpoint()
        _update_mtime()
        outcome = _parse_tx_result(raw)
        if outcome["ok"]:
            outcome["reason"] = reason
            _index_cache.invalidate()
    except MiniGrafError as exc:
        return {"ok": False, "error": str(exc)}
    return outcome
740
+
741
+
742
def handle_minigraf_rule(rule: str) -> Dict[str, Any]:
    """Register a Datalog rule for use in subsequent queries.

    The rule is executed as ``(rule <rule>)``. On success that expression is
    remembered in ``_user_rules`` (once), so ``_open_db_at`` replays it
    whenever the database is reopened during this server session. To keep a
    rule across server restarts, add it to ``SESSION_RULES`` in
    mcp_server.py.

    Syntax:  [(rule-name ?arg ...) body-clause ...]
    Example: [(ancestor ?a ?d) [?a :parent ?d]]

    Returns:
        ``{"ok": True, "rule": rule}`` on success, or
        ``{"ok": False, "error": ...}`` when the database raises
        ``MiniGrafError``.
    """
    db = get_db()
    rule_expr = f"(rule {rule})"
    try:
        db.execute(rule_expr)
    except MiniGrafError as exc:
        return {"ok": False, "error": str(exc)}

    if rule_expr not in _user_rules:
        _user_rules.append(rule_expr)
    return {"ok": True, "rule": rule}
762
+
763
+
764
def handle_minigraf_report_issue(
    category: str,
    description: str,
    datalog: Optional[str] = None,
    error: Optional[str] = None,
) -> Dict[str, Any]:
    """Forward an issue report to ``report_issue.report_issue``.

    The ``report_issue`` module is imported on each call, so an import
    failure is reported like any other error.

    Returns:
        ``{"ok": True}`` when the report call returns normally, otherwise
        ``{"ok": False, "error": <exception text>}``.
    """
    try:
        from report_issue import report_issue as _send_report
        _send_report(category, description, datalog=datalog, error=error)
    except Exception as exc:
        return {"ok": False, "error": str(exc)}
    return {"ok": True}
777
+
778
+
779
def _edn_string_literal(value: str) -> str:
    """Quote ``value`` as an EDN string, escaping backslashes before quotes
    so a value containing ``\\`` cannot break out of the literal."""
    return '"' + value.replace("\\", "\\\\").replace('"', '\\"') + '"'


def _audit_retract_expr(entity_uuid: Any, entity_type: str, attr_rows: List[Any]) -> str:
    """Build the ``(retract [...])`` command for one violating entity: its
    ``:entity-type`` triple plus every string-valued attribute triple."""
    triples = [f'[#uuid "{entity_uuid}" :entity-type :type/{entity_type}]']
    triples.extend(
        f'[#uuid "{entity_uuid}" {attr} {_edn_string_literal(value)}]'
        for attr, value in attr_rows
        if isinstance(value, str)
    )
    return f"(retract [{' '.join(triples)}])"


def handle_minigraf_audit(as_of: Optional[int] = None) -> Dict[str, Any]:
    """Audit graph entities against ``MINIGRAF_SCHEMA``.

    Current state (``as_of=None``): validates every entity of every schema
    type and retracts those with violations.
    Point-in-time (``as_of=N``): adds ``:as-of N`` to the queries and only
    reports violations; nothing is retracted.

    Ported from Schema.audit_as_of() in minigraf-examples minigraf-schema crate.

    Args:
        as_of: Optional transaction point to audit at.

    Returns:
        ``{"ok": True, "audited": int, "retracted": int, "violations":
        [{"entity": ..., "detail": ...}, ...]}``. Entities are identified by
        their stored ``:ident`` value, or by UUID when no string ``:ident``
        exists.
    """
    _refresh_if_stale()
    db = get_db()
    audited = 0
    retracted = 0
    all_violations: List[Dict[str, Any]] = []

    as_of_clause = f":as-of {as_of} " if as_of is not None else ""

    for entity_type in MINIGRAF_SCHEMA:
        # Step 1: find every entity of this type.
        type_query = (
            f"[:find ?e {as_of_clause}"
            f":where [?e :entity-type :type/{entity_type}]]"
        )
        try:
            type_rows = handle_minigraf_query(type_query).get("results", [])
        except Exception:
            continue

        for row in type_rows:
            if not row:
                continue
            entity_uuid = row[0]
            audited += 1

            # Step 2: fetch all attributes, addressing the entity with a
            # #uuid tagged literal so no keyword-to-UUID derivation is needed.
            attr_query = (
                f'[:find ?a ?v {as_of_clause}'
                f':where [#uuid "{entity_uuid}" ?a ?v]]'
            )
            try:
                attr_rows = handle_minigraf_query(attr_query).get("results", [])
            except Exception:
                continue

            # Prefer the stored :ident for reporting; fall back to the UUID.
            kw_ident = next(
                (v for a, v in attr_rows if a == ":ident" and isinstance(v, str)),
                entity_uuid,
            )

            # System attributes are not subject to schema validation.
            attr_facts = [
                {
                    "entity": kw_ident,
                    "entity_type": entity_type,
                    "attribute": a,
                    "value": v,
                }
                for a, v in attr_rows
                if a not in _SYSTEM_ATTRS
            ]
            if not attr_facts:
                # Validate a placeholder so an attribute-less entity is still
                # checked against its type's schema.
                attr_facts = [{"entity": kw_ident, "entity_type": entity_type,
                               "attribute": ":__no_attributes__", "value": ""}]

            violations = _validate_facts(attr_facts)
            if not violations:
                continue
            all_violations.extend(
                {"entity": kw_ident, "detail": detail} for detail in violations
            )

            if as_of is not None:
                continue  # point-in-time audits are report-only

            # Retract by #uuid; works even when the keyword ident is unknown.
            try:
                db.execute(_audit_retract_expr(entity_uuid, entity_type, attr_rows))
                db.checkpoint()
                _update_mtime()
                retracted += 1
                # Keep the index cache consistent with the graph, as the
                # transact/retract handlers do after a write.
                _index_cache.invalidate()
            except Exception:
                # Best-effort: a failed retraction leaves the violation
                # reported but does not abort the rest of the audit.
                pass

    return {
        "ok": True,
        "audited": audited,
        "retracted": retracted,
        "violations": all_violations,
    }
882
+
883
+
884
+ # ---------------------------------------------------------------------------
885
+ # memory_prepare_turn
886
+ # ---------------------------------------------------------------------------
887
+
888
# Common English function words. The source text repeats a few words; the
# frozenset collapses them.
_STOP_WORDS = frozenset(
    " ".join((
        "a an the is are was were be been being have has had do does did will would could should",
        "may might shall can need dare ought used to am i we you he she it they what which who",
        "this that these those my our your his her its their about above after all also and as at",
        "before but by for from if in into just me more most no not of on only or other our out",
        "same so than then there they through to too under up us very via was we what when where",
        "which while who why with",
    )).split()
)

_MIN_ENTITY_LEN = 4
898
+
899
+
900
def _canonical_ident(entity_type: str, value: str) -> str:
    """Build a Minigraf keyword ident of the form ``:<entity_type>/<slug>``.

    The slug is ``value`` lowercased, with every character outside
    ``[a-z0-9-]`` turned into a hyphen, runs of hyphens collapsed to one,
    and hyphens trimmed from both ends. ``entity_type`` is used verbatim.
    """
    lowered = value.lower()
    hyphenated = re.sub(r"[^a-z0-9-]", "-", lowered)
    collapsed = re.sub(r"-+", "-", hyphenated)
    slug = collapsed.strip("-")
    return ":" + entity_type + "/" + slug
910
+
911
+
912
def _resolve_module_import(import_name: str, file_entities: Dict[str, List[str]]) -> str:
    """Map an imported module name onto the ident of a known module file.

    Lookup order:

    1. For each source root in ("src", "lib", repository root), the file
       ``<root>/<import_name>.rs`` and then ``<root>/<import_name>/mod.rs``.
       Checking these exact paths first means a nested file such as
       ``src/graph/storage.rs`` is not picked for a top-level ``storage``
       import.
    2. Any known path whose file stem is ``mod`` and whose parent directory
       is named ``import_name`` (first match in ``file_entities`` order).
    3. Otherwise a plain ``:module/<slug>`` ident built from the name alone,
       so names with no matching file still resolve to something.

    Only the keys of ``file_entities`` (file paths) are consulted.
    """
    for root in ("src", "lib", ""):
        base = root + "/" if root else ""
        for candidate in (f"{base}{import_name}.rs", f"{base}{import_name}/mod.rs"):
            if candidate in file_entities:
                return _code_ident("module", candidate)

    # Broader search: any "<import_name>/mod.<ext>" path, at any depth.
    for known_path in file_entities:
        location = Path(known_path)
        if location.parent.name == import_name and location.stem == "mod":
            return _code_ident("module", known_path)

    return _canonical_ident("module", import_name)
941
+
942
+
943
def _code_ident(entity_type: str, file_path: str, name: Optional[str] = None) -> str:
    """Return the canonical ident for a file, or for a named entity in it.

    With a truthy ``name`` the slug source is ``"<file_path>::<name>"``, so
    the name lands after the file extension and is less likely to be
    confused with a file whose path merely ends in that name. The separator
    is itself slugged to hyphens, so this is best-effort: collisions remain
    possible for unusual path/name pairs.
    """
    slug_source = f"{file_path}::{name}" if name else file_path
    return _canonical_ident(entity_type, slug_source)
958
+
959
+
960
+ # ---------------------------------------------------------------------------
961
+ # Git helpers
962
+ # ---------------------------------------------------------------------------
963
+
964
+
965
def _git_commits(
    repo_path: str,
    watermark_hash: Optional[str],
    branch: str = "HEAD",
) -> List[tuple]:
    """Return (hash, ts_iso, author_email, subject) tuples, oldest first.

    Args:
        repo_path: working directory in which ``git log`` is run.
        watermark_hash: when given, only commits in ``watermark_hash..branch``
            are listed; otherwise the full history of ``branch``.
        branch: revision to list commits from.

    Returns:
        One tuple per commit in ``git log --reverse`` order. ``ts_iso`` is the
        author timestamp rendered as ``YYYY-MM-DDTHH:MM:SSZ`` in UTC.

    Raises:
        subprocess.CalledProcessError: if ``git log`` exits non-zero.
    """
    range_spec = f"{watermark_hash}..{branch}" if watermark_hash else branch
    result = _subprocess.run(
        ["git", "log", "--reverse", "--format=%H %at %ae %s", range_spec],
        cwd=repo_path, capture_output=True, text=True, check=True,
    )
    commits = []
    for line in result.stdout.strip().splitlines():
        if not line.strip():
            continue
        parts = line.split(" ", 3)
        hash_ = parts[0]
        ts_unix = int(parts[1])
        # strip() on the whole output can remove the trailing separators of
        # the final line, so both author and subject may be absent there.
        author = parts[2] if len(parts) > 2 else ""
        subject = parts[3] if len(parts) > 3 else ""
        # datetime.timezone.utc (not the 3.11-only datetime.UTC alias) keeps
        # this working on older interpreters and matches _now_utc_ms.
        ts_iso = datetime.datetime.fromtimestamp(
            ts_unix, datetime.timezone.utc
        ).strftime("%Y-%m-%dT%H:%M:%SZ")
        commits.append((hash_, ts_iso, author, subject))
    return commits
988
+
989
+
990
def _git_changed_files(repo_path: str, commit_hash: str) -> List[tuple]:
    """List the files touched by one commit as (status_char, path) pairs.

    Runs ``git diff-tree --name-status`` (with ``--root`` so a root commit
    is listed too). ``status_char`` is the first character of git's status
    field; ``path`` is everything after the first tab on the line. Lines
    without a tab are ignored.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero.
    """
    argv = ["git", "diff-tree", "--no-commit-id", "-r", "--name-status", "--root", commit_hash]
    completed = _subprocess.run(
        argv, cwd=repo_path, capture_output=True, text=True, check=True,
    )
    changed = []
    for entry in completed.stdout.strip().splitlines():
        if not entry.strip():
            continue
        status, tab, path = entry.partition("\t")
        if tab:
            # Keep only the leading letter of the status field.
            changed.append((status[0], path))
    return changed
1005
+
1006
+
1007
def _edn_escape(s: str) -> str:
    """Backslash-escape ``\\`` and ``"`` so ``s`` fits inside an EDN "..." literal."""
    # A single translate pass is equivalent to escaping backslashes first and
    # quotes second: each source character maps independently.
    return s.translate({ord("\\"): "\\\\", ord('"'): '\\"'})
1010
+
1011
+
1012
def _git_file_content(repo_path: str, commit_hash: str, file_path: str) -> bytes:
    """Read ``file_path`` as it existed at ``commit_hash`` via ``git show``.

    Returns the undecoded stdout bytes.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero.
    """
    blob_spec = f"{commit_hash}:{file_path}"
    completed = _subprocess.run(
        ["git", "show", blob_spec],
        cwd=repo_path,
        capture_output=True,
        check=True,
    )
    return completed.stdout
1019
+
1020
+
1021
def _git_parent_hashes(repo_path: str, commit_hash: str) -> List[str]:
    """Return the parent hashes of ``commit_hash``; ``[]`` when it has none.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero.
    """
    completed = _subprocess.run(
        ["git", "log", "-1", "--format=%P", commit_hash],
        cwd=repo_path, capture_output=True, text=True, check=True,
    )
    # %P prints the parents separated by spaces; whitespace-only output
    # splits to an empty list.
    return completed.stdout.split()
1029
+
1030
+
1031
def _git_tags(repo_path: str) -> List[tuple]:
    """Enumerate tags as (tag_name, commit_hash, date_iso) tuples.

    ``commit_hash`` is the dereferenced object (``%(*objectname)``) when git
    reports one, otherwise the tag's own object hash. ``date_iso`` is git's
    ``creatordate`` in strict ISO form, or ``""`` when the field is missing.
    Tags are listed in ``version:refname`` sort order.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero.
    """
    format_arg = "--format=%(refname:short)\t%(*objectname)\t%(objectname)\t%(creatordate:iso-strict)"
    completed = _subprocess.run(
        ["git", "tag", "-l", "--sort=version:refname", format_arg],
        cwd=repo_path, capture_output=True, text=True, check=True,
    )
    found = []
    for entry in completed.stdout.strip().splitlines():
        if not entry.strip():
            continue
        fields = entry.split("\t", 3)
        if len(fields) < 3:
            continue
        peeled = fields[1].strip()  # empty when git has nothing to dereference
        direct = fields[2].strip()
        created = fields[3].strip() if len(fields) > 3 else ""
        target = peeled or direct
        if target:
            found.append((fields[0], target, created))
    return found
1059
+
1060
+
1061
+ # ---------------------------------------------------------------------------
1062
+ # Bi-temporal write helpers
1063
+ # ---------------------------------------------------------------------------
1064
+
1065
+
1066
def _build_close_triples(
    ident: str,
    description: str,
    module_ident: str,
) -> List[str]:
    """Assemble the triple strings used to close an entity's valid window.

    Always yields the entity's ``:ident`` and ``:description`` triples (string
    values EDN-escaped). When the entity is not the module itself, the
    ``[module :contains entity]`` edge is appended as a third triple.
    """
    ident_fact = f'[{ident} :ident "{_edn_escape(ident)}"]'
    description_fact = f'[{ident} :description "{_edn_escape(description)}"]'
    if ident == module_ident:
        # A module is not contained by itself; there is no edge to close.
        return [ident_fact, description_fact]
    return [ident_fact, description_fact, f"[{module_ident} :contains {ident}]"]
1084
+
1085
+
1086
def _ingest_transact(
    db: Any,
    triples: List[str],
    commit_ts_iso: str,
    reason: str,
) -> None:
    """Write ``triples`` in one transact whose :valid-from is the commit time.

    Does nothing for an empty ``triples``. ``reason`` is accepted for
    signature parity with the other ingest helpers but is not used here.
    """
    if triples:
        batch = " ".join(triples)
        db.execute(f'(transact [{batch}] {{:valid-from "{commit_ts_iso}"}})')
1097
+
1098
+
1099
def _ingest_close(
    db: Any,
    triples: List[str],
    original_ts_iso: str,
    commit_ts_iso: str,
    reason: str,
) -> None:
    """Replace open-ended facts with ones bounded by an explicit valid window.

    Steps, skipped entirely when ``triples`` is empty:

    1. Retract every triple individually. Failures are ignored on purpose:
       the original assertion may not exist. Per the original author's note,
       retracts are issued one at a time to avoid collisions between several
       ``:contains`` retracts in a single batch.
    2. Re-assert all triples in one transact carrying
       ``:valid-from original_ts_iso`` and ``:valid-to commit_ts_iso``.

    ``reason`` is accepted but not used by this function.
    """
    if not triples:
        return

    for fact in triples:
        try:
            db.execute(f"(retract [{fact}])")
        except Exception:
            # Best-effort: nothing to retract is not an error here.
            continue

    window = f'{{:valid-from "{original_ts_iso}" :valid-to "{commit_ts_iso}"}}'
    batch = " ".join(triples)
    db.execute(f"(transact [{batch}] {window})")
1130
+
1131
+
1132
def _watermark_query(db: Any) -> Optional[str]:
    """Look up the ``:hash`` stored on ``:ingestion/watermark``.

    Returns the first column of the first result row, or ``None`` when the
    query yields no rows.
    """
    response = json.loads(
        db.execute("(query [:find ?h :where [:ingestion/watermark :hash ?h]])")
    )
    rows = response.get("results", [])
    if not rows:
        return None
    return rows[0][0]
1137
+
1138
+
1139
def _total_ingested_query(db: Any) -> int:
    """Return the cumulative number of commits ingested across all runs, or 0.

    The query uses ``:any-valid-time``, and _last_run_write asserts a new
    ``:total-ingested`` value on every run without retracting the previous
    one, so more than one row may come back. The count only ever grows, so
    the largest value is the current total; picking the first row could
    return a stale number depending on result order.
    """
    raw = db.execute(
        "(query [:find ?n :any-valid-time :where [:ingestion/last-run-at :total-ingested ?n]])"
    )
    rows = json.loads(raw).get("results", [])
    return max((int(row[0]) for row in rows), default=0)
1144
+
1145
+
1146
def _watermark_update(db: Any, commit_hash: str, commit_ts_iso: str, reason: str) -> None:
    """Point ``:ingestion/watermark`` at ``commit_hash``.

    Any hash currently stored is retracted first, then the watermark entity
    (type tag, ident, description and the new hash) is transacted with
    ``:valid-from commit_ts_iso``. ``reason`` is accepted but not used.
    """
    previous = _watermark_query(db)
    if previous:
        db.execute(f'(retract [[:ingestion/watermark :hash "{previous}"]])')

    subject = ":ingestion/watermark"
    facts = " ".join(
        (
            f"[{subject} :entity-type :type/ingestion]",
            f'[{subject} :ident "{subject}"]',
            f'[{subject} :description "git ingestion watermark"]',
            f'[{subject} :hash "{commit_hash}"]',
        )
    )
    db.execute(f'(transact [{facts}] {{:valid-from "{commit_ts_iso}"}})')
1158
+
1159
+
1160
def _last_run_write(db: Any, commit_hash: str, run_at: str, total_ingested: int) -> None:
    """Store run metadata on the ``:ingestion/last-run-at`` entity.

    Writes, in a single transact with no temporal options: the entity's type
    tag, ident and description, the run timestamp ``run_at``, the last commit
    hash, and ``total_ingested`` as an unquoted value.
    """
    subject = ":ingestion/last-run-at"
    facts = [
        f"[{subject} :entity-type :type/ingestion]",
        f'[{subject} :ident "{subject}"]',
        f'[{subject} :description "last ingestion run timestamp"]',
        f'[{subject} :last-run-at "{run_at}"]',
        f'[{subject} :last-commit "{commit_hash}"]',
        f"[{subject} :total-ingested {total_ingested}]",
    ]
    db.execute(f"(transact [{' '.join(facts)}])")
1170
+
1171
+
1172
+ # System attributes written by _transact_extracted_facts alongside domain attributes.
1173
+ # They are invisible to schema validation and filtered from attr_facts in minigraf_audit.
1174
_SYSTEM_ATTRS: frozenset = frozenset((":entity-type", ":ident"))

# Closed-world schema, keyed by entity type. Every type requires a string
# :description; the per-type table below lists the optional attributes and
# their expected Python value types. Keyword-valued edges (:contains,
# :depends-on, :calls, :parent, ...) are validated as strings.
MINIGRAF_SCHEMA: Dict[str, Dict[str, Dict[str, type]]] = {
    entity_type: {"required": {":description": str}, "optional": optional}
    for entity_type, optional in (
        ("decision", {":rationale": str, ":date": str, ":alias": str}),
        ("preference", {":rationale": str, ":alias": str}),
        ("constraint", {":rationale": str, ":alias": str}),
        ("dependency", {":rationale": str, ":alias": str}),
        (
            "module",
            {
                ":path": str, ":alias": str,
                # graph edges
                ":contains": str, ":depends-on": str, ":calls": str,
                # commit cross-references
                ":introduced-by": str, ":modified-in": str,
            },
        ),
        (
            "function",
            {":file": str, ":alias": str, ":introduced-by": str, ":modified-in": str},
        ),
        (
            "class",
            {":file": str, ":alias": str, ":introduced-by": str, ":modified-in": str},
        ),
        (
            "ingestion",
            {
                ":hash": str, ":alias": str, ":last-run-at": str,
                ":last-commit": str, ":total-ingested": int,
            },
        ),
        (
            "commit",
            {
                ":hash": str, ":author": str, ":subject": str, ":date": str,
                ":alias": str,
                # parent commit reference
                ":parent": str,
            },
        ),
    )
}
1230
+
1231
+
1232
def _validate_facts(facts: List[Dict[str, Any]]) -> List[str]:
    """Check fact dicts against MINIGRAF_SCHEMA and describe every violation.

    Facts are grouped per entity so that required attributes are judged
    across all of an entity's facts. Facts whose attribute is in
    _SYSTEM_ATTRS are ignored entirely. The check is closed-world: an entity
    type absent from the schema, or an attribute the type does not list, is
    a violation. For each entity the messages appear in this order: required
    attribute problems, optional attribute type problems, unknown attributes.

    Performs no database access. Returns an empty list when everything
    conforms.
    """

    def wrong_type(entity: str, attr: str, expected: type, actual: Any) -> str:
        return (
            f"entity '{entity}' attribute '{attr}' has wrong type "
            f"(expected {expected.__name__}, got {type(actual).__name__})"
        )

    attrs_by_entity: Dict[str, Dict[str, Any]] = {}
    type_by_entity: Dict[str, str] = {}
    for fact in facts:
        attribute = fact.get("attribute", "")
        if attribute in _SYSTEM_ATTRS:
            continue  # internal tags are not subject to the schema
        entity = fact.get("entity", "")
        attrs_by_entity.setdefault(entity, {})[attribute] = fact.get("value")
        declared_type = fact.get("entity_type", "")
        if declared_type:
            type_by_entity[entity] = declared_type

    problems: List[str] = []
    for entity, attrs in attrs_by_entity.items():
        entity_type = type_by_entity.get(entity, "")
        if entity_type not in MINIGRAF_SCHEMA:
            problems.append(
                f"entity '{entity}' has unknown type '{entity_type}' — "
                f"allowed: {list(MINIGRAF_SCHEMA)}"
            )
            continue

        spec = MINIGRAF_SCHEMA[entity_type]
        required, optional = spec["required"], spec["optional"]
        allowed = set(required) | set(optional)

        # Required attributes: must exist and carry the expected type.
        for attr, expected in required.items():
            if attr not in attrs:
                problems.append(f"entity '{entity}' missing required attribute '{attr}'")
                continue
            if not isinstance(attrs[attr], expected):
                problems.append(wrong_type(entity, attr, expected, attrs[attr]))

        # Optional attributes: only type-checked when present.
        problems.extend(
            wrong_type(entity, attr, optional[attr], value)
            for attr, value in attrs.items()
            if attr in optional and not isinstance(value, optional[attr])
        )

        # Anything the schema does not list for this type.
        problems.extend(
            f"entity '{entity}' has unknown attribute '{attr}' — "
            f"allowed: {sorted(allowed)}"
            for attr in attrs
            if attr not in allowed
        )

    return problems
1301
+
1302
+
1303
def _parse_transact_facts(facts_str: str) -> List[Dict[str, Any]]:
    """Parse a Datalog transact string into fact dicts for schema validation.

    Only triples of the form ``[:entity :attribute "non-empty string"]`` are
    captured. Keyword-valued triples such as ``:type/decision`` tags are
    skipped because they are internal type tags rather than user-authored
    facts.

    The string value may contain backslash escapes (``\\"`` and ``\\\\``),
    which is what _edn_escape produces. Matching stops at the first
    *unescaped* quote, and those two escapes are undone in the returned
    value. Previously any triple whose value contained an escaped quote
    failed to match and was silently left out of validation.

    Returns:
        One dict per matched triple with keys ``entity``, ``entity_type``
        (the keyword namespace before ``/``, or ``""`` when there is none),
        ``attribute`` and ``value``.
    """
    # Value group: one or more of (any char except quote/backslash) or
    # (a backslash followed by any char).
    pattern = r'\[(\:[^\s\]]+)\s+(\:[^\s\]]+)\s+"((?:[^"\\]|\\.)+)"\]'
    result = []
    for match in re.finditer(pattern, facts_str, re.DOTALL):
        entity, attribute, raw_value = match.groups()
        value = re.sub(r'\\(["\\])', r"\1", raw_value)
        entity_type = entity.split("/")[0].lstrip(":") if "/" in entity else ""
        result.append({
            "entity": entity,
            "entity_type": entity_type,
            "attribute": attribute,
            "value": value,
        })
    return result
1322
+
1323
+
1324
def _query_canonical_entities() -> str:
    """List known entity idents with their descriptions, one per line.

    First collects every stored ``:ident`` value, then, for at most the first
    50 rows, looks up that entity's ``:description`` by using the ident as a
    literal in a second query. Idents that are not strings starting with
    ``:`` and entities without a description are left out.

    Returns ``""`` when the ident query fails or yields nothing; a failing
    description query just drops that entity.
    """

    def describe(kw_ident: str) -> str:
        # Returns "" when there is no description or the lookup fails.
        try:
            rows = handle_minigraf_query(
                f"[:find ?desc :where [{kw_ident} :description ?desc]]"
            ).get("results", [])
            return rows[0][0] if rows else ""
        except Exception:
            return ""

    try:
        ident_rows = handle_minigraf_query(
            "[:find ?id :where [?e :ident ?id]]"
        ).get("results", [])
    except Exception:
        return ""
    if not ident_rows:
        return ""

    listing = []
    for row in ident_rows[:50]:
        kw_ident = row[0] if row else None
        if isinstance(kw_ident, str) and kw_ident.startswith(":"):
            description = describe(kw_ident)
            if description:
                listing.append(f" {kw_ident} — {description}")
    return "\n".join(listing)
1359
+
1360
+
1361
def _extract_entities(text: str) -> List[str]:
    """Pick candidate entity tokens out of a user message.

    The text is lowercased and split on whitespace; surrounding punctuation
    is trimmed from each token. Tokens shorter than _MIN_ENTITY_LEN or found
    in _STOP_WORDS are discarded. Order and duplicates are preserved.
    """
    trimmed = (token.strip(".,?!;:\"'()[]") for token in text.lower().split())
    return [
        token
        for token in trimmed
        if len(token) >= _MIN_ENTITY_LEN and token not in _STOP_WORDS
    ]
1370
+
1371
+
1372
def _format_facts(results: List[List[str]]) -> str:
    """Render result rows as text, one row per line.

    Each row becomes a space followed by its values (converted with ``str``)
    joined by `` | ``. An empty or falsy ``results`` gives ``""``.
    """
    if not results:
        return ""
    return "\n".join(" " + " | ".join(map(str, row)) for row in results)
1380
+
1381
+
1382
# Phrases that mark a message as asking about the past. "last <word>" is
# deliberately broad and also matches idioms such as "last resort"; a match
# only changes the query when the message additionally contains a date.
_HISTORICAL_SIGNALS = re.compile(
    r"\b("
    + "|".join(
        (
            r"last\s+\w+",
            "yesterday",
            "before",
            "earlier",
            r"as\s+of",
            r"at\s+the\s+time",
            r"back\s+when",
            "previously",
        )
    )
    + r")\b",
    flags=re.IGNORECASE,
)

# Explicit dates: ISO "YYYY-MM-DD" or slash-separated "D/M/YYYY"-shaped text.
_DATE_PATTERN = re.compile(
    r"\b(" + r"\d{4}-\d{2}-\d{2}" + "|" + r"\d{1,2}/\d{1,2}/\d{4}" + r")\b"
)
1392
+
1393
+
1394
def _is_historical_query(user_message: str) -> bool:
    """Report whether the message contains any _HISTORICAL_SIGNALS phrase."""
    return _HISTORICAL_SIGNALS.search(user_message) is not None
1396
+
1397
+
1398
def _now_utc_ms() -> str:
    """Format the current UTC instant as ``YYYY-MM-DDTHH:MM:SS.mmmZ``.

    Microseconds are truncated (not rounded) to three digits, e.g.
    "2026-05-02T15:44:52.184Z". The original author notes that minigraf needs
    UTC with a Z suffix, and millisecond precision so that facts written
    earlier in the same second are still found.
    """
    now = datetime.datetime.now(datetime.timezone.utc)
    millis = now.microsecond // 1000
    return f"{now:%Y-%m-%dT%H:%M:%S}.{millis:03d}Z"
1406
+
1407
+
1408
def _build_query_clauses(user_message: str) -> str:
    """Return the ``:valid-at "..."`` clause to splice into a Datalog query.

    * Historical message (see _is_historical_query) containing a valid ISO
      ``YYYY-MM-DD`` date: ``:valid-at`` is that date, which minigraf resolves
      to midnight UTC on that day.
    * Anything else: ``:valid-at`` is the current UTC timestamp with
      millisecond precision, so facts whose valid window includes "now" are
      returned, including ones transacted earlier in the same second.

    _DATE_PATTERN also recognises slash dates such as ``3/4/2024``. Those are
    not a format minigraf accepts (it takes ISO dates or UTC datetimes with a
    Z suffix) and their day/month order is ambiguous, so they are skipped, as
    are impossible calendar dates like ``2024-13-45``. The first usable ISO
    date in the message wins.
    """
    if _is_historical_query(user_message):
        for date_match in _DATE_PATTERN.finditer(user_message):
            candidate = date_match.group(1)
            try:
                datetime.date.fromisoformat(candidate)
            except ValueError:
                continue  # slash-form or non-existent date: not usable
            return f':valid-at "{candidate}"'
    return f':valid-at "{_now_utc_ms()}"'
1431
+
1432
+
1433
+ # ---------------------------------------------------------------------------
1434
+ # BM25 index — semantic retrieval primitives
1435
+ # ---------------------------------------------------------------------------
1436
+
1437
# Ident prefixes of the entity types treated as "memory" facts by FactIndex.
_MEMORY_PREFIXES = tuple(
    f":{kind}/" for kind in ("decision", "preference", "constraint", "dependency")
)
1438
+
1439
+
1440
def _tokenize(text: str) -> List[str]:
    """Lowercase ``text`` and return its maximal runs of ``[a-z0-9]``.

    Suitable for both free text and keyword idents:
        ":decision/use-redis"   -> ["decision", "use", "redis"]
        "use Redis for caching" -> ["use", "redis", "for", "caching"]
    """
    pieces = re.split(r"[^a-z0-9]+", text.lower())
    # Splitting leaves empty strings at the edges; drop them.
    return list(filter(None, pieces))
1448
+
1449
+
1450
class FactIndex:
    """Read-only BM25 index built once from a list of fact rows.

    Every row is flattened to text and tokenised into one document. Rows
    whose first element starts with one of _MEMORY_PREFIXES are flagged as
    memory facts and have their score multiplied by ``boost`` when queried.
    """

    def __init__(self, facts: List[List], boost: float = 2.0) -> None:
        """Tokenise ``facts`` and build the BM25 model.

        Rows that produce no tokens are dropped. If nothing is left, or the
        BM25 implementation is unavailable, the index stays empty and every
        query returns ``[]``.
        """
        self._boost = boost
        self._bm25 = None
        self._facts: List[List] = []
        self._is_memory: List[bool] = []
        self._docs: List[List[str]] = []

        kept_rows: List[List] = []
        kept_docs: List[List[str]] = []
        kept_flags: List[bool] = []
        for row in facts:
            doc = _tokenize(" ".join(str(cell) for cell in row))
            if not doc:
                continue
            kept_rows.append(row)
            kept_docs.append(doc)
            kept_flags.append(str(row[0]).startswith(_MEMORY_PREFIXES))

        if not kept_rows or _BM25Okapi is None:
            return

        self._facts = kept_rows
        self._is_memory = kept_flags
        self._docs = kept_docs
        self._bm25 = _BM25Okapi(self._docs)

    def query(self, text: str, top_n: int = 50) -> List[List]:
        """Return at most ``top_n`` fact rows, best score first.

        Only rows sharing at least one token with ``text`` are candidates.
        Returns ``[]`` for an empty index, a query with no tokens, or no
        overlapping rows.
        """
        if self._bm25 is None or not self._facts:
            return []
        tokens = _tokenize(text)
        if not tokens:
            return []

        raw_scores = self._bm25.get_scores(tokens).tolist()

        # Overlap is decided by token membership, not by "score > 0": BM25
        # can score a matching document negatively in a small corpus.
        wanted = set(tokens)
        matching = [i for i, doc in enumerate(self._docs) if wanted & set(doc)]
        if not matching:
            return []

        # Lift scores so the weakest match sits at 1.0 or above; multiplying
        # by the boost then always moves memory facts up, never down.
        shift = max(0.0, 1.0 - min(raw_scores[i] for i in matching))

        scored = []
        for i in matching:
            score = raw_scores[i] + shift
            if self._is_memory[i]:
                score *= self._boost
            scored.append((score, self._facts[i]))

        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [row for _, row in scored[:top_n]]
1512
+
1513
+
1514
+ class IndexCache:
1515
+ """Module-level singleton managing the live BM25 FactIndex.
1516
+
1517
+ Rebuilds asynchronously in a background thread. Serves the stale index
1518
+ during rebuilds; returns None before the first successful rebuild.
1519
+ Invalidation is idempotent while a rebuild is in progress.
1520
+ """
1521
+
1522
+ def __init__(self) -> None:
1523
+ self._current: Optional[FactIndex] = None
1524
+ self._rebuilding: bool = False
1525
+ self._lock = threading.Lock()
1526
+
1527
+ def get(self) -> Optional[FactIndex]:
1528
+ """Return the current index (may be stale or None)."""
1529
+ return self._current
1530
+
1531
+ def invalidate(self) -> None:
1532
+ """Trigger an async rebuild if one is not already running."""
1533
+ if self._rebuilding:
1534
+ return
1535
+ self._rebuilding = True
1536
+ t = threading.Thread(target=self._rebuild, daemon=True)
1537
+ t.start()
1538
+
1539
+ def _rebuild(self) -> None:
1540
+ """Fetch all currently-valid facts from the DB and swap the index."""
1541
+ try:
1542
+ db = get_db()
1543
+ boost = float(os.environ.get("MINIGRAF_MEMORY_BOOST", "2.0"))
1544
+ raw = db.execute(
1545
+ f'(query [:find ?e ?a ?v :valid-at "{_now_utc_ms()}" :where [?e ?a ?v]])'
1546
+ )
1547
+ facts = json.loads(raw).get("results", [])
1548
+ new_index = FactIndex(facts, boost=boost)
1549
+ with self._lock:
1550
+ self._current = new_index
1551
+ except Exception as e:
1552
+ print(f"[IndexCache] rebuild failed: {e}", file=sys.stderr)
1553
+ finally:
1554
+ self._rebuilding = False
1555
+
1556
+
1557
+ _index_cache = IndexCache()
1558
+
1559
+
1560
def _handle_memory_prepare_turn_heuristic(user_message: str) -> str:
    """Heuristic fallback for handle_memory_prepare_turn.

    Used when rank_bm25 is unavailable. For every candidate token taken from
    the message, runs a ``contains?`` substring query and collects the
    distinct ``[attribute, value]`` rows. If no token produces a row, falls
    back to a broad ``[entity, attribute, value]`` scan capped at
    ``MINIGRAF_PREPARE_SCAN_LIMIT`` rows (default 50).

    All queries carry the temporal clause from _build_query_clauses.

    Tokens come from user text, so they are EDN-escaped before being placed
    inside the query's string literal; a stray ``"`` or ``\\`` in a token can
    no longer terminate the literal or change the query's shape.

    Returns a "Relevant memory context:" block, or ``""`` when nothing is
    found. Query and JSON decoding errors are skipped.
    """
    db = get_db()
    scan_limit = int(os.environ.get("MINIGRAF_PREPARE_SCAN_LIMIT", "50"))
    temporal_clauses = _build_query_clauses(user_message)

    entities = _extract_entities(user_message)
    collected: List[List[str]] = []
    seen: set = set()

    for entity in entities:
        needle = _edn_escape(entity)
        try:
            raw = db.execute(
                f'(query [:find ?a ?v {temporal_clauses} :where [?e ?a ?v] (contains? ?v "{needle}")])'
            )
            data = json.loads(raw)
        except (MiniGrafError, json.JSONDecodeError):
            continue
        for row in data.get("results", []):
            key = tuple(row)
            if key not in seen:
                seen.add(key)
                collected.append(row)

    if not collected:
        # Broad fallback scan, still restricted by the temporal clause.
        try:
            raw = db.execute(
                f"(query [:find ?e ?a ?v {temporal_clauses} :where [?e ?a ?v]])"
            )
            data = json.loads(raw)
            collected = data.get("results", [])[:scan_limit]
        except (MiniGrafError, json.JSONDecodeError):
            pass

    if not collected:
        return ""

    block = _format_facts(collected)
    return f"Relevant memory context:\n{block}"
1611
+
1612
+
1613
def handle_memory_prepare_turn(user_message: str) -> str:
    """Build the memory-context block for one user message.

    With rank_bm25 installed, ranks facts from the cached FactIndex against
    the message and keeps up to ``MINIGRAF_PREPARE_SCAN_LIMIT`` rows
    (default 50). Without it, delegates to the substring-based heuristic.

    Returns a "Relevant memory context:" block for injection as
    additionalContext, or ``""`` when the index has not been built yet or
    nothing matches.
    """
    if _BM25_AVAILABLE:
        scan_limit = int(os.environ.get("MINIGRAF_PREPARE_SCAN_LIMIT", "50"))
        index = _index_cache.get()
        hits = index.query(user_message, top_n=scan_limit) if index is not None else []
        if hits:
            return f"Relevant memory context:\n{_format_facts(hits)}"
        return ""
    return _handle_memory_prepare_turn_heuristic(user_message)
1634
+
1635
+
1636
+ # ---------------------------------------------------------------------------
1637
+ # Fact extraction — heuristic strategy
1638
+ # ---------------------------------------------------------------------------
1639
+
1640
# Signal phrases, grouped by the entity type they produce. Each entry expands
# to (regex, entity_type, ":description", reason_prefix).
#
# Every regex captures exactly one token after the phrase. If an article
# follows ("depends on the auth-service"), the article is what gets captured
# and the stop-word filter in heuristic_extract then drops it, so nothing is
# extracted for that phrase. Writing "depends on auth-service" avoids this.
_SIGNAL_PATTERNS = [
    (pattern, entity_type, ":description", reason)
    for entity_type, rules in (
        (
            "decision",
            (
                (r"we(?:'ll?|\s+will)\s+use\s+([\w\-]+)", "chosen technology or approach"),
                (r"going\s+with\s+([\w\-]+)", "chosen approach"),
                (r"decided\s+(?:to\s+)?(?:use\s+)?([\w\-]+)", "decided approach"),
                (r"we\s+chose\s+([\w\-]+)", "chosen option"),
            ),
        ),
        (
            "preference",
            (
                (r"I\s+prefer\s+([\w\-]+)", "stated preference"),
                (r"I\s+don'?t\s+like\s+([\w\-]+)", "stated dislike"),
                (r"always\s+use\s+([\w\-]+)", "always-use preference"),
                (r"never\s+use\s+([\w\-]+)", "never-use preference"),
                (r"prioritize\s+([\w\-]+)", "priority preference"),
            ),
        ),
        (
            "constraint",
            (
                (r"must\s+be\s+([\w\-]+)", "hard constraint"),
                (r"can'?t\s+use\s+([\w\-]+)", "exclusion constraint"),
            ),
        ),
        (
            "dependency",
            (
                (r"depends\s+on\s+([\w\-]+)", "dependency relationship"),
                (r"requires?\s+([\w\-]+)", "required dependency"),
            ),
        ),
    )
    for pattern, reason in rules
]
1659
+
1660
+
1661
def heuristic_extract(text: str) -> List[Dict[str, str]]:
    """Turn decision-signal phrases found in ``text`` into fact dicts.

    Every pattern in _SIGNAL_PATTERNS is matched case-insensitively. The
    captured token is kept unless it is shorter than two characters, is a
    stop word, or the same (entity type, lowercased token) pair was already
    emitted.

    Each returned dict has the keys ``entity`` (canonical ident built from
    the type and token), ``entity_type``, ``attribute``, ``value`` (the token
    as written) and ``reason``.
    """
    extracted: List[Dict[str, str]] = []
    already_emitted: set = set()

    for pattern, entity_type, attribute, reason_prefix in _SIGNAL_PATTERNS:
        for hit in re.finditer(pattern, text, re.IGNORECASE):
            value = hit.group(1).strip()
            lowered = value.lower()
            if len(value) < 2 or lowered in _STOP_WORDS:
                continue
            dedup_key = (entity_type, lowered)
            if dedup_key in already_emitted:
                continue
            already_emitted.add(dedup_key)
            extracted.append(
                dict(
                    entity=_canonical_ident(entity_type, value),
                    entity_type=entity_type,
                    attribute=attribute,
                    value=value,
                    reason=f"{reason_prefix} — extracted by heuristic strategy",
                )
            )

    return extracted
1688
+
1689
+
1690
def _transact_extracted_facts(facts: List[Dict[str, str]], valid_from: Optional[str] = None) -> int:
    """
    Write extracted fact dicts to the graph and return how many were stored.

    Each fact is validated on its own with _validate_facts; a fact with any
    violation is skipped (closed-world schema). A fact whose transact raises
    MiniGrafError is skipped as well. When at least one fact was stored the
    database is checkpointed and the cached mtime is updated.

    valid_from: value used for :valid-from on every write (ISO 8601). When it
        is None or empty, the current UTC ms timestamp is taken per fact. Pass
        a past date to backdate facts (e.g. from an LLM-supplied
        '; valid-at: YYYY-MM-DD' hint).
    """
    _refresh_if_stale()
    db = get_db()
    written = 0
    for fact in facts:
        subject = fact["entity"]
        kind = fact.get("entity_type", "")
        attr = fact["attribute"]
        text = fact["value"]
        if _validate_facts([fact]):
            continue
        stamp = valid_from if valid_from else _now_utc_ms()
        # The main triple, the :entity-type tag and :ident all go into one
        # (transact [...]) so they are written as a single transaction. :ident
        # stores the keyword ident as a string so it can be displayed later
        # without knowing the entity's UUID.
        # NOTE(review): text/subject are interpolated without escaping here —
        # confirm upstream producers never yield a double quote or backslash.
        parts = [f'[{subject} {attr} "{text}"]']
        if kind:
            parts.append(f'[{subject} :entity-type :type/{kind}]')
            parts.append(f'[{subject} :ident "{subject}"]')
        joined = " ".join(parts)
        try:
            db.execute(f'(transact [{joined}] {{:valid-from "{stamp}"}})')
        except MiniGrafError:
            continue
        written += 1
    if written:
        db.checkpoint()
        _update_mtime()
    return written
1737
+
1738
+
1739
+ # ---------------------------------------------------------------------------
1740
+ # Fact extraction — llm strategy
1741
+ # ---------------------------------------------------------------------------
1742
+
1743
# Prompt template for the "llm" extraction strategy. It is rendered with
# str.format(), which fills {canonical_entities_section} and {conversation};
# any literal brace added to this text must therefore be doubled.
# The model is asked to reply with a list of Datalog triples, optionally
# preceded by a "; valid-at: YYYY-MM-DD" comment line.
# NOTE(review): the text speaks of ":valid-at", while the parsed date is handed
# to the transact step as :valid-from — confirm the wording is intentional.
_LLM_EXTRACTION_PROMPT = """You are a memory extraction assistant for a bi-temporal graph database. Review the conversation below and identify any decisions, preferences, constraints, or dependencies that should be stored in long-term memory.

Return ONLY a Datalog transact expression — a list of triples in this exact format:
[[:entity/ident :attribute "value"]
[:entity/ident :attribute "value"]]

If nothing worth storing was found, return an empty list: []

Allowed entity type prefixes: :decision/ :preference/ :constraint/ :dependency/
Canonical ident form: lowercase, hyphens only — :decision/redis not :decision/Redis_cache.
{canonical_entities_section}
Use these attributes: :description (required), :rationale (optional), :date (optional), :alias (optional).
No other attributes are valid.

IMPORTANT — entity resolution: if a reference matches an existing canonical ident or alias above,
reuse that exact ident. Only mint a new ident if the entity is genuinely new.

IMPORTANT — bi-temporality: this database is bi-temporal. Facts have both a transaction time
(when they were recorded) and a valid time (when they were true in the world). When the conversation
mentions that something was decided or true at a specific past date, note that date alongside the
fact so the caller can set :valid-at accordingly. Wrap such facts in a comment line:
; valid-at: 2024-03-15
[[:entity/ident :attribute "value"]]

For point-in-time historical queries, always use :as-of N and :valid-at "date" TOGETHER —
using only one gives a partial view.

Conversation:
{conversation}"""
1772
+
1773
+
1774
+ def _get_anthropic_client():
1775
+ """Return an Anthropic client. Raises if anthropic package or API key is missing."""
1776
+ try:
1777
+ import anthropic
1778
+ except ImportError:
1779
+ raise RuntimeError("anthropic package not installed — pip install anthropic")
1780
+ api_key = os.environ.get("ANTHROPIC_API_KEY")
1781
+ if not api_key:
1782
+ raise RuntimeError("ANTHROPIC_API_KEY not set")
1783
+ return anthropic.Anthropic(api_key=api_key)
1784
+
1785
+
1786
+ _OPENAI_MODEL_PREFIXES = ("gpt-", "o1", "o3", "o4")
1787
+
1788
+
1789
+ def _is_openai_model(model: str) -> bool:
1790
+ return any(model.startswith(p) for p in _OPENAI_MODEL_PREFIXES)
1791
+
1792
+
1793
+ def _get_openai_client():
1794
+ """Return an OpenAI client. Raises if openai package or API key is missing."""
1795
+ try:
1796
+ import openai
1797
+ except ImportError:
1798
+ raise RuntimeError("openai package not installed — pip install openai")
1799
+ api_key = os.environ.get("OPENAI_API_KEY")
1800
+ if not api_key:
1801
+ raise RuntimeError("OPENAI_API_KEY not set")
1802
+ return openai.OpenAI(api_key=api_key)
1803
+
1804
+
1805
+ def _strip_code_fences(text: str) -> str:
1806
+ """Remove markdown code fences that LLMs sometimes wrap around Datalog output.
1807
+
1808
+ Handles both ``` and ```datalog (or any language tag). Returns the inner
1809
+ content, stripped. If no fences are present, returns the input unchanged.
1810
+ """
1811
+ text = text.strip()
1812
+ if text.startswith("```"):
1813
+ # Drop the opening fence line (``` or ```datalog etc.)
1814
+ first_newline = text.find("\n")
1815
+ if first_newline != -1:
1816
+ text = text[first_newline + 1:]
1817
+ # Drop the closing fence if present
1818
+ if text.rstrip().endswith("```"):
1819
+ text = text.rstrip()[:-3]
1820
+ return text.strip()
1821
+
1822
+
1823
+ def _llm_missing_package_warning(error: str) -> str:
1824
+ """Return a user-facing install instruction when the LLM package is absent.
1825
+
1826
+ Inspects the error string from _llm_extract_and_transact and maps it to
1827
+ the correct pip install command based on the configured model.
1828
+ Returns an empty string when the error is not a missing-package error.
1829
+ """
1830
+ model = os.environ.get("MINIGRAF_LLM_MODEL", "claude-haiku-4-5-20251001")
1831
+ if "anthropic package not installed" in error:
1832
+ return (
1833
+ "ACTION REQUIRED: pip install anthropic\n"
1834
+ f" The configured model '{model}' requires the anthropic package.\n"
1835
+ " Set MINIGRAF_LLM_MODEL in .mcp.json if you want to use an OpenAI model instead."
1836
+ )
1837
+ if "openai package not installed" in error:
1838
+ return (
1839
+ "ACTION REQUIRED: pip install openai\n"
1840
+ f" The configured model '{model}' requires the openai package.\n"
1841
+ " Set MINIGRAF_LLM_MODEL in .mcp.json if you want to use an Anthropic model instead."
1842
+ )
1843
+ return ""
1844
+
1845
+
1846
def _call_llm(model: str, prompt: str) -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    The provider is chosen from the model name: names accepted by
    _is_openai_model go to the OpenAI chat-completions API, everything else to
    the Anthropic messages API. Both requests use max_tokens=1024.
    """
    chat = [{"role": "user", "content": prompt}]
    if not _is_openai_model(model):
        reply = _get_anthropic_client().messages.create(
            model=model,
            max_tokens=1024,
            messages=chat,
        )
        return reply.content[0].text
    completion = _get_openai_client().chat.completions.create(
        model=model,
        max_tokens=1024,
        messages=chat,
    )
    return completion.choices[0].message.content
1864
+
1865
+
1866
+ def _parse_valid_at_hint(raw: str):
1867
+ """Extract optional '; valid-at: YYYY-MM-DD' comment from model output.
1868
+
1869
+ Returns (valid_at, cleaned_datalog) where valid_at defaults to the current
1870
+ UTC ms timestamp if no hint is present.
1871
+ """
1872
+ valid_at = _now_utc_ms()
1873
+ kept = []
1874
+ for line in raw.splitlines():
1875
+ stripped = line.strip()
1876
+ if stripped.startswith("; valid-at:"):
1877
+ date_str = stripped[len("; valid-at:"):].strip()
1878
+ if re.match(r"^\d{4}-\d{2}-\d{2}$", date_str):
1879
+ valid_at = date_str
1880
+ else:
1881
+ kept.append(line)
1882
+ return valid_at, "\n".join(kept).strip()
1883
+
1884
+
1885
def _llm_extract_and_transact(conversation_delta: str) -> Dict[str, Any]:
    """Extract facts with a lightweight LLM and store them.

    Returns {ok, stored_count, strategy} on success. Any exception raised along
    the way is caught and reported as {ok: False, error, strategy}.
    """
    try:
        model = os.environ.get("MINIGRAF_LLM_MODEL", "claude-haiku-4-5-20251001")
        known = _query_canonical_entities()
        section = ""
        if known:
            section = (
                "\nExisting canonical entities (reuse these idents — do not invent synonyms):\n"
                + known
            )
        prompt = _LLM_EXTRACTION_PROMPT.format(
            conversation=conversation_delta,
            canonical_entities_section=section,
        )
        reply = _strip_code_fences(_call_llm(model, prompt))

        # An empty reply, or one that is empty once the hint lines are removed,
        # means there is nothing to store.
        valid_at, datalog = None, ""
        if reply and reply != "[]":
            valid_at, datalog = _parse_valid_at_hint(reply)
        if not datalog or datalog == "[]":
            return {"ok": True, "stored_count": 0, "strategy": "llm"}

        # Go through _transact_extracted_facts so every fact gets schema
        # validation and an :entity-type tag, as heuristic extraction does.
        parsed = _parse_transact_facts(datalog)
        stored_count = _transact_extracted_facts(parsed, valid_from=valid_at)
        return {"ok": True, "stored_count": stored_count, "strategy": "llm"}
    except Exception as e:
        return {"ok": False, "error": str(e), "strategy": "llm"}
1914
+
1915
+
1916
+ # ---------------------------------------------------------------------------
1917
+ # Fact extraction — agent (MCP sampling) strategy
1918
+ # ---------------------------------------------------------------------------
1919
+
1920
# Prompt template for the "agent" (MCP sampling) extraction strategy. It is
# rendered with str.format(), which fills {canonical_entities_section} and
# {conversation}; any literal brace added to this text must be doubled.
_AGENT_SAMPLING_PROMPT = """Review this conversation turn and output ONLY a Datalog transact expression for any decisions, preferences, constraints, or dependencies worth storing in long-term memory.

Allowed entity type prefixes: :decision/ :preference/ :constraint/ :dependency/
Canonical ident form: lowercase, hyphens only — :decision/redis not :decision/Redis_cache.
{canonical_entities_section}
Use these attributes: :description (required), :rationale (optional), :date (optional), :alias (optional).
No other attributes are valid. If an entity matches an existing ident or alias, reuse it exactly.

Format:
[[:entity/ident :attribute "value"]]

Return [] if nothing is worth storing.

{conversation}"""
1934
+
1935
+
1936
async def _request_agent_memory_block_async(conversation_delta: str, canonical_entities_section: str = "") -> str:
    """Ask the connected agent, via MCP sampling, for a memory block.

    Renders _AGENT_SAMPLING_PROMPT, sends it as one user message with
    max_tokens=512 through the current request's session, and returns the
    reply's text (or ``str()`` of the content when it has no ``text``).

    Raises:
        RuntimeError: if the module-level server reference has not been set.
    """
    if _server_ref is None:
        raise RuntimeError("Server reference not set")
    from mcp.types import SamplingMessage, TextContent as TC

    prompt = _AGENT_SAMPLING_PROMPT.format(
        conversation=conversation_delta,
        canonical_entities_section=canonical_entities_section,
    )
    session = _server_ref.request_context.session
    user_turn = SamplingMessage(role="user", content=TC(type="text", text=prompt))
    result = await session.create_message(messages=[user_turn], max_tokens=512)
    if hasattr(result.content, "text"):
        return result.content.text
    return str(result.content)
1950
+
1951
+
1952
async def _agent_extract_and_transact(conversation_delta: str) -> Dict[str, Any]:
    """Request a memory block from the agent via MCP sampling, then store it.

    The agent's reply is parsed into individual facts and written through
    _transact_extracted_facts, the same path the llm strategy uses. Each fact
    is therefore schema-validated and tagged with :entity-type, and
    stored_count is the number of facts actually written rather than an
    estimate derived from the reply text.

    Returns {ok, stored_count, strategy} on success. Any exception raised along
    the way is caught and reported as {ok: False, error, strategy}.
    """
    try:
        canonical = _query_canonical_entities()
        if canonical:
            canonical_entities_section = (
                "\nExisting canonical entities (reuse these idents — do not invent synonyms):\n"
                + canonical
            )
        else:
            canonical_entities_section = ""
        raw_facts = _strip_code_fences(
            await _request_agent_memory_block_async(conversation_delta, canonical_entities_section)
        )
        if not raw_facts or raw_facts == "[]":
            return {"ok": True, "stored_count": 0, "strategy": "agent"}
        valid_at, datalog = _parse_valid_at_hint(raw_facts)
        if not datalog or datalog == "[]":
            return {"ok": True, "stored_count": 0, "strategy": "agent"}
        # Model output is untrusted: never hand it to the database verbatim.
        # _transact_extracted_facts refreshes the DB handle, validates each
        # fact, checkpoints and updates the mtime itself.
        parsed = _parse_transact_facts(datalog)
        stored_count = _transact_extracted_facts(parsed, valid_from=valid_at)
        return {"ok": True, "stored_count": stored_count, "strategy": "agent"}
    except Exception as e:
        return {"ok": False, "error": str(e), "strategy": "agent"}
1979
+
1980
+
1981
+ # ---------------------------------------------------------------------------
1982
+ # memory_finalize_turn — dispatcher
1983
+ # ---------------------------------------------------------------------------
1984
+
1985
async def handle_memory_finalize_turn(conversation_delta: str) -> Dict[str, Any]:
    """
    Extract facts from conversation_delta and store them.

    The strategy comes from the MINIGRAF_EXTRACTION_STRATEGY environment
    variable (default: heuristic). "llm" falls back to the heuristic strategy
    when the LLM call fails and attaches a warning describing the failure.
    An unrecognised strategy returns {ok: False, error}.
    """
    strategy = os.environ.get("MINIGRAF_EXTRACTION_STRATEGY", "heuristic")

    if strategy == "agent":
        return await _agent_extract_and_transact(conversation_delta)

    if strategy == "heuristic":
        stored = _transact_extracted_facts(heuristic_extract(conversation_delta))
        return {"ok": True, "stored_count": stored, "strategy": "heuristic"}

    if strategy != "llm":
        return {"ok": False, "error": f"Unknown strategy: {strategy}"}

    outcome = _llm_extract_and_transact(conversation_delta)
    if outcome["ok"]:
        return outcome

    # LLM failed — fall back to heuristic and surface a warning so the user
    # can see what went wrong (e.g. missing package, bad API key).
    failure = outcome.get("error", "")
    note = _llm_missing_package_warning(failure)
    stored = _transact_extracted_facts(heuristic_extract(conversation_delta))
    fallback: Dict[str, Any] = {
        "ok": True,
        "stored_count": stored,
        "strategy": "heuristic (llm fallback)",
    }
    if not note and failure:
        note = f"LLM extraction failed ({failure}); fell back to heuristic."
    if note:
        fallback["warning"] = note
    return fallback
2022
+
2023
+
2024
def _build_code_triples(
    file_path: str,
    extracted: Dict[str, List[str]],
    commit_ts_iso: str,
    entity_valid_from: Dict[str, str],
    entity_descriptions: Dict[str, str],
    file_entities: Dict[str, List[str]],
    commit_ident: str,
) -> List[str]:
    """Build the Datalog triple strings for one file's extracted code entities.

    An entity seen for the first time (its ident is absent from
    entity_valid_from) gets its stable attributes — :entity-type, :ident,
    :description, :path (module) or :file (function/class), :introduced-by,
    plus a [module :contains child] edge for functions and classes — and is
    recorded in entity_valid_from, entity_descriptions and file_entities.
    An entity that is already known only gets a :modified-in edge, which keeps
    re-assertions of the same attribute from multiplying bi-temporal facts.

    :depends-on edges are not produced here; _run_ingestion writes them as a
    file's imports change.

    Triples are returned in the order: module, functions, classes.
    """
    module_ident = _code_ident("module", file_path)
    module_is_known = module_ident in entity_valid_from
    # Every ident belonging to this file is tracked for deletion cleanup.
    tracked = file_entities.setdefault(file_path, [])

    def remember(ident: str, label: str) -> None:
        # Record a freshly introduced entity in the three bookkeeping maps.
        if ident not in tracked:
            tracked.append(ident)
        entity_valid_from[ident] = commit_ts_iso
        entity_descriptions[ident] = label

    emitted: List[str] = []

    if module_is_known:
        emitted.append(f"[{module_ident} :modified-in {commit_ident}]")
    else:
        emitted.extend([
            f"[{module_ident} :entity-type :type/module]",
            f'[{module_ident} :ident "{module_ident}"]',
            f'[{module_ident} :description "{_edn_escape(file_path)}"]',
            f'[{module_ident} :path "{_edn_escape(file_path)}"]',
            f"[{module_ident} :introduced-by {commit_ident}]",
        ])
        remember(module_ident, file_path)

    # Functions and classes are handled identically apart from the type name.
    for kind, bucket in (("function", "functions"), ("class", "classes")):
        for name in extracted[bucket]:
            ident = _code_ident(kind, file_path, name)
            if ident in entity_valid_from:
                emitted.append(f"[{ident} :modified-in {commit_ident}]")
                continue
            emitted.extend([
                f"[{ident} :entity-type :type/{kind}]",
                f'[{ident} :ident "{ident}"]',
                f'[{ident} :description "{_edn_escape(name)}"]',
                f'[{ident} :file "{_edn_escape(file_path)}"]',
                f"[{module_ident} :contains {ident}]",
                f"[{ident} :introduced-by {commit_ident}]",
            ])
            remember(ident, name)

    return emitted
2110
+
2111
+
2112
+ def _preload_known_entities(db: Any, repo_path: str) -> tuple:
2113
+ """Load all existing module/function/class idents from the DB, and pre-seed
2114
+ file_entities with all currently tracked files in the repo.
2115
+
2116
+ Pre-seeding from `git ls-files` ensures that _resolve_module_import can
2117
+ find any module file even when processing early commits — before those files
2118
+ have been introduced in the chronological commit walk.
2119
+
2120
+ Returns (entity_valid_from, entity_descriptions, file_entities).
2121
+ entity_valid_from maps ident → git commit timestamp of first introduction.
2122
+ entity_descriptions maps ident → human-readable name (function/class/file).
2123
+ """
2124
+ entity_valid_from: Dict[str, str] = {}
2125
+ entity_descriptions: Dict[str, str] = {}
2126
+ file_entities: Dict[str, List[str]] = {}
2127
+
2128
+ # Pre-seed file_entities with all files currently in the repo
2129
+ try:
2130
+ result = _subprocess.run(
2131
+ ["git", "ls-files", "--full-name"],
2132
+ cwd=repo_path, capture_output=True, text=True, timeout=30,
2133
+ )
2134
+ for filepath in result.stdout.strip().splitlines():
2135
+ if Path(filepath).suffix.lower() in _EXT_TO_LANG:
2136
+ file_entities.setdefault(filepath, [])
2137
+ except Exception:
2138
+ pass
2139
+
2140
+ for entity_type in ("module", "function", "class"):
2141
+ path_attr = "path" if entity_type == "module" else "file"
2142
+ try:
2143
+ raw = db.execute(
2144
+ f'(query [:find ?ident ?path ?desc ?date '
2145
+ f':where [?e :entity-type :type/{entity_type}] '
2146
+ f'[?e :ident ?ident] '
2147
+ f'[?e :{path_attr} ?path] '
2148
+ f'[?e :description ?desc] '
2149
+ f'[?e :introduced-by ?c] '
2150
+ f'[?c :date ?date]])'
2151
+ )
2152
+ rows = json.loads(raw).get("results", [])
2153
+ for ident, path, desc, date in rows:
2154
+ entity_valid_from[ident] = date
2155
+ entity_descriptions[ident] = desc
2156
+ file_entities.setdefault(path, [])
2157
+ if ident not in file_entities[path]:
2158
+ file_entities[path].append(ident)
2159
+ except Exception:
2160
+ pass
2161
+
2162
+ return entity_valid_from, entity_descriptions, file_entities
2163
+
2164
+
2165
def _ingest_tags(db: Any, repo_path: str, run_ts_iso: str) -> None:
    """Ingest git tags as :tag/<slug> entities with :tagged-commit references.

    Called once after the commit walk. All tags are re-ingested on every run
    so newly created tags pointing to previously ingested commits are picked up.

    The slug is the lower-cased tag name with each run of characters outside
    [a-z0-9] replaced by a hyphen. A tag whose name contains no such character
    has an empty slug and is skipped, since ":tag/" is not a usable ident.

    Failures are non-fatal: if the tags cannot be listed nothing is ingested,
    and a failure on one tag does not stop the others.
    """
    try:
        tags = _git_tags(repo_path)
    except Exception:
        return  # non-fatal

    for tag_name, commit_hash, date_raw in tags:
        try:
            slug = re.sub(r"[^a-z0-9]+", "-", tag_name.lower()).strip("-")
            if not slug:
                continue
            tag_ident = f":tag/{slug}"
            commit_ident = f":commit/{commit_hash[:12]}"
            triples = [
                f"[{tag_ident} :entity-type :type/tag]",
                f'[{tag_ident} :name "{_edn_escape(tag_name)}"]',
                f'[{tag_ident} :ident "{tag_ident}"]',
                f'[{tag_ident} :description "git tag {_edn_escape(tag_name)}"]',
                f"[{tag_ident} :tagged-commit {commit_ident}]",
            ]
            if date_raw:
                triples.append(f'[{tag_ident} :date "{_edn_escape(date_raw)}"]')
            db.execute(f'(transact [{" ".join(triples)}] {{:valid-from "{run_ts_iso}"}})')
        except Exception:
            pass  # non-fatal per tag
2194
+
2195
+
2196
async def _run_ingestion(repo_path: str, branch: str) -> None:
    """Background coroutine: walk git history and ingest code structure.

    For every commit returned by _git_commits (those after the stored
    watermark on ``branch``) this writes a commit entity, the code entities of
    each added/modified file, :depends-on edges derived from imports, closing
    records for entities and edges that disappeared, and :parent edges. The
    watermark is advanced and the DB checkpointed after each commit. Tags and
    the last-run record are written once at the end.

    Progress is published through the module-level _ingest_progress dict
    (status, total, processed, current_commit, error). Any exception ends the
    run with status "error"; nothing is raised to the caller.

    The module-level DB handle is reset to None between commits so the file
    lock is not held while control is yielded to the event loop.
    """
    global _db, _ingest_progress
    try:
        # Read watermark and pre-load known entities before releasing DB
        db = get_db()
        watermark = _watermark_query(db)
        prior_ingested = _total_ingested_query(db)
        entity_valid_from, entity_descriptions, file_entities = _preload_known_entities(db, repo_path)
        file_deps: Dict[str, set] = {}  # file_path -> set of dep module idents
        dep_valid_from: Dict[tuple, str] = {}  # (src_ident, dep_ident) -> intro commit ts
        _db = None  # release file lock while enumerating commits

        commits = _git_commits(repo_path, watermark, branch)
        # Count the history of the branch actually being ingested, so the
        # progress total matches the commits that will be processed.
        repo_total_result = _subprocess.run(
            ["git", "rev-list", "--count", branch],
            cwd=repo_path, capture_output=True, text=True,
        )
        repo_total = int(repo_total_result.stdout.strip()) if repo_total_result.returncode == 0 else len(commits)
        _ingest_progress["total"] = repo_total
        _ingest_progress["status"] = "running"
        _ingest_progress["processed"] = prior_ingested

        last_hash = watermark or ""

        for commit_hash, commit_ts_iso, author, subject in commits:
            last_hash = commit_hash
            _ingest_progress["current_commit"] = commit_hash
            reason = f"git:{commit_hash} {author}: {subject}"

            # Build commit entity ident from first 12 chars of hash
            commit_ident = f":commit/{commit_hash[:12]}"

            # Acquire DB fresh each commit — never hold across yield
            db = get_db()
            try:
                changed = _git_changed_files(repo_path, commit_hash)
                add_triples: List[str] = [
                    f"[{commit_ident} :entity-type :type/commit]",
                    f'[{commit_ident} :ident "{commit_ident}"]',
                    f'[{commit_ident} :description "{_edn_escape(subject[:120])}"]',
                    f'[{commit_ident} :hash "{commit_hash}"]',
                    f'[{commit_ident} :author "{_edn_escape(author)}"]',
                    f'[{commit_ident} :subject "{_edn_escape(subject[:200])}"]',
                    f'[{commit_ident} :date "{commit_ts_iso}"]',
                ]
                close_items: List[tuple] = []  # (triples, original_ts_iso)
                dep_add_triples: List[str] = []  # :depends-on triples to transact individually

                for status, file_path in changed:
                    parser = _get_parser(file_path)
                    if parser is None:
                        continue

                    if status == "D":
                        # Close module and all known child entities for this file
                        idents = file_entities.get(file_path, [_code_ident("module", file_path)])
                        module_ident = _code_ident("module", file_path)
                        for ident in idents:
                            orig_ts = entity_valid_from.get(ident, commit_ts_iso)
                            desc = entity_descriptions.get(ident, "")
                            close_items.append(
                                (_build_close_triples(ident, desc, module_ident), orig_ts)
                            )
                        # Close all :depends-on edges for the deleted module
                        for dep_ident in file_deps.get(file_path, set()):
                            orig_ts = dep_valid_from.get((module_ident, dep_ident), commit_ts_iso)
                            close_items.append(
                                ([f"[{module_ident} :depends-on {dep_ident}]"], orig_ts)
                            )
                        file_deps.pop(file_path, None)
                        # Forget the closed entities so that a later re-add of
                        # this file introduces them again instead of recording
                        # a mere :modified-in on an entity that was closed.
                        for ident in idents:
                            entity_valid_from.pop(ident, None)
                            entity_descriptions.pop(ident, None)
                        if file_path in file_entities:
                            # Keep the key (import resolution looks files up in
                            # this map) but drop the closed idents.
                            file_entities[file_path] = []
                    else:  # A or M
                        previous_idents = set(file_entities.get(file_path, []))
                        try:
                            content = _git_file_content(repo_path, commit_hash, file_path)
                        except Exception:
                            continue
                        extracted = _extract_from_source(content, parser, file_path)
                        triples = _build_code_triples(
                            file_path, extracted, commit_ts_iso, entity_valid_from,
                            entity_descriptions, file_entities, commit_ident,
                        )
                        add_triples.extend(triples)
                        # Detect entities removed from a modified file.
                        # _build_code_triples only appends to file_entities, never removes.
                        # Compare previous idents against the idents derivable from the
                        # current extraction to find what was deleted.
                        if status == "M":
                            module_ident = _code_ident("module", file_path)
                            current_extracted_idents: set = {module_ident}
                            for fn_name in extracted.get("functions", []):
                                current_extracted_idents.add(_code_ident("function", file_path, fn_name))
                            for cls_name in extracted.get("classes", []):
                                current_extracted_idents.add(_code_ident("class", file_path, cls_name))
                            removed_idents = previous_idents - current_extracted_idents
                            tracked_idents = file_entities.get(file_path, [])
                            for ident in removed_idents:
                                orig_ts = entity_valid_from.get(ident, commit_ts_iso)
                                desc = entity_descriptions.get(ident, "")
                                close_items.append(
                                    (_build_close_triples(ident, desc, module_ident), orig_ts)
                                )
                                # Purge the closed entity from the bookkeeping
                                # maps. Otherwise it would be reported as
                                # removed, and closed again, on every later
                                # modification of this file, and its
                                # reappearance would not be re-introduced.
                                entity_valid_from.pop(ident, None)
                                entity_descriptions.pop(ident, None)
                                if ident in tracked_idents:
                                    tracked_idents.remove(ident)
                        # Compute dep edges for this file and diff against previous
                        module_ident = _code_ident("module", file_path)
                        current_deps: set = set()
                        for import_name in set(extracted.get("imports", [])):
                            dep_ident = _resolve_module_import(import_name, file_entities)
                            if dep_ident != module_ident:
                                current_deps.add(dep_ident)
                        previous_deps = file_deps.get(file_path, set())
                        for dep_ident in current_deps - previous_deps:
                            dep_add_triples.append(f"[{module_ident} :depends-on {dep_ident}]")
                            dep_valid_from[(module_ident, dep_ident)] = commit_ts_iso
                        if status == "M":
                            for dep_ident in previous_deps - current_deps:
                                orig_ts = dep_valid_from.get((module_ident, dep_ident), commit_ts_iso)
                                close_items.append(
                                    ([f"[{module_ident} :depends-on {dep_ident}]"], orig_ts)
                                )
                        file_deps[file_path] = current_deps

                # Split :contains triples out before batching. Minigraf's EAVT
                # pending index lacks value bytes in the key, so batching multiple
                # [module :contains fn] facts in one transact silently drops all
                # but the last. Each :contains triple gets its own transact so
                # they receive distinct tx_counts and avoid the index collision.
                contains_triples = [t for t in add_triples if ":contains" in t]
                other_triples = [t for t in add_triples if ":contains" not in t]
                _ingest_transact(db, other_triples, commit_ts_iso, reason)
                for ct in contains_triples:
                    _ingest_transact(db, [ct], commit_ts_iso, reason)
                # :depends-on triples transacted individually — same EAVT collision risk
                # as :contains when multiple deps share the same source module
                for dt in dep_add_triples:
                    _ingest_transact(db, [dt], commit_ts_iso, reason)
                for close_triples, orig_ts in close_items:
                    _ingest_close(db, close_triples, orig_ts, commit_ts_iso, reason)

                # Ingest :parent edges — one transact per parent to avoid EAVT
                # collision for merge commits (which have two parent hashes).
                try:
                    for parent_hash in _git_parent_hashes(repo_path, commit_hash):
                        parent_ident = f":commit/{parent_hash[:12]}"
                        db.execute(
                            f'(transact [[{commit_ident} :parent {parent_ident}]] '
                            f'{{:valid-from "{commit_ts_iso}"}})'
                        )
                except Exception:
                    pass  # non-fatal; parent edges are best-effort

                _watermark_update(db, commit_hash, commit_ts_iso, reason)
                db.checkpoint()

            finally:
                _db = None  # release file lock between commits

            _ingest_progress["processed"] += 1
            await asyncio.sleep(0)  # yield to event loop

        now = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
        db = get_db()
        try:
            _ingest_tags(db, repo_path, now)
            _last_run_write(db, last_hash, now, _ingest_progress["processed"])
            db.checkpoint()
        finally:
            _db = None

        _ingest_progress["status"] = "complete"
        _index_cache.invalidate()

    except Exception as e:
        _ingest_progress["status"] = "error"
        _ingest_progress["error"] = str(e)
        _db = None
2370
+
2371
+
2372
async def handle_minigraf_ingest_git(
    repo_path: Optional[str] = None,
    branch: str = "HEAD",
) -> Dict[str, Any]:
    """Start git ingestion as a background task and return immediately.

    ``repo_path`` defaults to the current working directory. The request is
    refused when a previous ingestion task is still running, or when
    `git rev-parse --git-dir` does not succeed in the target directory.
    On success the progress record is reset and the task is scheduled.
    """
    global _ingest_task, _ingest_progress
    if _ingest_task and not _ingest_task.done():
        return {"ok": False, "error": "ingestion already in progress"}

    repo = repo_path or str(Path.cwd())
    try:
        probe = _subprocess.run(
            ["git", "rev-parse", "--git-dir"],
            cwd=repo, capture_output=True, text=True,
        )
    except OSError:
        is_repo = False
    else:
        is_repo = probe.returncode == 0

    if is_repo:
        _ingest_progress = {
            "status": "idle", "processed": 0, "total": 0,
            "current_commit": "", "error": None,
        }
        _ingest_task = asyncio.create_task(_run_ingestion(repo, branch))
        return {"ok": True, "job_id": "git-ingest", "message": f"Ingestion started for {repo}"}

    return {
        "ok": False,
        "error": f"Not a git repository (or git not found): {repo}",
    }
2400
+
2401
+
2402
def handle_minigraf_ingest_status() -> Dict[str, Any]:
    """Report ingestion progress, plus last-run info read from the graph.

    The in-memory progress record is always returned. While no ingestion is
    running, last_run_at, last_commit and total_ingested are added from the
    database; each is None when it is unavailable, and all three are None if
    reading the database fails.
    """
    report: Dict[str, Any] = {"ok": True, **_ingest_progress}
    if _ingest_progress["status"] == "running":
        return report

    try:
        db = get_db()
        raw = db.execute(
            "(query [:find ?t ?h :any-valid-time "
            ":where [:ingestion/last-run-at :last-run-at ?t] "
            "[:ingestion/last-run-at :last-commit ?h]])"
        )
        rows = json.loads(raw).get("results", [])
        first = rows[0] if rows else None
        report["last_run_at"] = first[0] if first is not None else None
        report["last_commit"] = first[1] if first is not None else None
        count = _total_ingested_query(db)
        report["total_ingested"] = count if count > 0 else None
    except Exception:
        for key in ("last_run_at", "last_commit", "total_ingested"):
            report[key] = None
    return report
2427
+
2428
+
2429
+ # ---------------------------------------------------------------------------
2430
+ # MCP server
2431
+ # ---------------------------------------------------------------------------
2432
+
2433
+ from mcp.types import Tool, TextContent # noqa: E402
2434
+
2435
server = Server("temporal-reasoning")


def _tool_schema_property(kind: str, description: str, **extra: Any) -> Dict[str, Any]:
    """Build one JSON-schema property entry (type, description, then extras)."""
    return {"type": kind, "description": description, **extra}


def _tool_schema_object(properties: Dict[str, Any], required: List[str]) -> Dict[str, Any]:
    """Build the object-typed input schema attached to a tool."""
    return {"type": "object", "properties": properties, "required": required}


# Catalogue of every tool this server advertises to MCP clients.
_TOOLS: List[Tool] = [
    Tool(
        name="minigraf_query",
        description=(
            "Query Minigraf's persistent bi-temporal graph memory using Datalog. "
            "Call this BEFORE answering anything about past decisions, architecture, "
            "dependencies, or preferences. Supports :as-of for temporal queries to see "
            "what the graph contained at a past transaction time."
        ),
        inputSchema=_tool_schema_object(
            {
                "datalog": _tool_schema_property(
                    "string",
                    "A valid Datalog query, e.g. [:find ?name :where [?e :component/name ?name]]",
                ),
            },
            ["datalog"],
        ),
    ),
    Tool(
        name="minigraf_transact",
        description=(
            "Store a durable fact in Minigraf's graph memory. Only call this for decisions, "
            "architecture, dependencies, constraints, or preferences — NOT for transient "
            "observations or intermediate reasoning."
        ),
        inputSchema=_tool_schema_object(
            {
                "facts": _tool_schema_property(
                    "string",
                    'A Datalog transact block, e.g. [[:decision/cache-strategy '
                    ':decision/description "use Redis"]]',
                ),
                "reason": _tool_schema_property(
                    "string",
                    "Why this fact deserves long-term storage. "
                    "Forces you to justify writes — only store facts worth remembering.",
                ),
            },
            ["facts", "reason"],
        ),
    ),
    Tool(
        name="minigraf_retract",
        description=(
            "Retract a fact from Minigraf's graph memory. Retraction records a new fact with "
            "asserted=false — the original stays in history for bi-temporal auditing."
        ),
        inputSchema=_tool_schema_object(
            {
                "facts": _tool_schema_property(
                    "string",
                    "A Datalog retract block, e.g. [[:component/auth :calls :component/jwt]]",
                ),
                "reason": _tool_schema_property(
                    "string",
                    "Why this fact is being retracted. Forces you to justify the removal.",
                ),
            },
            ["facts", "reason"],
        ),
    ),
    Tool(
        name="minigraf_rule",
        description=(
            "Register a Datalog rule for use in subsequent queries. "
            "Rules enable recursive graph traversal (e.g. ancestor, reachable). "
            "A rule persists for the server session — re-register after a server restart. "
            "Syntax: [(rule-name ?arg ...) body-clause ...] — omit the outer (rule ...) wrapper."
        ),
        inputSchema=_tool_schema_object(
            {
                "rule": _tool_schema_property(
                    "string",
                    "Rule vector, e.g. [(ancestor ?a ?d) [?a :parent ?d]] "
                    "or [(ancestor ?a ?d) [?a :parent ?m] (ancestor ?m ?d)]",
                ),
            },
            ["rule"],
        ),
    ),
    Tool(
        name="minigraf_report_issue",
        description=(
            "Report an issue with Minigraf query or transact operations. "
            "Use this when Minigraf returns errors to file a GitHub issue for tracking."
        ),
        inputSchema=_tool_schema_object(
            {
                "issue_type": _tool_schema_property(
                    "string",
                    "Type of issue to report",
                    enum=["invalid_query", "transact_failure", "parse_error", "minigraf_bug"],
                ),
                "description": _tool_schema_property(
                    "string",
                    "Human-readable description of the issue",
                ),
                "datalog": _tool_schema_property(
                    "string",
                    "Optional Datalog query or transact that failed",
                ),
                "error": _tool_schema_property(
                    "string",
                    "Optional error message returned by Minigraf",
                ),
            },
            ["issue_type", "description"],
        ),
    ),
    Tool(
        name="memory_prepare_turn",
        description=(
            "Retrieve relevant memory context for the current user message. "
            "Call this at the START of every turn, before reading the user's message. "
            "Returns a context block string to prepend to your working context."
        ),
        inputSchema=_tool_schema_object(
            {
                "user_message": _tool_schema_property(
                    "string",
                    "The user's message for this turn",
                ),
            },
            ["user_message"],
        ),
    ),
    Tool(
        name="memory_finalize_turn",
        description=(
            "Extract and store memorable facts from the completed conversation turn. "
            "Call this at the END of every turn, after composing your response. "
            "Pass the full user+agent exchange for this turn."
        ),
        inputSchema=_tool_schema_object(
            {
                "conversation_delta": _tool_schema_property(
                    "string",
                    "The user message and agent response for this turn",
                ),
            },
            ["conversation_delta"],
        ),
    ),
    Tool(
        name="minigraf_audit",
        description=(
            "Audit all graph entities against the built-in schema. "
            "Retracts entities with schema violations (missing required attributes, "
            "unknown types, unknown attributes). Run periodically or after heavy write sessions. "
            "Pass as_of (transaction number) for a read-only point-in-time audit without retractions."
        ),
        inputSchema=_tool_schema_object(
            {
                "as_of": _tool_schema_property(
                    "integer",
                    "Optional transaction number for point-in-time audit (read-only, no retractions)",
                ),
            },
            [],
        ),
    ),
    Tool(
        name="minigraf_ingest_git",
        description=(
            "Ingest code structure from git history into the bi-temporal graph. "
            "Starts a background task and returns immediately. "
            "Call minigraf_ingest_status to poll progress."
        ),
        inputSchema=_tool_schema_object(
            {
                "repo_path": _tool_schema_property(
                    "string",
                    "Absolute path to the git repo root. Defaults to cwd.",
                ),
                "branch": _tool_schema_property(
                    "string",
                    "Branch or ref to walk. Defaults to HEAD.",
                ),
            },
            [],
        ),
    ),
    Tool(
        name="minigraf_ingest_status",
        description=(
            "Return the current git ingestion progress. "
            "status is one of: idle, running, complete, error."
        ),
        inputSchema=_tool_schema_object({}, []),
    ),
]
2644
+
2645
+
2646
@server.list_tools()
async def list_tools() -> List[Tool]:
    """Return the module-level tool catalogue registered with the server."""
    catalogue = _TOOLS
    return catalogue
2649
+
2650
+
2651
@server.call_tool()
async def call_tool(name: str, arguments: Dict[str, Any]) -> List[TextContent]:
    """Dispatch one MCP tool invocation to its handler.

    Every tool except ``memory_prepare_turn`` replies with its handler result
    serialised as JSON; ``memory_prepare_turn`` replies with the handler's
    text block unchanged.

    Raises:
        ValueError: If ``name`` does not match a known tool.
        KeyError: If a required argument is missing from ``arguments``.
    """
    global _db
    try:
        if name == "memory_prepare_turn":
            # The only tool whose reply is raw text rather than JSON.
            context_block = handle_memory_prepare_turn(arguments["user_message"])
            return [TextContent(type="text", text=context_block)]

        if name == "minigraf_query":
            payload = handle_minigraf_query(arguments["datalog"])
        elif name == "minigraf_transact":
            payload = handle_minigraf_transact(arguments["facts"], arguments["reason"])
        elif name == "minigraf_retract":
            payload = handle_minigraf_retract(arguments["facts"], arguments["reason"])
        elif name == "minigraf_rule":
            payload = handle_minigraf_rule(arguments["rule"])
        elif name == "minigraf_report_issue":
            payload = handle_minigraf_report_issue(
                arguments["issue_type"],
                arguments["description"],
                datalog=arguments.get("datalog"),
                error=arguments.get("error"),
            )
        elif name == "memory_finalize_turn":
            payload = await handle_memory_finalize_turn(arguments["conversation_delta"])
        elif name == "minigraf_audit":
            payload = handle_minigraf_audit(as_of=arguments.get("as_of"))
        elif name == "minigraf_ingest_git":
            payload = await handle_minigraf_ingest_git(
                repo_path=arguments.get("repo_path"),
                branch=arguments.get("branch", "HEAD"),
            )
        elif name == "minigraf_ingest_status":
            payload = handle_minigraf_ingest_status()
        else:
            raise ValueError(f"Unknown tool: {name}")

        return [TextContent(type="text", text=json.dumps(payload))]
    finally:
        # Drop the DB handle after every call (success or failure) so the file
        # lock is released and the prepare_hook subprocess can open the DB
        # between turns; get_db() re-opens it on demand.
        _db = None
2710
+
2711
+
2712
async def main() -> None:
    """Run the MCP server over stdio, optionally kicking off auto-ingestion.

    Resets the ingestion progress record, schedules an ingestion of the
    current working directory at ``HEAD`` as a background asyncio task so it
    never blocks the message loop, then serves requests over stdin/stdout.
    Set ``MINIGRAF_NO_AUTO_INGEST=1`` to skip the auto-start (used by eval
    sandboxes).
    """
    global _server_ref, _ingest_task, _ingest_progress
    _server_ref = server

    _ingest_progress = {
        "status": "idle", "processed": 0, "total": 0,
        "current_commit": "", "error": None,
    }

    auto_ingest_disabled = os.environ.get("MINIGRAF_NO_AUTO_INGEST")
    if not auto_ingest_disabled:
        working_dir = str(Path.cwd())
        _ingest_task = asyncio.create_task(_run_ingestion(working_dir, "HEAD"))

    async with stdio_server() as streams:
        read_stream, write_stream = streams
        init_options = server.create_initialization_options()
        await server.run(read_stream, write_stream, init_options)
2731
+
2732
+
2733
if __name__ == "__main__":
    # Executed as a script: drive the async entry point to completion.
    asyncio.run(main())