temporal-reasoning 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_server.py +2734 -0
- report_issue.py +233 -0
- temporal_reasoning-0.3.2.dist-info/METADATA +287 -0
- temporal_reasoning-0.3.2.dist-info/RECORD +8 -0
- temporal_reasoning-0.3.2.dist-info/WHEEL +5 -0
- temporal_reasoning-0.3.2.dist-info/entry_points.txt +2 -0
- temporal_reasoning-0.3.2.dist-info/licenses/LICENSE +21 -0
- temporal_reasoning-0.3.2.dist-info/top_level.txt +2 -0
mcp_server.py
ADDED
|
@@ -0,0 +1,2734 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Temporal Reasoning MCP Server.
|
|
4
|
+
|
|
5
|
+
Persistent stdio MCP server providing bi-temporal graph memory for AI coding agents.
|
|
6
|
+
Sole interface to the minigraf .graph file via the MiniGrafDb Python binding.
|
|
7
|
+
"""
|
|
8
|
+
import asyncio
|
|
9
|
+
import datetime
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
import re
|
|
13
|
+
import subprocess as _subprocess
|
|
14
|
+
import sys
|
|
15
|
+
import threading
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Dict, List, Optional
|
|
18
|
+
|
|
19
|
+
from mcp.server import Server
|
|
20
|
+
from mcp.server.stdio import stdio_server
|
|
21
|
+
from minigraf import MiniGrafDb, MiniGrafError
|
|
22
|
+
|
|
23
|
+
try:
|
|
24
|
+
from rank_bm25 import BM25Okapi as _BM25Okapi
|
|
25
|
+
_BM25_AVAILABLE = True
|
|
26
|
+
except ImportError:
|
|
27
|
+
_BM25Okapi = None # type: ignore[assignment,misc]
|
|
28
|
+
_BM25_AVAILABLE = False
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# Session-scoped rules — registered by _open_db_at() on every DB open (startup and each reopen), cached in RuleRegistry
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
SESSION_RULES = [
|
|
34
|
+
"(rule [(linked ?a ?b) [?a :depends-on ?b]])",
|
|
35
|
+
"(rule [(linked ?a ?b) [?a :calls ?b]])",
|
|
36
|
+
"(rule [(reachable ?a ?b) [?a :depends-on ?b]])",
|
|
37
|
+
"(rule [(reachable ?a ?b) [?a :calls ?b]])",
|
|
38
|
+
"(rule [(linked ?a ?b) [?a :contains ?b]])",
|
|
39
|
+
"(rule [(reachable ?a ?b) [?a :contains ?b]])",
|
|
40
|
+
# Commit-graph traversal: (ancestor ?child ?anc) holds when ?anc is a
|
|
41
|
+
# (possibly transitive) git ancestor of ?child via :parent edges.
|
|
42
|
+
# Only evaluated when a query explicitly calls (ancestor ...).
|
|
43
|
+
"(rule [(ancestor ?child ?anc) [?child :parent ?anc]])",
|
|
44
|
+
"(rule [(ancestor ?child ?anc) [?child :parent ?mid] (ancestor ?mid ?anc)])",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
# User-registered rules — persisted across DB reopens (unlike SESSION_RULES,
|
|
48
|
+
# these are accumulated at runtime via minigraf_rule and re-applied on every open).
|
|
49
|
+
_user_rules: List[str] = []
|
|
50
|
+
|
|
51
|
+
# Module-level DB instance — opened lazily by get_db() and replaced by _open_db_at() whenever _refresh_if_stale() detects an external change
|
|
52
|
+
_db: Optional[MiniGrafDb] = None
|
|
53
|
+
|
|
54
|
+
# Track graph path and last-known mtime so we can detect external modifications.
|
|
55
|
+
# minigraf's Drop impl writes to the file even for read-only handles, which
|
|
56
|
+
# invalidates any other open handle's in-memory page table. Reopening on
|
|
57
|
+
# mtime change is the workaround until the upstream bug is fixed.
|
|
58
|
+
_graph_path: str = ""
|
|
59
|
+
_db_mtime: float = 0.0
|
|
60
|
+
|
|
61
|
+
# Module-level server reference — set after server creation for MCP sampling
|
|
62
|
+
_server_ref: Optional[Server] = None
|
|
63
|
+
|
|
64
|
+
# Ingestion state
|
|
65
|
+
_ingest_task: Optional[asyncio.Task] = None
|
|
66
|
+
_ingest_progress: Dict[str, Any] = {
|
|
67
|
+
"status": "idle", "processed": 0, "total": 0,
|
|
68
|
+
"current_commit": "", "error": None,
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
# Language detection and grammar caching
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
|
|
75
|
+
_EXT_TO_LANG: Dict[str, str] = {
|
|
76
|
+
".py": "python", ".js": "javascript", ".ts": "typescript",
|
|
77
|
+
".tsx": "tsx", ".jsx": "javascript", ".rs": "rust",
|
|
78
|
+
".go": "go", ".java": "java", ".c": "c", ".cpp": "cpp",
|
|
79
|
+
".cs": "c_sharp", ".rb": "ruby", ".php": "php",
|
|
80
|
+
".kt": "kotlin", ".swift": "swift", ".scala": "scala",
|
|
81
|
+
".hs": "haskell", ".lua": "lua", ".ex": "elixir", ".exs": "elixir",
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
_grammar_cache: Dict[str, Any] = {} # lang_name → Parser or None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _get_parser(file_path: str) -> Optional[Any]:
    """Look up (and memoise) a tree-sitter parser for ``file_path``'s language.

    The language is chosen from the lower-cased file extension via
    ``_EXT_TO_LANG``.  Outcomes -- including failures, stored as ``None`` --
    are kept in ``_grammar_cache`` keyed by language name, so each language
    is resolved at most once per process.

    Two backends are attempted, in this order:
      1. ``tree_sitter_languages`` (bundled grammars, old-style
         ``Parser.set_language`` API, requires Python <=3.12).
      2. A stand-alone ``tree_sitter_<lang>`` package (new-style
         ``Parser(Language(...))`` API, Python 3.13+ / tree-sitter >=0.22).

    Returns:
        A ready-to-use parser, or ``None`` when the extension is unknown or
        neither backend could supply a grammar.
    """
    language = _EXT_TO_LANG.get(Path(file_path).suffix.lower())
    if not language:
        return None
    if language in _grammar_cache:
        return _grammar_cache[language]

    def _from_bundle() -> Optional[Any]:
        # Any failure (missing package, API mismatch, unknown grammar) simply
        # means "this backend is unavailable".
        try:
            import tree_sitter_languages  # type: ignore
            import tree_sitter  # type: ignore
            grammar = tree_sitter_languages.get_language(language)
            bundled = tree_sitter.Parser()
            bundled.set_language(grammar)
            return bundled
        except Exception:
            return None

    def _from_package() -> Optional[Any]:
        try:
            module = __import__(f"tree_sitter_{language}", fromlist=["language"])
            from tree_sitter import Language, Parser  # type: ignore
            # PHP exposes language_php() instead of language()
            factory = getattr(module, f"language_{language}", None) or module.language
            return Parser(Language(factory()))
        except Exception:
            return None

    resolved = _from_bundle()
    if resolved is None:
        resolved = _from_package()

    # Cache negative results too so a missing grammar is not retried per file.
    _grammar_cache[language] = resolved
    return resolved
|
|
129
|
+
|
|
130
|
+
# ---------------------------------------------------------------------------
|
|
131
|
+
# AST extraction
|
|
132
|
+
# ---------------------------------------------------------------------------
|
|
133
|
+
|
|
134
|
+
_LANG_NODE_TYPES: Dict[str, Dict[str, set]] = {
|
|
135
|
+
"python": {
|
|
136
|
+
"functions": {"function_definition", "async_function_definition"},
|
|
137
|
+
"classes": {"class_definition"},
|
|
138
|
+
"imports": {"import_statement", "import_from_statement"},
|
|
139
|
+
"calls": {"call"},
|
|
140
|
+
},
|
|
141
|
+
"javascript": {
|
|
142
|
+
"functions": {"function_declaration", "function_expression", "method_definition"},
|
|
143
|
+
"classes": {"class_declaration"},
|
|
144
|
+
"imports": {"import_statement"},
|
|
145
|
+
"calls": {"call_expression"},
|
|
146
|
+
},
|
|
147
|
+
"typescript": {
|
|
148
|
+
"functions": {"function_declaration", "function_expression", "method_definition"},
|
|
149
|
+
"classes": {"class_declaration"},
|
|
150
|
+
"imports": {"import_statement"},
|
|
151
|
+
"calls": {"call_expression"},
|
|
152
|
+
},
|
|
153
|
+
"rust": {
|
|
154
|
+
"functions": {"function_item"},
|
|
155
|
+
"classes": {"struct_item", "impl_item"},
|
|
156
|
+
"imports": {"use_declaration"},
|
|
157
|
+
"calls": {"call_expression"},
|
|
158
|
+
},
|
|
159
|
+
"go": {
|
|
160
|
+
"functions": {"function_declaration", "method_declaration"},
|
|
161
|
+
"classes": {"type_declaration"},
|
|
162
|
+
"imports": {"import_declaration"},
|
|
163
|
+
"calls": {"call_expression"},
|
|
164
|
+
},
|
|
165
|
+
"java": {
|
|
166
|
+
"functions": {"method_declaration"},
|
|
167
|
+
"classes": {"class_declaration"},
|
|
168
|
+
"imports": {"import_declaration"},
|
|
169
|
+
"calls": {"method_invocation"},
|
|
170
|
+
},
|
|
171
|
+
"c": {
|
|
172
|
+
"functions": {"function_definition"},
|
|
173
|
+
"classes": {"struct_specifier"},
|
|
174
|
+
"imports": {"preproc_include"},
|
|
175
|
+
"calls": {"call_expression"},
|
|
176
|
+
},
|
|
177
|
+
"cpp": {
|
|
178
|
+
"functions": {"function_definition"},
|
|
179
|
+
"classes": {"class_specifier", "struct_specifier"},
|
|
180
|
+
"imports": {"preproc_include"},
|
|
181
|
+
"calls": {"call_expression"},
|
|
182
|
+
},
|
|
183
|
+
"c_sharp": {
|
|
184
|
+
"functions": {"method_declaration"},
|
|
185
|
+
"classes": {"class_declaration"},
|
|
186
|
+
"imports": {"using_directive"},
|
|
187
|
+
"calls": {"invocation_expression"},
|
|
188
|
+
},
|
|
189
|
+
"ruby": {
|
|
190
|
+
"functions": {"method"},
|
|
191
|
+
"classes": {"class"},
|
|
192
|
+
"imports": {"call"},
|
|
193
|
+
"calls": set(),
|
|
194
|
+
},
|
|
195
|
+
"php": {
|
|
196
|
+
"functions": {"function_definition", "method_declaration"},
|
|
197
|
+
"classes": {"class_declaration"},
|
|
198
|
+
"imports": {"require_expression", "include_expression",
|
|
199
|
+
"require_once_expression", "include_once_expression"},
|
|
200
|
+
"calls": {"function_call_expression"},
|
|
201
|
+
},
|
|
202
|
+
"kotlin": {
|
|
203
|
+
"functions": {"function_declaration"},
|
|
204
|
+
"classes": {"class_declaration"},
|
|
205
|
+
"imports": {"import"},
|
|
206
|
+
"calls": {"call_expression"},
|
|
207
|
+
},
|
|
208
|
+
"swift": {
|
|
209
|
+
"functions": {"function_declaration"},
|
|
210
|
+
"classes": {"class_declaration"},
|
|
211
|
+
"imports": {"import_declaration"},
|
|
212
|
+
"calls": {"call_expression"},
|
|
213
|
+
},
|
|
214
|
+
"scala": {
|
|
215
|
+
"functions": {"function_definition"},
|
|
216
|
+
"classes": {"class_definition"},
|
|
217
|
+
"imports": {"import_declaration"},
|
|
218
|
+
"calls": {"call_expression"},
|
|
219
|
+
},
|
|
220
|
+
"haskell": {
|
|
221
|
+
"functions": {"function"},
|
|
222
|
+
"classes": {"data_type"},
|
|
223
|
+
"imports": {"import"},
|
|
224
|
+
"calls": {"apply"},
|
|
225
|
+
},
|
|
226
|
+
"lua": {
|
|
227
|
+
"functions": {"function_definition"},
|
|
228
|
+
"classes": set(),
|
|
229
|
+
"imports": {"function_call"},
|
|
230
|
+
"calls": set(),
|
|
231
|
+
},
|
|
232
|
+
"elixir": {
|
|
233
|
+
"functions": {"def", "defp"},
|
|
234
|
+
"classes": {"defmodule"},
|
|
235
|
+
"imports": {"call"},
|
|
236
|
+
"calls": set(),
|
|
237
|
+
},
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _rust_use_root(node) -> Optional[str]:
    """Return the root crate/module name imported by a Rust ``use_declaration``.

    The named children of the declaration are inspected in order and the
    first one that yields a name wins.  Recognised shapes:

        identifier         ``use foo;``                      -> "foo"
        scoped_identifier  ``use std::collections::HashMap`` -> "std"
        scoped_use_list    ``use crate::storage::{a, b}``    -> "storage"
        use_as_clause      ``use foo as bar;``               -> "foo"
        use_wildcard       ``use tokio::*;``                 -> "tokio"

    For paths rooted at ``crate`` / ``super`` / ``self`` the segment right
    after the keyword is returned, so the edge points at the local module
    rather than at the generic keyword.  When that segment is not a plain
    identifier (e.g. ``super::super::x``) nothing is returned.

    Args:
        node: a tree-sitter ``use_declaration`` node (anything exposing
            ``type``, ``text``, ``named_children`` and
            ``child_by_field_name``).

    Returns:
        The root name, or ``None`` when no supported path shape is found.
    """
    path_keywords = ("crate", "super", "self")
    # NOTE(review): "use_wildcard" is the glob-import node name in the
    # tree-sitter-rust grammar; its first named child is the path before "::*".
    container_types = ("scoped_identifier", "scoped_use_list", "use_wildcard")

    def root_from_path(n) -> Optional[str]:
        """Extract the root module name from a path-like node."""
        if n.type == "identifier":
            return n.text.decode("utf-8")
        if n.type in container_types:
            children = n.named_children
            if not children:
                return None
            first = children[0]
            if first.type in path_keywords:
                # Intra-project path: report the segment after the keyword.
                if len(children) > 1 and children[1].type == "identifier":
                    return children[1].text.decode("utf-8")
                return None
            # The leftmost child is itself a (shorter) path; keep descending.
            return root_from_path(first)
        if n.type == "use_as_clause":
            path_node = n.child_by_field_name("path")
            return root_from_path(path_node) if path_node else None
        # Bare keywords, visibility modifiers, use lists, ...: no root here.
        return None

    for child in node.named_children:
        result = root_from_path(child)
        if result:
            return result
    return None
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _c_include_name(node) -> Optional[str]:
    """Extract the bare header name from a C/C++ ``preproc_include`` node.

    The first child of type ``system_lib_string`` or ``string_literal`` is
    used; its delimiters, directory part and extension are removed:

        #include <stdio.h>      -> "stdio"
        #include "myheader.h"   -> "myheader"

    Returns ``None`` when the node has no such child.
    """
    header = next(
        (part for part in node.children
         if part.type in ("system_lib_string", "string_literal")),
        None,
    )
    if header is None:
        return None
    # Drop the <...> / "..." delimiters before splitting the path.
    spelled = header.text.decode("utf-8").strip("<>\"'")
    stem, _extension = os.path.splitext(os.path.basename(spelled))
    return stem
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def _csharp_using_name(node) -> Optional[str]:
    """Return the root namespace of a C# ``using_directive`` node.

        using System;                      -> "System"
        using System.Collections.Generic;  -> "System"

    The result is the text of the first ``identifier`` met in a depth-first,
    left-to-right walk over named children.  Returns ``None`` when the
    subtree holds no identifier.
    """
    if node.type == "identifier":
        return node.text.decode("utf-8")

    # Explicit stack, children pushed right-to-left so they pop left-to-right.
    todo = list(reversed(node.named_children))
    while todo:
        current = todo.pop()
        if current.type == "identifier":
            spelled = current.text.decode("utf-8")
            if spelled:
                return spelled
            # An empty identifier is ignored and is not descended into.
            continue
        todo.extend(reversed(current.named_children))
    return None
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _ruby_require_name(node) -> Optional[str]:
    """Return the module named by a Ruby ``require`` / ``require_relative`` call.

        require 'rails'            -> "rails"
        require_relative 'my_mod'  -> "my_mod"

    Only the first ``string`` argument is considered.  Its content is taken
    from a ``string_content`` child when there is one, otherwise from the
    string's own text with quotes stripped; directory and extension are then
    removed.  Any other call, or a call without a string argument, gives
    ``None``.
    """
    callee = node.child_by_field_name("method")
    if callee is None:
        return None
    if callee.text.decode("utf-8") not in ("require", "require_relative"):
        return None

    arg_list = node.child_by_field_name("arguments")
    if arg_list is None:
        return None

    for arg in arg_list.named_children:
        if arg.type != "string":
            continue
        inner = None
        for piece in arg.named_children:
            if piece.type == "string_content":
                inner = piece
                break
        if inner:
            target = inner.text.decode("utf-8")
        else:
            # No content child available: fall back to trimming the quotes.
            target = arg.text.decode("utf-8").strip("'\"")
        return os.path.splitext(os.path.basename(target))[0]
    return None
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def _lua_require_name(node) -> Optional[str]:
    """Return the module passed to a Lua ``require(...)`` call.

        require("socket")  -> "socket"

    Expected AST shape:

        function_call
          identifier   b'require'
          arguments
            (          b'('
            string     b'"socket"'
            )          b')'

    The callee is the first direct ``identifier`` child; if it is missing or
    is not ``require`` the result is ``None``.  Otherwise the text of the
    first ``string`` inside an ``arguments`` child is returned with its
    surrounding quotes stripped.
    """
    callee = next((c for c in node.children if c.type == "identifier"), None)
    if callee is None or callee.text.decode("utf-8") != "require":
        return None

    for group in node.children:
        if group.type != "arguments":
            continue
        for item in group.children:
            if item.type == "string":
                return item.text.decode("utf-8").strip("'\"")
    return None
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def _elixir_module_name(node) -> Optional[str]:
    """Return the root module referenced by an Elixir alias/import/use/require.

        alias MyApp.Router      -> "MyApp"
        import Ecto.Query       -> "Ecto"
        use Phoenix.Controller  -> "Phoenix"
        require Logger          -> "Logger"

    The call's ``target`` field must be an ``identifier`` spelling one of
    those four keywords; calls whose target is something else (e.g. a dot
    node such as ``IO.puts``) give ``None``.  The module is read from the
    first ``alias`` node found inside an ``arguments`` child, whose text is
    the full dotted module name.
    """
    target = node.child_by_field_name("target")
    if target is None or target.type != "identifier":
        return None
    if target.text.decode("utf-8") not in {"alias", "import", "use", "require"}:
        return None

    # "arguments" is an unnamed field, so scan the direct children for it.
    module_node = next(
        (
            arg
            for group in node.children
            if group.type == "arguments"
            for arg in group.children
            if arg.type == "alias"
        ),
        None,
    )
    if module_node is None:
        return None
    return module_node.text.decode("utf-8").split(".")[0]
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def _extract_import_name(node, lang_name: str) -> List[str]:
    """Extract the module names referenced by one import-like AST node.

    What counts as "the name" is language specific:

      * python      -- first dotted segment of each imported module; leading
                       dots of a relative import are ignored, and a bare
                       ``from . import x`` contributes nothing.
      * javascript / typescript -- the import source string, unquoted.
      * go          -- last ``/`` segment of every import path.
      * java / kotlin -- leftmost identifier of the import.
      * php         -- file stem of the first string operand.
      * swift       -- first identifier child.
      * scala       -- first dotted segment of the first named child.
      * haskell     -- first dotted segment of the first module-like child.
      * rust, c, cpp, c_sharp, ruby, lua, elixir -- delegated to the
        dedicated ``_*_name`` / ``_rust_use_root`` helpers.

    Args:
        node: tree-sitter node whose type is listed under ``"imports"`` for
            ``lang_name``.
        lang_name: language key as used in ``_EXT_TO_LANG``.

    Returns:
        Possibly empty list of names, in source order.  Empty strings are
        never included; unknown languages yield ``[]``.
    """
    names: List[str] = []

    def keep(candidate: Optional[str]) -> None:
        # Helpers return None for "not an import"; "" would be a nameless
        # module.  Neither is a usable edge target.
        if candidate:
            names.append(candidate)

    def head(raw: bytes) -> str:
        # "a.b.c" -> "a"
        return raw.decode("utf-8").split(".")[0]

    def leftmost(n, kinds) -> Optional[str]:
        # Depth-first search for the first node whose type is in ``kinds``.
        if n.type in kinds:
            return n.text.decode("utf-8")
        for sub in n.named_children:
            hit = leftmost(sub, kinds)
            if hit:
                return hit
        return None

    if lang_name == "python":
        if node.type == "import_from_statement":
            module = node.child_by_field_name("module_name")
            if module:
                # A relative import spells its module as ".pkg.mod" (or just
                # "."); strip the dots so the first segment is "pkg", not "".
                keep(module.text.decode("utf-8").lstrip(".").split(".")[0])
        else:
            # import_statement: one entry per imported module.
            for child in node.named_children:
                if child.type == "aliased_import":
                    target = child.child_by_field_name("name")
                    if target:
                        keep(head(target.text))
                elif child.type == "dotted_name":
                    keep(head(child.text))
    elif lang_name in ("javascript", "typescript"):
        src = node.child_by_field_name("source")
        if src:
            keep(src.text.decode("utf-8").strip("'\""))
    elif lang_name == "rust":
        keep(_rust_use_root(node))
    elif lang_name == "go":
        # A declaration holds either a single spec or a parenthesised list.
        specs = []
        for child in node.named_children:
            if child.type == "import_spec":
                specs.append(child)
            elif child.type == "import_spec_list":
                specs.extend(
                    s for s in child.named_children if s.type == "import_spec"
                )
        for spec in specs:
            path = spec.child_by_field_name("path")
            if path:
                keep(path.text.decode("utf-8").strip('"').split("/")[-1])
    elif lang_name == "java":
        keep(leftmost(node, ("identifier",)))
    elif lang_name in ("c", "cpp"):
        keep(_c_include_name(node))
    elif lang_name == "c_sharp":
        keep(_csharp_using_name(node))
    elif lang_name == "ruby":
        keep(_ruby_require_name(node))
    elif lang_name == "php":
        for child in node.children:
            if child.type in ("string", "encapsed_string", "string_literal"):
                val = child.text.decode("utf-8").strip("'\"")
                keep(os.path.splitext(os.path.basename(val))[0])
                break
    elif lang_name == "kotlin":
        keep(leftmost(node, ("simple_identifier", "identifier")))
    elif lang_name == "swift":
        for child in node.named_children:
            if child.type in ("identifier", "simple_identifier"):
                keep(child.text.decode("utf-8"))
                break
    elif lang_name == "scala":
        for child in node.named_children:
            keep(head(child.text))
            break  # only the first named child carries the import path
    elif lang_name == "haskell":
        for child in node.named_children:
            if child.type in ("module", "qualified_module", "constructor"):
                keep(head(child.text))
                break
    elif lang_name == "lua":
        keep(_lua_require_name(node))
    elif lang_name == "elixir":
        keep(_elixir_module_name(node))
    return names
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
def _extract_call_name(node, lang_name: str) -> Optional[str]:
    """Extract the callee name from a call node (best-effort, plain names only).

    Only direct, unqualified calls are reported; attribute/method-style
    calls such as ``obj.method()`` yield ``None``.

    Most grammars expose the callee through the ``function`` field as an
    ``identifier``.  Two grammars listed in ``_LANG_NODE_TYPES`` differ and
    are handled explicitly:

      * java -- ``method_invocation`` has no ``function`` field; the callee
        is in ``name`` and a receiver, when present, is in ``object``.
      * php  -- the ``function`` field of ``function_call_expression`` is a
        ``name`` node rather than an ``identifier``.

    Args:
        node: tree-sitter call node for ``lang_name``.
        lang_name: language key as used in ``_EXT_TO_LANG``.

    Returns:
        The callee's name, or ``None`` if it is not a plain name.
    """
    callee = node.child_by_field_name("function")

    if callee is None and lang_name == "java":
        # Skip receiver-qualified invocations to match the "unqualified calls
        # only" behaviour of the other languages.
        if node.child_by_field_name("object") is None:
            callee = node.child_by_field_name("name")

    plain_types = ("identifier", "name") if lang_name == "php" else ("identifier",)
    if callee is not None and callee.type in plain_types:
        return callee.text.decode("utf-8")
    return None
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
def _walk_ast(node, results: Dict[str, List[str]], lang_name: str) -> None:
    """Collect code entities from a tree-sitter subtree into ``results``.

    Nodes are visited in pre-order (a node before its children, children
    left to right).  Each node is classified at most once, checking the
    categories of ``_LANG_NODE_TYPES[lang_name]`` in the order functions,
    classes, imports, calls.  Children are always visited, whatever the
    node's own classification.

    The traversal uses an explicit stack instead of recursion, so a deeply
    nested tree cannot exceed Python's recursion limit.

    Args:
        node: root of the subtree to scan.
        results: mapping with list values under the keys ``"functions"``,
            ``"classes"``, ``"imports"`` and ``"calls"``; appended to in
            place.
        lang_name: language key; unknown languages leave ``results``
            untouched.
    """
    node_types = _LANG_NODE_TYPES.get(lang_name)
    if node_types is None:
        return

    # Resolve the per-category type sets once rather than once per node.
    function_types = node_types.get("functions", set())
    class_types = node_types.get("classes", set())
    import_types = node_types.get("imports", set())
    call_types = node_types.get("calls", set())

    pending = [node]
    while pending:
        current = pending.pop()
        kind = current.type

        if kind in function_types:
            name_node = current.child_by_field_name("name")
            if name_node:
                results["functions"].append(name_node.text.decode("utf-8"))
        elif kind in class_types:
            name_node = current.child_by_field_name("name")
            if name_node:
                results["classes"].append(name_node.text.decode("utf-8"))
        elif kind in import_types:
            results["imports"].extend(_extract_import_name(current, lang_name))
        elif kind in call_types:
            name = _extract_call_name(current, lang_name)
            if name:
                results["calls"].append(name)

        # Push right-to-left so the leftmost child is popped (visited) first,
        # preserving the order a recursive walk would produce.
        pending.extend(reversed(current.children))
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def _extract_from_source(
    source: bytes, parser: Any, file_path: str
) -> Dict[str, List[str]]:
    """Parse ``source`` and list the functions, classes, imports and calls in it.

    The language is derived from ``file_path``'s extension via
    ``_EXT_TO_LANG`` and the tree is scanned with ``_walk_ast``.  Extraction
    is best-effort: any exception raised while parsing or walking is
    swallowed and whatever was collected up to that point is returned.

    Returns:
        Dict with the keys ``"functions"``, ``"classes"``, ``"imports"`` and
        ``"calls"``, each mapping to a list of names.
    """
    found: Dict[str, List[str]] = {
        key: [] for key in ("functions", "classes", "imports", "calls")
    }
    try:
        syntax_tree = parser.parse(source)
        suffix = Path(file_path).suffix.lower()
        _walk_ast(syntax_tree.root_node, found, _EXT_TO_LANG.get(suffix, ""))
    except Exception:
        # best-effort; parse failures are non-fatal
        pass
    return found
|
|
582
|
+
|
|
583
|
+
# ---------------------------------------------------------------------------
|
|
584
|
+
# DB lifecycle
|
|
585
|
+
# ---------------------------------------------------------------------------
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
def _get_graph_path() -> str:
    """Return the path of the graph file to open.

    The ``MINIGRAF_GRAPH_PATH`` environment variable wins when it is set
    (its value is returned verbatim, even if empty).  Otherwise the default
    is ``memory.graph`` in the current working directory.

    The default is computed only when needed, so an unavailable working
    directory (``Path.cwd()`` raising) cannot break a configuration that
    supplies the path explicitly.
    """
    configured = os.environ.get("MINIGRAF_GRAPH_PATH")
    if configured is not None:
        return configured
    return str(Path.cwd() / "memory.graph")
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
def _open_db_at(path: str) -> MiniGrafDb:
    """Open the graph at ``path``, register all rules and record its mtime.

    Steps:
      1. Release the currently held module-level handle, if any.
      2. Open a fresh ``MiniGrafDb`` and execute every rule in
         ``SESSION_RULES`` followed by every rule in ``_user_rules``.
      3. Only then publish the handle as ``_db`` and update ``_graph_path``
         and ``_db_mtime`` (``0.0`` when the file cannot be stat'ed).

    The handle is published last so that a failure while registering rules
    never leaves a half-initialised handle behind for ``get_db()`` to hand
    out; after such a failure ``_db`` is ``None`` and the next ``get_db()``
    call retries the open.

    Args:
        path: filesystem path of the ``.graph`` file.

    Returns:
        The newly opened, fully initialised handle.

    Raises:
        Whatever ``MiniGrafDb.open`` / ``execute`` raise; not caught here.
    """
    global _db, _graph_path, _db_mtime

    # Drop our reference to the previous handle *before* opening the new one.
    # Per the module-level note on minigraf's Drop behaviour, a handle writes
    # to the file when it is released, which would invalidate a replacement
    # that was already open.  NOTE(review): the release only happens here if
    # no caller still holds a reference to the old handle.
    _db = None

    db = MiniGrafDb.open(path)
    for rule in SESSION_RULES:
        db.execute(rule)
    for rule in _user_rules:
        db.execute(rule)

    _db = db
    _graph_path = path
    try:
        _db_mtime = os.path.getmtime(path)
    except OSError:
        _db_mtime = 0.0
    return db
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
def open_db(graph_path: Optional[str] = None) -> MiniGrafDb:
    """Open the graph database and register the rules; startup entry point.

    Args:
        graph_path: explicit path of the graph file.  When omitted (or
            empty) the path comes from ``_get_graph_path()``.

    Returns:
        The handle produced by ``_open_db_at``.
    """
    target = graph_path if graph_path else _get_graph_path()
    return _open_db_at(target)
|
|
611
|
+
|
|
612
|
+
|
|
613
|
+
def _update_mtime() -> None:
    """Re-read the graph file's mtime into ``_db_mtime``.

    Called after our own checkpoint so that the write we just made is not
    mistaken for an external modification on the next staleness check.
    Does nothing when no graph path has been recorded yet; if the file
    cannot be stat'ed the previous value is kept.
    """
    global _db_mtime
    if _graph_path:
        try:
            _db_mtime = os.path.getmtime(_graph_path)
        except OSError:
            pass
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
def _refresh_if_stale() -> None:
    """Reopen the DB when the graph file's mtime differs from the recorded one.

    Background (from the module notes): minigraf's Drop impl writes to the
    file even for read-only handles (upstream bug), so any other process
    that opens the same file -- including the prepare/finalize hooks --
    changes the mtime and invalidates this process's in-memory page table.
    A changed mtime is therefore treated as "our handle is stale" and the
    file is reopened via ``_open_db_at``.

    No-op when no graph path is recorded or the file cannot be stat'ed.
    """
    if not _graph_path:
        return
    try:
        on_disk = os.path.getmtime(_graph_path)
    except OSError:
        return
    if on_disk != _db_mtime:
        _open_db_at(_graph_path)
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
def get_db() -> MiniGrafDb:
    """Return the module-level DB handle, opening it first if none is held.

    When a reopen is needed, the last recorded graph path is used, falling
    back to ``_get_graph_path()``.

    Design note carried over from the original: the handle is meant to be
    released after each call_tool() invocation so that the prepare_hook
    subprocess can acquire the file lock between turns.  That release
    happens outside this function.
    """
    if _db is not None:
        return _db
    _open_db_at(_graph_path or _get_graph_path())
    return _db
|
|
653
|
+
|
|
654
|
+
|
|
655
|
+
# ---------------------------------------------------------------------------
|
|
656
|
+
# Result parsing
|
|
657
|
+
# ---------------------------------------------------------------------------
|
|
658
|
+
|
|
659
|
+
def _parse_query_result(raw_json: str) -> Dict[str, Any]:
    """Parse the JSON text returned by ``MiniGrafDb.execute()`` for a query.

    Args:
        raw_json: JSON document; expected to be an object with an optional
            ``"results"`` member.

    Returns:
        ``{"ok": True, "results": [...]}`` on success (``[]`` when the
        member is absent), or ``{"ok": False, "error": "..."}`` when the
        text is not valid JSON or is valid JSON that is not an object.
        The error message includes the first 200 characters of the input.
    """
    try:
        data = json.loads(raw_json)
    except json.JSONDecodeError as e:
        return {"ok": False, "error": f"Unexpected result format: {e} — raw: {raw_json[:200]}"}
    if not isinstance(data, dict):
        # Valid JSON but e.g. a bare list or number: report it through the
        # normal error channel instead of failing on ``.get``.
        return {
            "ok": False,
            "error": (
                "Unexpected result format: expected a JSON object, got "
                f"{type(data).__name__} — raw: {raw_json[:200]}"
            ),
        }
    return {"ok": True, "results": data.get("results", [])}
|
|
666
|
+
|
|
667
|
+
|
|
668
|
+
def _parse_tx_result(raw_json: str) -> Dict[str, Any]:
    """Parse the JSON text returned by ``MiniGrafDb.execute()`` for a write.

    Used for both transact and retract commands.

    Args:
        raw_json: JSON document; expected to be an object with an optional
            ``"tx"`` member.

    Returns:
        ``{"ok": True, "tx": "<id>"}`` on success, where the id is
        stringified and defaults to ``"unknown"`` when absent, or
        ``{"ok": False, "error": "..."}`` when the text is not valid JSON or
        is valid JSON that is not an object.  The error message includes the
        first 200 characters of the input.
    """
    try:
        data = json.loads(raw_json)
    except json.JSONDecodeError as e:
        return {"ok": False, "error": f"Unexpected result format: {e} — raw: {raw_json[:200]}"}
    if not isinstance(data, dict):
        # Valid JSON but not an object: report it through the normal error
        # channel instead of failing on ``.get``.
        return {
            "ok": False,
            "error": (
                "Unexpected result format: expected a JSON object, got "
                f"{type(data).__name__} — raw: {raw_json[:200]}"
            ),
        }
    return {"ok": True, "tx": str(data.get("tx", "unknown"))}
|
|
675
|
+
|
|
676
|
+
|
|
677
|
+
# ---------------------------------------------------------------------------
|
|
678
|
+
# Explicit agent tool handlers
|
|
679
|
+
# ---------------------------------------------------------------------------
|
|
680
|
+
|
|
681
|
+
def handle_minigraf_query(datalog: str) -> Dict[str, Any]:
    """Run a Datalog query against the graph.

    Args:
        datalog: query body; it is wrapped as ``(query <datalog>)`` before
            being handed to the database.

    Returns:
        ``{"ok": True, "results": [...]}`` on success, otherwise
        ``{"ok": False, "error": "..."}`` (database error or unparseable
        response).
    """
    handle = get_db()
    command = f"(query {datalog})"
    try:
        return _parse_query_result(handle.execute(command))
    except MiniGrafError as exc:
        return {"ok": False, "error": str(exc)}
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def handle_minigraf_transact(facts: str, reason: str) -> Dict[str, Any]:
    """Transact facts into the graph. ``reason`` is required.

    ``:valid-from`` is set to the current UTC millisecond timestamp (as
    produced by ``_now_utc_ms()``), so every agent-initiated write has a
    recorded valid time, enabling correct bi-temporal queries.

    Flow:
      1. Reject a missing or blank ``reason``.
      2. Schema-check the facts that ``_parse_transact_facts`` can parse.
      3. Refresh a stale handle, execute the transact, checkpoint, and
         record the new file mtime.
      4. Invalidate the index cache and parse the response.

    Args:
        facts: Datalog fact vector, inserted verbatim into the command.
        reason: human-readable justification; echoed back on success.

    Returns:
        ``{"ok": True, "tx": ..., "reason": ...}`` on success, otherwise
        ``{"ok": False, "error": "..."}``.
    """
    if not reason or not reason.strip():
        return {"ok": False, "error": "reason is required for all writes"}
    # Schema validation — closed-world enforcement on parseable string-valued triples.
    # Only string-valued triples are schema-validated. Keyword-valued triples
    # (e.g. relationship edges like [:service/auth :calls :component/jwt]) are
    # not covered by MINIGRAF_SCHEMA and pass through unvalidated by design.
    parsed = _parse_transact_facts(facts)
    if parsed:
        violations = _validate_facts(parsed)
        if violations:
            return {"ok": False, "error": f"schema violations: {'; '.join(violations)}"}
    _refresh_if_stale()
    db = get_db()
    try:
        raw = db.execute(f'(transact {facts} {{:valid-from "{_now_utc_ms()}"}})')
        db.checkpoint()
        # Record our own write so it is not seen as an external modification.
        _update_mtime()
        # The write has been applied once execute/checkpoint returned, so any
        # cached index is out of date even if the response below turns out
        # to be unparseable.
        _index_cache.invalidate()
        result = _parse_tx_result(raw)
        if result["ok"]:
            result["reason"] = reason
        return result
    except MiniGrafError as e:
        return {"ok": False, "error": str(e)}
|
|
721
|
+
|
|
722
|
+
|
|
723
|
+
def handle_minigraf_retract(facts: str, reason: str) -> Dict[str, Any]:
    """Retract facts from the graph. ``reason`` is required.

    After refreshing a stale handle, the retract is executed and
    checkpointed, the file mtime is re-recorded, the index cache is
    invalidated, and the response is parsed.

    Args:
        facts: Datalog fact vector, inserted verbatim into the command.
        reason: human-readable justification; echoed back on success.

    Returns:
        ``{"ok": True, "tx": ..., "reason": ...}`` on success, otherwise
        ``{"ok": False, "error": "..."}``.
    """
    if not reason or not reason.strip():
        return {"ok": False, "error": "reason is required for retract"}
    _refresh_if_stale()
    db = get_db()
    try:
        raw = db.execute(f"(retract {facts})")
        db.checkpoint()
        # Record our own write so it is not seen as an external modification.
        _update_mtime()
        # The retraction has been applied once execute/checkpoint returned,
        # so cached index data is stale regardless of whether the response
        # can be parsed.
        _index_cache.invalidate()
        result = _parse_tx_result(raw)
        if result["ok"]:
            result["reason"] = reason
        return result
    except MiniGrafError as e:
        return {"ok": False, "error": str(e)}
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
def handle_minigraf_rule(rule: str) -> Dict[str, Any]:
    """Register a Datalog rule so later queries can use it.

    A successfully registered rule is remembered in ``_user_rules`` for the
    lifetime of the server session so it can be re-registered whenever the DB
    is reopened. To keep a rule across server restarts, add it to
    SESSION_RULES in mcp_server.py.

    Syntax:  [(rule-name ?arg ...) body-clause ...]
    Example: [(ancestor ?a ?d) [?a :parent ?d]]

    Returns:
        ``{"ok": True, "rule": rule}`` on success, otherwise
        ``{"ok": False, "error": ...}`` when the database rejects the rule.
    """
    global _user_rules
    graph = get_db()
    expression = f"(rule {rule})"
    try:
        graph.execute(expression)
    except MiniGrafError as exc:
        return {"ok": False, "error": str(exc)}

    # Remember each distinct expression once.
    if expression not in _user_rules:
        _user_rules.append(expression)
    return {"ok": True, "rule": rule}
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
def handle_minigraf_report_issue(
    category: str,
    description: str,
    datalog: Optional[str] = None,
    error: Optional[str] = None,
) -> Dict[str, Any]:
    """Forward an issue report to ``report_issue.report_issue``.

    The helper module is imported lazily inside the guarded region, so both an
    import failure and any failure raised by the helper are reported as
    ``{"ok": False, "error": ...}`` instead of propagating.

    Returns:
        ``{"ok": True}`` when the helper returns normally.
    """
    try:
        from report_issue import report_issue as _submit

        _submit(category, description, datalog=datalog, error=error)
    except Exception as exc:  # tool boundary: report, never raise
        return {"ok": False, "error": str(exc)}
    return {"ok": True}
|
|
777
|
+
|
|
778
|
+
|
|
779
|
+
def handle_minigraf_audit(as_of: Optional[int] = None) -> Dict[str, Any]:
    """Audit graph entities against MINIGRAF_SCHEMA.

    Current state (as_of=None): validates all entities and retracts violators.
    Point-in-time (as_of=N): reports violations only — no retractions.

    Ported from Schema.audit_as_of() in minigraf-examples minigraf-schema crate.

    Args:
        as_of: Value spliced into an ``:as-of`` query clause. ``None`` audits
            the current state and enables retraction of violating entities.

    Returns:
        ``{"ok": True, "audited": int, "retracted": int, "violations": [...]}``
        where each violation is ``{"entity": ident, "detail": message}``.
        Entities whose queries or retractions fail are skipped (best-effort).
    """
    _refresh_if_stale()
    db = get_db()
    audited = 0
    retracted = 0
    all_violations: List[Dict[str, Any]] = []

    as_of_clause = f":as-of {as_of} " if as_of is not None else ""

    for entity_type in MINIGRAF_SCHEMA:
        # Step 1: Find all entity UUIDs of this type.
        type_query = (
            f"[:find ?e {as_of_clause}"
            f":where [?e :entity-type :type/{entity_type}]]"
        )
        try:
            type_rows = handle_minigraf_query(type_query).get("results", [])
        except Exception:
            continue

        for row in type_rows:
            if not row:
                continue
            entity_uuid = row[0]
            audited += 1

            # Step 2: Fetch all attributes using #uuid tagged literal.
            # minigraf's EDN parser treats #uuid "..." as EdnValue::Uuid and routes
            # it through edn_to_entity_id directly — no keyword-to-UUID derivation
            # needed and no join-variable round-trip problem.
            attr_query = (
                f'[:find ?a ?v {as_of_clause}'
                f':where [#uuid "{entity_uuid}" ?a ?v]]'
            )
            try:
                attr_rows = handle_minigraf_query(attr_query).get("results", [])
            except Exception:
                continue

            # Extract keyword ident from the stored :ident datom for reporting.
            # Falls back to the UUID string if :ident was not written.
            kw_ident = next(
                (v for a, v in attr_rows if a == ":ident" and isinstance(v, str)),
                entity_uuid,
            )

            # Exclude system attributes from schema validation.
            attr_facts = [
                {
                    "entity": kw_ident,
                    "entity_type": entity_type,
                    "attribute": a,
                    "value": v,
                }
                for a, v in attr_rows
                if a not in _SYSTEM_ATTRS
            ]

            if not attr_facts:
                # Placeholder so an attribute-less entity is still validated
                # (and therefore reported) instead of being skipped.
                attr_facts = [{"entity": kw_ident, "entity_type": entity_type,
                               "attribute": ":__no_attributes__", "value": ""}]

            violations = _validate_facts(attr_facts)
            if not violations:
                continue
            all_violations.extend(
                {"entity": kw_ident, "detail": detail} for detail in violations
            )

            if as_of is not None:
                continue  # point-in-time audits only report

            # Retract using #uuid tagged literal — works even without knowing
            # the original keyword ident. History preserved (bi-temporal).
            try:
                retract_triples = [
                    f'[#uuid "{entity_uuid}" :entity-type :type/{entity_type}]',
                ]
                # Only string-valued datoms are retracted individually.
                # _edn_escape handles backslashes as well as quotes, so values
                # such as Windows paths still yield a well-formed EDN literal.
                retract_triples.extend(
                    f'[#uuid "{entity_uuid}" {a} "{_edn_escape(v)}"]'
                    for a, v in attr_rows
                    if isinstance(v, str)
                )
                db.execute(f"(retract [{' '.join(retract_triples)}])")
                db.checkpoint()
                _update_mtime()
                retracted += 1
            except Exception:
                # Best-effort: a failed retraction leaves the entity in place;
                # it is still listed in "violations" but not counted as retracted.
                continue

    if retracted:
        # Same as the transact/retract handlers: the graph changed, so the
        # cached retrieval index must be rebuilt.
        _index_cache.invalidate()

    return {
        "ok": True,
        "audited": audited,
        "retracted": retracted,
        "violations": all_violations,
    }
|
|
882
|
+
|
|
883
|
+
|
|
884
|
+
# ---------------------------------------------------------------------------
|
|
885
|
+
# memory_prepare_turn
|
|
886
|
+
# ---------------------------------------------------------------------------
|
|
887
|
+
|
|
888
|
+
# Common English function words dropped by _extract_entities. The literal
# repeats a few words ("our", "was", "we", ...); frozenset() de-duplicates them.
_STOP_WORDS = frozenset(
    "a an the is are was were be been being have has had do does did will would could should "
    "may might shall can need dare ought used to am i we you he she it they what which who "
    "this that these those my our your his her its their about above after all also and as at "
    "before but by for from if in into just me more most no not of on only or other our out "
    "same so than then there they through to too under up us very via was we what when where "
    "which while who why with".split()
)

# Tokens shorter than this (after punctuation stripping) are not treated as
# candidate entities by _extract_entities.
_MIN_ENTITY_LEN = 4
|
|
898
|
+
|
|
899
|
+
|
|
900
|
+
def _canonical_ident(entity_type: str, value: str) -> str:
    """Turn ``value`` into a slugged Minigraf keyword ident ``:<entity_type>/<slug>``.

    The slug is the lowercased value with every character outside
    ``[a-z0-9-]`` replaced by a hyphen, runs of hyphens collapsed to one, and
    hyphens trimmed from both ends. ``entity_type`` is used as given.
    Ported from _to_kw() in minigraf-examples LlamaIndex integration.
    """
    lowered = value.lower()
    hyphenated = re.sub(r"[^a-z0-9-]", "-", lowered)
    collapsed = re.sub(r"-+", "-", hyphenated)
    trimmed = collapsed.strip("-")
    return f":{entity_type}/{trimmed}"
|
|
910
|
+
|
|
911
|
+
|
|
912
|
+
def _resolve_module_import(import_name: str, file_entities: Dict[str, List[str]]) -> str:
    """Map an import name onto the ident of a known module file.

    Lookup order:

    1. Exact paths ``<root>/<name>.rs`` then ``<root>/<name>/mod.rs`` for the
       roots ``src``, ``lib`` and the repository root, in that order. Checking
       these first means a nested file such as ``src/graph/storage.rs`` is
       never chosen for a top-level ``use crate::storage`` import.
    2. Otherwise, the first key of ``file_entities`` whose file stem is ``mod``
       and whose immediate parent directory is named ``import_name``. The
       code applies no further restriction on how deep that directory sits.
    3. Otherwise ``_canonical_ident("module", import_name)``, so external
       crate names (std, tokio, …) still get an edge even though they have no
       ``:path`` attribute.
    """
    # Priority 1: conventional module locations under the usual source roots.
    for src_root in ("src", "lib", ""):
        base = f"{src_root}/" if src_root else ""
        for candidate in (f"{base}{import_name}.rs", f"{base}{import_name}/mod.rs"):
            if candidate in file_entities:
                return _code_ident("module", candidate)

    # Priority 2: any "<import_name>/mod.*" file among the known paths.
    for known_path in file_entities:
        location = Path(known_path)
        if location.stem == "mod" and location.parent.name == import_name:
            return _code_ident("module", known_path)

    # Priority 3: no file matched; slug the bare name.
    return _canonical_ident("module", import_name)
|
|
941
|
+
|
|
942
|
+
|
|
943
|
+
def _code_ident(entity_type: str, file_path: str, name: Optional[str] = None) -> str:
    """Build the canonical ident for a file, or for a named entity inside it.

    When ``name`` is truthy the slugged value is ``"<file_path>::<name>"``, so
    the entity name follows the file extension in the slug and stays distinct
    from a file whose path merely ends with that name (e.g.
    'src/auth_login.py'). Slugging turns the '::' separator into '-', so this
    is best-effort: contrived path/name pairs can still collide.
    """
    qualified = f"{file_path}::{name}" if name else file_path
    return _canonical_ident(entity_type, qualified)
|
|
958
|
+
|
|
959
|
+
|
|
960
|
+
# ---------------------------------------------------------------------------
|
|
961
|
+
# Git helpers
|
|
962
|
+
# ---------------------------------------------------------------------------
|
|
963
|
+
|
|
964
|
+
|
|
965
|
+
def _git_commits(
    repo_path: str,
    watermark_hash: Optional[str],
    branch: str = "HEAD",
) -> List[tuple]:
    """Return list of (hash, ts_iso, author_email, subject) in chronological order.

    Args:
        repo_path: Directory in which ``git log`` is run.
        watermark_hash: When truthy, only commits in ``watermark_hash..branch``
            are listed; otherwise the whole history of ``branch``.
        branch: Revision whose history is walked.

    Returns:
        One tuple per commit, oldest first (``git log --reverse``).
        ``ts_iso`` is the author timestamp rendered as ``YYYY-MM-DDTHH:MM:SSZ``
        in UTC.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero.
        OSError: if git or ``repo_path`` cannot be used.
    """
    # Fields are separated by the ASCII unit separator (%x1f) rather than a
    # space, so an author e-mail containing spaces cannot shift the columns.
    field_sep = "\x1f"
    range_spec = f"{watermark_hash}..{branch}" if watermark_hash else branch
    result = _subprocess.run(
        ["git", "log", "--reverse", "--format=%H%x1f%at%x1f%ae%x1f%s", range_spec],
        cwd=repo_path, capture_output=True, text=True, check=True,
    )
    commits = []
    # Split on "\n" only: str.splitlines() also breaks on characters such as
    # form feed that may legitimately appear inside a subject line.
    for line in result.stdout.split("\n"):
        if not line.strip():
            continue
        fields = line.split(field_sep, 3)
        if len(fields) < 3:
            continue  # not a record produced by our format; ignore it
        hash_, ts_unix, author = fields[0], int(fields[1]), fields[2]
        subject = fields[3] if len(fields) > 3 else ""
        # datetime.timezone.utc (not the 3.11-only datetime.UTC alias) keeps
        # this consistent with _now_utc_ms and usable on older interpreters.
        ts_iso = datetime.datetime.fromtimestamp(
            ts_unix, datetime.timezone.utc
        ).strftime("%Y-%m-%dT%H:%M:%SZ")
        commits.append((hash_, ts_iso, author, subject))
    return commits
|
|
988
|
+
|
|
989
|
+
|
|
990
|
+
def _git_changed_files(repo_path: str, commit_hash: str) -> List[tuple]:
    """List the files touched by a commit as ``(status_char, path)`` tuples.

    Runs ``git diff-tree --name-status`` (with ``--root`` so root commits are
    listed too) in ``repo_path``. ``status_char`` is the first character of
    git's status field (A, M, D, R, C, ...); ``path`` is everything after the
    first tab. Lines without a tab are ignored.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero.
    """
    proc = _subprocess.run(
        ["git", "diff-tree", "--no-commit-id", "-r", "--name-status", "--root", commit_hash],
        cwd=repo_path, capture_output=True, text=True, check=True,
    )
    changes = []
    for entry in proc.stdout.strip().splitlines():
        if not entry.strip():
            continue
        status, tab, path = entry.partition("\t")
        if tab:
            # Keep only the leading letter, e.g. "R100" becomes "R".
            changes.append((status[0], path))
    return changes
|
|
1005
|
+
|
|
1006
|
+
|
|
1007
|
+
def _edn_escape(s: str) -> str:
    """Escape ``s`` for use inside an EDN double-quoted string literal.

    Each backslash is doubled and each double quote is prefixed with a
    backslash; every other character is left unchanged.
    """
    # A single translate() pass is equivalent to replacing backslashes first
    # and quotes second, because each input character is mapped independently.
    return s.translate({ord("\\"): "\\\\", ord('"'): '\\"'})
|
|
1010
|
+
|
|
1011
|
+
|
|
1012
|
+
def _git_file_content(repo_path: str, commit_hash: str, file_path: str) -> bytes:
    """Read a file as it existed at ``commit_hash`` via ``git show``.

    Returns:
        The undecoded stdout bytes of ``git show <commit_hash>:<file_path>``.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero.
    """
    blob_spec = f"{commit_hash}:{file_path}"
    proc = _subprocess.run(
        ["git", "show", blob_spec],
        cwd=repo_path, capture_output=True, check=True,
    )
    return proc.stdout
|
|
1019
|
+
|
|
1020
|
+
|
|
1021
|
+
def _git_parent_hashes(repo_path: str, commit_hash: str) -> List[str]:
    """List the parent hashes of ``commit_hash``.

    Returns:
        The whitespace-separated ``%P`` field of ``git log -1`` as a list;
        empty when git prints no parents (root commits).

    Raises:
        subprocess.CalledProcessError: if git exits non-zero.
    """
    proc = _subprocess.run(
        ["git", "log", "-1", "--format=%P", commit_hash],
        cwd=repo_path, capture_output=True, text=True, check=True,
    )
    # str.split() with no arguments ignores surrounding whitespace and yields
    # [] for blank output, so no separate emptiness check is needed.
    return proc.stdout.split()
|
|
1029
|
+
|
|
1030
|
+
|
|
1031
|
+
def _git_tags(repo_path: str) -> List[tuple]:
    """List every tag in the repo as ``(tag_name, commit_hash, date_iso)``.

    ``commit_hash`` is the dereferenced object (``%(*objectname)``) when git
    reports one, i.e. for annotated tags, and the tag's own object otherwise
    (lightweight tags). ``date_iso`` is git's ``creatordate`` in iso-strict
    form (tagger date for annotated tags, commit date for lightweight ones),
    or ``""`` when that column is missing. Tags are listed in
    ``version:refname`` order.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero.
    """
    proc = _subprocess.run(
        ["git", "tag", "-l", "--sort=version:refname",
         "--format=%(refname:short)\t%(*objectname)\t%(objectname)\t%(creatordate:iso-strict)"],
        cwd=repo_path, capture_output=True, text=True, check=True,
    )
    tags = []
    for record in proc.stdout.strip().splitlines():
        columns = record.split("\t", 3)
        # Blank lines yield a single column and are rejected here as well.
        if not record.strip() or len(columns) < 3:
            continue
        peeled = columns[1].strip()  # non-empty for annotated tags
        commit_hash = peeled if peeled else columns[2].strip()
        if not commit_hash:
            continue
        created = columns[3].strip() if len(columns) > 3 else ""
        tags.append((columns[0], commit_hash, created))
    return tags
|
|
1059
|
+
|
|
1060
|
+
|
|
1061
|
+
# ---------------------------------------------------------------------------
|
|
1062
|
+
# Bi-temporal write helpers
|
|
1063
|
+
# ---------------------------------------------------------------------------
|
|
1064
|
+
|
|
1065
|
+
|
|
1066
|
+
def _build_close_triples(
    ident: str,
    description: str,
    module_ident: str,
) -> List[str]:
    """Assemble the triple strings used to bi-temporally close an entity.

    Always emits the entity's ``:ident`` fact and its ``:description`` fact
    (both string-escaped). Unless the entity is the module itself
    (``ident == module_ident``), the parent module's ``:contains`` edge to the
    entity is appended as a third triple.
    """
    closing = [
        f'[{ident} :ident "{_edn_escape(ident)}"]',
        f'[{ident} :description "{_edn_escape(description)}"]',
    ]
    if ident == module_ident:
        # A module has no parent-module edge to close here.
        return closing
    closing.append(f"[{module_ident} :contains {ident}]")
    return closing
|
|
1084
|
+
|
|
1085
|
+
|
|
1086
|
+
def _ingest_transact(
    db: Any,
    triples: List[str],
    commit_ts_iso: str,
    reason: str,
) -> None:
    """Write code-structure triples with ``:valid-from`` set to the commit time.

    Does nothing when ``triples`` is empty. ``reason`` is accepted for
    signature parity with the other ingest helpers but is not used here.
    """
    if triples:
        batch = "[" + " ".join(triples) + "]"
        db.execute(f'(transact {batch} {{:valid-from "{commit_ts_iso}"}})')
|
|
1097
|
+
|
|
1098
|
+
|
|
1099
|
+
def _ingest_close(
    db: Any,
    triples: List[str],
    original_ts_iso: str,
    commit_ts_iso: str,
    reason: str,
) -> None:
    """End the valid window of ``triples`` at the deleting commit's timestamp.

    Performed in two phases:

    1. Every triple is retracted on its own so the open-ended assertion leaves
       the current-time view (retract takes no temporal options; transaction
       history still holds the original).
    2. The full set is transacted again with explicit ``:valid-from`` and
       ``:valid-to`` so point-in-time queries still see the historical window.

    Retractions are issued one at a time to avoid EAVT collisions on
    ``:contains`` edges (Minigraf's pending index omits value bytes, so a
    batch of ``[module :contains fn]`` retracts could collide).

    Does nothing when ``triples`` is empty. ``reason`` is not used here.
    """
    if not triples:
        return

    for fact in triples:
        try:
            db.execute(f"(retract [{fact}])")
        except Exception:
            # Best-effort: the original may not exist if preload was incomplete.
            pass

    batch = "[" + " ".join(triples) + "]"
    db.execute(
        f'(transact {batch} {{:valid-from "{original_ts_iso}" :valid-to "{commit_ts_iso}"}})'
    )
|
|
1130
|
+
|
|
1131
|
+
|
|
1132
|
+
def _watermark_query(db: Any) -> Optional[str]:
    """Look up the hash stored on ``:ingestion/watermark``.

    Returns:
        The first column of the first result row, or ``None`` when the query
        yields no rows (no watermark recorded yet).
    """
    response = db.execute("(query [:find ?h :where [:ingestion/watermark :hash ?h]])")
    rows = json.loads(response).get("results", [])
    if not rows:
        return None
    return rows[0][0]
|
|
1137
|
+
|
|
1138
|
+
|
|
1139
|
+
def _total_ingested_query(db: Any) -> int:
    """Read the ``:total-ingested`` counter stored on ``:ingestion/last-run-at``.

    The query uses ``:any-valid-time``, so valid-time filtering is disabled.

    Returns:
        The first column of the first result row converted with ``int()``,
        or ``0`` when the query yields no rows.
    """
    response = db.execute(
        "(query [:find ?n :any-valid-time :where [:ingestion/last-run-at :total-ingested ?n]])"
    )
    rows = json.loads(response).get("results", [])
    if not rows:
        return 0
    return int(rows[0][0])
|
|
1144
|
+
|
|
1145
|
+
|
|
1146
|
+
def _watermark_update(db: Any, commit_hash: str, commit_ts_iso: str, reason: str) -> None:
    """Replace the stored ingestion watermark with ``commit_hash``.

    Any existing ``:hash`` value is retracted first, then the watermark entity
    (type tag, ident, description and new hash) is transacted with
    ``:valid-from`` set to ``commit_ts_iso``. ``reason`` is not used here.
    """
    previous = _watermark_query(db)
    if previous:
        db.execute(f'(retract [[:ingestion/watermark :hash "{previous}"]])')

    statement = (
        f'(transact [[:ingestion/watermark :entity-type :type/ingestion] '
        f'[:ingestion/watermark :ident ":ingestion/watermark"] '
        f'[:ingestion/watermark :description "git ingestion watermark"] '
        f'[:ingestion/watermark :hash "{commit_hash}"]] '
        f'{{:valid-from "{commit_ts_iso}"}})'
    )
    db.execute(statement)
|
|
1158
|
+
|
|
1159
|
+
|
|
1160
|
+
def _last_run_write(db: Any, commit_hash: str, run_at: str, total_ingested: int) -> None:
    """Store run metadata on the ``:ingestion/last-run-at`` entity.

    One transaction writes the entity's type tag, ident and description
    together with ``run_at``, ``commit_hash`` and ``total_ingested``. No
    temporal options are passed to the transaction.
    """
    statement = (
        f'(transact [[:ingestion/last-run-at :entity-type :type/ingestion] '
        f'[:ingestion/last-run-at :ident ":ingestion/last-run-at"] '
        f'[:ingestion/last-run-at :description "last ingestion run timestamp"] '
        f'[:ingestion/last-run-at :last-run-at "{run_at}"] '
        f'[:ingestion/last-run-at :last-commit "{commit_hash}"] '
        f'[:ingestion/last-run-at :total-ingested {total_ingested}]])'
    )
    db.execute(statement)
|
|
1170
|
+
|
|
1171
|
+
|
|
1172
|
+
# System attributes written by _transact_extracted_facts alongside domain attributes.
# They are invisible to schema validation and filtered from attr_facts in minigraf_audit.
_SYSTEM_ATTRS: frozenset = frozenset({":entity-type", ":ident"})

# Closed-world schema consumed by _validate_facts.
# Shape: entity type -> {"required" | "optional"} -> attribute keyword -> Python type.
# Every entity type requires a string :description; all other attributes are
# optional, and any attribute not listed for a type is reported as a violation.
MINIGRAF_SCHEMA: Dict[str, Dict[str, Dict[str, type]]] = {
    "decision": {
        "required": {":description": str},
        "optional": {":rationale": str, ":date": str, ":alias": str},
    },
    "preference": {
        "required": {":description": str},
        "optional": {":rationale": str, ":alias": str},
    },
    "constraint": {
        "required": {":description": str},
        "optional": {":rationale": str, ":alias": str},
    },
    "dependency": {
        "required": {":description": str},
        "optional": {":rationale": str, ":alias": str},
    },
    "module": {
        "required": {":description": str},
        "optional": {
            ":path": str, ":alias": str,
            # graph edges (keyword-valued, stored as strings)
            ":contains": str, ":depends-on": str, ":calls": str,
            # commit cross-references
            ":introduced-by": str, ":modified-in": str,
        },
    },
    "function": {
        "required": {":description": str},
        "optional": {
            ":file": str, ":alias": str,
            ":introduced-by": str, ":modified-in": str,
        },
    },
    "class": {
        "required": {":description": str},
        "optional": {
            ":file": str, ":alias": str,
            ":introduced-by": str, ":modified-in": str,
        },
    },
    # Bookkeeping entities written by _watermark_update / _last_run_write.
    "ingestion": {
        "required": {":description": str},
        "optional": {":hash": str, ":alias": str, ":last-run-at": str, ":last-commit": str, ":total-ingested": int},
    },
    "commit": {
        "required": {":description": str},
        "optional": {
            ":hash": str, ":author": str, ":subject": str, ":date": str, ":alias": str,
            # parent commit reference (keyword-valued edge, stored as string)
            ":parent": str,
        },
    },
}
|
|
1230
|
+
|
|
1231
|
+
|
|
1232
|
+
def _validate_facts(facts: List[Dict[str, Any]]) -> List[str]:
    """Check proposed facts against MINIGRAF_SCHEMA and describe every violation.

    Validation is closed-world: an entity whose type is not in the schema is a
    violation, and so is any attribute the schema does not list for its type.
    Facts whose attribute is in ``_SYSTEM_ATTRS`` are ignored entirely — they
    are internal tags added by _transact_extracted_facts, not domain data.
    Pure function — no DB access. Mirrors Schema.validate() from minigraf-schema.

    Args:
        facts: dicts with "entity", "entity_type", "attribute" and "value".

    Returns:
        Human-readable violation messages; empty when everything conforms.
    """
    # Collect attributes per entity so required attributes can be checked
    # across all of an entity's facts. A later fact for the same attribute
    # overwrites the earlier value.
    attrs_by_entity: Dict[str, Dict[str, Any]] = {}
    type_by_entity: Dict[str, str] = {}
    for fact in facts:
        attribute = fact.get("attribute", "")
        if attribute in _SYSTEM_ATTRS:
            continue  # system attributes bypass schema validation
        entity = fact.get("entity", "")
        attrs_by_entity.setdefault(entity, {})[attribute] = fact.get("value")
        declared_type = fact.get("entity_type", "")
        if declared_type:
            type_by_entity[entity] = declared_type

    def _wrong_type(entity: str, attr: str, expected: type, actual: Any) -> str:
        return (
            f"entity '{entity}' attribute '{attr}' has wrong type "
            f"(expected {expected.__name__}, got {type(actual).__name__})"
        )

    violations: List[str] = []
    for entity, attrs in attrs_by_entity.items():
        entity_type = type_by_entity.get(entity, "")

        # Closed-world: unknown entity type is a violation.
        if entity_type not in MINIGRAF_SCHEMA:
            violations.append(
                f"entity '{entity}' has unknown type '{entity_type}' — "
                f"allowed: {list(MINIGRAF_SCHEMA)}"
            )
            continue

        required = MINIGRAF_SCHEMA[entity_type]["required"]
        optional = MINIGRAF_SCHEMA[entity_type]["optional"]
        allowed = set(required) | set(optional)

        # Required attributes must be present and correctly typed.
        for attr, expected_type in required.items():
            if attr not in attrs:
                violations.append(
                    f"entity '{entity}' missing required attribute '{attr}'"
                )
            elif not isinstance(attrs[attr], expected_type):
                violations.append(_wrong_type(entity, attr, expected_type, attrs[attr]))

        # Optional attributes are only type-checked when supplied.
        violations.extend(
            _wrong_type(entity, attr, optional[attr], value)
            for attr, value in attrs.items()
            if attr in optional and not isinstance(value, optional[attr])
        )

        # Closed-world: unknown attributes are violations.
        violations.extend(
            f"entity '{entity}' has unknown attribute '{attr}' — "
            f"allowed: {sorted(allowed)}"
            for attr in attrs
            if attr not in allowed
        )

    return violations
|
|
1301
|
+
|
|
1302
|
+
|
|
1303
|
+
def _parse_transact_facts(facts_str: str) -> List[Dict[str, Any]]:
    """Parse a Datalog transact string into fact dicts for schema validation.

    Only captures string-valued triples (quoted values). Keyword values
    like :type/decision are skipped — they are internal type tags, not
    user-authored facts subject to schema validation.

    The quoted value may be empty and may contain backslash escapes such as
    ``\\"``; both forms are captured so those triples cannot slip past
    validation. Whitespace between the closing quote and ``]`` is tolerated.

    Args:
        facts_str: EDN text such as ``[[:decision/x :description "..."]]``.

    Returns:
        One dict per matched triple with keys "entity", "entity_type" (the
        keyword namespace, or "" when the entity has no "/"), "attribute" and
        "value". The value is the raw text between the quotes; escape
        sequences are left exactly as written.
    """
    pattern = (
        r'\[(\:[^\s\]]+)\s+(\:[^\s\]]+)\s+'
        # Quoted value: any run of non-quote, non-backslash characters or
        # backslash-escaped characters (so \" does not terminate the string).
        r'"((?:[^"\\]|\\.)*)"\s*\]'
    )
    result = []
    for match in re.finditer(pattern, facts_str, re.DOTALL):
        entity, attribute, value = match.groups()
        entity_type = entity.split("/")[0].lstrip(":") if "/" in entity else ""
        result.append({
            "entity": entity,
            "entity_type": entity_type,
            "attribute": attribute,
            "value": value,
        })
    return result
|
|
1322
|
+
|
|
1323
|
+
|
|
1324
|
+
def _query_canonical_entities() -> str:
    """List known entity idents with their descriptions for prompt injection.

    Works in two steps: fetch every stored ``:ident`` value, then, for at most
    the first 50 rows, look up ``:description`` using the keyword ident as a
    literal entity. This yields readable keyword idents (e.g. :decision/redis)
    rather than the internal UUIDs that a join variable would return.

    Returns:
        One line per entity that has a non-empty description, joined with
        newlines. An empty string when the ident query fails or finds
        nothing, so the caller can omit the section from the prompt.
    """
    try:
        ident_rows = handle_minigraf_query(
            "[:find ?id :where [?e :ident ?id]]"
        ).get("results", [])
    except Exception:
        return ""
    if not ident_rows:
        return ""

    listing = []
    for row in ident_rows[:50]:
        kw_ident = row[0] if row else None
        # Only keyword-looking strings can be spliced into the follow-up query.
        if not (isinstance(kw_ident, str) and kw_ident.startswith(":")):
            continue
        try:
            desc_rows = handle_minigraf_query(
                f"[:find ?desc :where [{kw_ident} :description ?desc]]"
            ).get("results", [])
            desc = desc_rows[0][0] if desc_rows else ""
        except Exception:
            desc = ""
        if desc:
            listing.append(f" {kw_ident} — {desc}")
    return "\n".join(listing)
|
|
1359
|
+
|
|
1360
|
+
|
|
1361
|
+
def _extract_entities(text: str) -> List[str]:
    """Pick candidate entity tokens out of a user message.

    The text is lowercased and split on whitespace; each token has surrounding
    punctuation stripped. Tokens shorter than ``_MIN_ENTITY_LEN`` or listed in
    ``_STOP_WORDS`` are dropped. Order and duplicates are preserved.
    """
    cleaned = (word.strip(".,?!;:\"'()[]") for word in text.lower().split())
    return [
        token
        for token in cleaned
        if len(token) >= _MIN_ENTITY_LEN and token not in _STOP_WORDS
    ]
|
|
1370
|
+
|
|
1371
|
+
|
|
1372
|
+
def _format_facts(results: List[List[str]]) -> str:
    """Render result rows ([attr, val] or [e, attr, val]) as a text block.

    Each row becomes one line: a leading space followed by the row's values,
    converted with ``str`` and joined by `` | ``. Returns ``""`` for empty
    input.
    """
    if not results:
        return ""
    return "\n".join(" " + " | ".join(map(str, row)) for row in results)
|
|
1380
|
+
|
|
1381
|
+
|
|
1382
|
+
# Case-insensitive phrases that mark a message as asking about the past.
_HISTORICAL_SIGNALS = re.compile(
    r"\b(last\s+\w+|yesterday|before|earlier|as\s+of|at\s+the\s+time|back\s+when|previously)\b",
    re.IGNORECASE,
)
# Note: "last <word>" is a broad pattern — "last resort", "last mile", etc. will match.
# When the message contains no date at all, _build_query_clauses falls back to the
# current UTC timestamp, so such false positives cause no harm in practice.

# Matches an ISO date (YYYY-MM-DD) or a slash-separated date with a four-digit
# year (e.g. 5/2/2026). Only the digit layout is checked, not calendar validity.
_DATE_PATTERN = re.compile(
    r"\b(\d{4}-\d{2}-\d{2}|\d{1,2}/\d{1,2}/\d{4})\b"
)
|
|
1392
|
+
|
|
1393
|
+
|
|
1394
|
+
def _is_historical_query(user_message: str) -> bool:
    """Report whether the message contains any ``_HISTORICAL_SIGNALS`` phrase."""
    return _HISTORICAL_SIGNALS.search(user_message) is not None
|
|
1396
|
+
|
|
1397
|
+
|
|
1398
|
+
def _now_utc_ms() -> str:
    """Format the current UTC instant as ISO 8601 with milliseconds and a Z suffix.

    minigraf requires UTC (no timezone offsets) and millisecond precision to
    reliably find facts transacted in the same second as the query.
    Example: "2026-05-02T15:44:52.184Z"
    """
    now = datetime.datetime.now(datetime.timezone.utc)
    # isoformat() renders the UTC offset as "+00:00"; swap it for the "Z" form.
    stamp = now.isoformat(timespec="milliseconds")
    return stamp[: -len("+00:00")] + "Z"
|
|
1406
|
+
|
|
1407
|
+
|
|
1408
|
+
def _build_query_clauses(user_message: str) -> str:
    """
    Return temporal clauses to append to a Datalog query.

    For current-state queries use :valid-at with the current UTC timestamp
    (millisecond precision). This correctly finds all facts whose valid window
    includes right now — including facts transacted earlier the same second —
    while excluding expired/retracted facts and future-dated facts.

    For historical queries where an explicit ISO date is detected in the user
    message, use :valid-at with that date (resolves to midnight UTC on that
    date — intentional for point-in-time historical semantics). The first
    date in the message that is a real ISO calendar date wins; slash-style
    dates (e.g. 5/2/2026) and impossible dates (e.g. 2026-13-45) are ignored
    because minigraf would reject them, and the current timestamp is used
    when no usable date remains.

    minigraf :valid-at accepts: ISO 8601 date ("YYYY-MM-DD" → midnight UTC)
    or UTC datetime with Z suffix ("YYYY-MM-DDTHH:MM:SS.mmmZ").
    Timezone offsets are not supported; :any-valid-time disables filtering.
    """
    if _is_historical_query(user_message):
        for date_match in _DATE_PATTERN.finditer(user_message):
            candidate = date_match.group(1)
            try:
                # Rejects both non-ISO layouts and out-of-range components.
                datetime.date.fromisoformat(candidate)
            except ValueError:
                continue
            return f':valid-at "{candidate}"'
    return f':valid-at "{_now_utc_ms()}"'
|
|
1431
|
+
|
|
1432
|
+
|
|
1433
|
+
# ---------------------------------------------------------------------------
|
|
1434
|
+
# BM25 index — semantic retrieval primitives
|
|
1435
|
+
# ---------------------------------------------------------------------------
|
|
1436
|
+
|
|
1437
|
+
# Ident prefixes of "memory" entity types; FactIndex flags rows whose entity
# starts with one of these so they can receive the memory score boost.
_MEMORY_PREFIXES = (":decision/", ":preference/", ":constraint/", ":dependency/")
|
|
1438
|
+
|
|
1439
|
+
|
|
1440
|
+
def _tokenize(text: str) -> List[str]:
    """Lowercase ``text`` and break it into alphanumeric tokens.

    Any run of characters outside ``[a-z0-9]`` acts as a separator and empty
    pieces are discarded, so fact values and keyword idents tokenize alike:
      ":decision/use-redis" → ["decision", "use", "redis"]
      "use Redis for caching" → ["use", "redis", "for", "caching"]
    """
    pieces = re.split(r"[^a-z0-9]+", text.lower())
    # filter(None, ...) drops the empty strings produced at the edges.
    return list(filter(None, pieces))
|
|
1448
|
+
|
|
1449
|
+
|
|
1450
|
+
class FactIndex:
    """Read-only BM25 index built once from a list of graph fact rows.

    Every row is flattened to text (its cells joined by spaces) and tokenised
    into one BM25 document; rows that produce no tokens are left out. Rows
    whose first cell starts with one of ``_MEMORY_PREFIXES`` are flagged as
    memory facts and get their score multiplied by ``boost`` in ``query``.
    """

    def __init__(self, facts: List[List], boost: float = 2.0) -> None:
        self._boost = boost
        # Start in the "empty index" state; only filled in when there is at
        # least one indexable row AND rank_bm25 is importable.
        self._bm25 = None
        self._facts: List[List] = []
        self._is_memory: List[bool] = []
        self._docs: List[List[str]] = []

        kept_rows: List[List] = []
        kept_docs: List[List[str]] = []
        kept_flags: List[bool] = []
        for row in facts:
            doc = _tokenize(" ".join(str(cell) for cell in row))
            if not doc:
                # Row text has nothing indexable.
                continue
            kept_rows.append(row)
            kept_docs.append(doc)
            kept_flags.append(str(row[0]).startswith(_MEMORY_PREFIXES))

        if not kept_rows or _BM25Okapi is None:
            return

        self._facts = kept_rows
        self._is_memory = kept_flags
        self._docs = kept_docs
        self._bm25 = _BM25Okapi(self._docs)

    def query(self, text: str, top_n: int = 50) -> List[List]:
        """Return at most *top_n* fact rows, best BM25 match first.

        Only rows sharing at least one token with *text* are eligible, and
        memory rows have their score multiplied by the configured boost.
        Returns ``[]`` when the index is empty, *text* has no tokens, or no
        row shares a token with it.
        """
        if self._bm25 is None or not self._facts:
            return []
        query_tokens = _tokenize(text)
        if not query_tokens:
            return []

        raw = self._bm25.get_scores(query_tokens).tolist()

        # BM25Okapi can score a matching document <= 0 in small corpora
        # (negative IDF), so eligibility is decided by token presence rather
        # than by score sign.
        wanted = set(query_tokens)
        hits = [i for i, doc in enumerate(self._docs) if not wanted.isdisjoint(doc)]
        if not hits:
            return []

        # Lift scores so the weakest eligible row sits at >= 1.0; multiplying
        # by the boost then always moves a memory row up, never down.
        shift = max(0.0, 1.0 - min(raw[i] for i in hits))

        scored = []
        for i in hits:
            score = raw[i] + shift
            if self._is_memory[i]:
                score *= self._boost
            scored.append((score, self._facts[i]))

        # Stable sort: equal scores keep their original row order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [row for _, row in scored[:top_n]]
|
|
1512
|
+
|
|
1513
|
+
|
|
1514
|
+
class IndexCache:
|
|
1515
|
+
"""Module-level singleton managing the live BM25 FactIndex.
|
|
1516
|
+
|
|
1517
|
+
Rebuilds asynchronously in a background thread. Serves the stale index
|
|
1518
|
+
during rebuilds; returns None before the first successful rebuild.
|
|
1519
|
+
Invalidation is idempotent while a rebuild is in progress.
|
|
1520
|
+
"""
|
|
1521
|
+
|
|
1522
|
+
def __init__(self) -> None:
|
|
1523
|
+
self._current: Optional[FactIndex] = None
|
|
1524
|
+
self._rebuilding: bool = False
|
|
1525
|
+
self._lock = threading.Lock()
|
|
1526
|
+
|
|
1527
|
+
def get(self) -> Optional[FactIndex]:
|
|
1528
|
+
"""Return the current index (may be stale or None)."""
|
|
1529
|
+
return self._current
|
|
1530
|
+
|
|
1531
|
+
def invalidate(self) -> None:
|
|
1532
|
+
"""Trigger an async rebuild if one is not already running."""
|
|
1533
|
+
if self._rebuilding:
|
|
1534
|
+
return
|
|
1535
|
+
self._rebuilding = True
|
|
1536
|
+
t = threading.Thread(target=self._rebuild, daemon=True)
|
|
1537
|
+
t.start()
|
|
1538
|
+
|
|
1539
|
+
def _rebuild(self) -> None:
|
|
1540
|
+
"""Fetch all currently-valid facts from the DB and swap the index."""
|
|
1541
|
+
try:
|
|
1542
|
+
db = get_db()
|
|
1543
|
+
boost = float(os.environ.get("MINIGRAF_MEMORY_BOOST", "2.0"))
|
|
1544
|
+
raw = db.execute(
|
|
1545
|
+
f'(query [:find ?e ?a ?v :valid-at "{_now_utc_ms()}" :where [?e ?a ?v]])'
|
|
1546
|
+
)
|
|
1547
|
+
facts = json.loads(raw).get("results", [])
|
|
1548
|
+
new_index = FactIndex(facts, boost=boost)
|
|
1549
|
+
with self._lock:
|
|
1550
|
+
self._current = new_index
|
|
1551
|
+
except Exception as e:
|
|
1552
|
+
print(f"[IndexCache] rebuild failed: {e}", file=sys.stderr)
|
|
1553
|
+
finally:
|
|
1554
|
+
self._rebuilding = False
|
|
1555
|
+
|
|
1556
|
+
|
|
1557
|
+
# Module-level IndexCache instance; handle_memory_prepare_turn reads the
# current FactIndex from it via _index_cache.get().
_index_cache = IndexCache()
|
|
1558
|
+
|
|
1559
|
+
|
|
1560
|
+
def _handle_memory_prepare_turn_heuristic(user_message: str) -> str:
    """Substring-based fallback for ``handle_memory_prepare_turn``.

    Used when rank_bm25 is not importable. For every entity returned by
    ``_extract_entities(user_message)`` it runs a ``contains?`` query over
    fact values and collects the distinct result rows. If none of those
    queries yields a row, it falls back to a scan of all facts, truncated to
    ``MINIGRAF_PREPARE_SCAN_LIMIT`` rows (default 50).

    Both query shapes carry the temporal clause produced by
    ``_build_query_clauses(user_message)``.

    Returns:
        ``"Relevant memory context:\\n<formatted facts>"``, or ``""`` when
        nothing was found.
    """
    db = get_db()
    scan_limit = int(os.environ.get("MINIGRAF_PREPARE_SCAN_LIMIT", "50"))
    temporal_clauses = _build_query_clauses(user_message)

    collected: List[List[str]] = []
    seen: set = set()

    for entity in _extract_entities(user_message):
        # The entity text originates from the user message, so escape it
        # before embedding it in a Datalog string literal; a stray quote or
        # backslash would otherwise break (or alter) the query.
        needle = _edn_escape(entity)
        try:
            raw = db.execute(
                f'(query [:find ?a ?v {temporal_clauses} :where [?e ?a ?v] (contains? ?v "{needle}")])'
            )
            rows = json.loads(raw).get("results", [])
        except (MiniGrafError, json.JSONDecodeError):
            # Best effort: a failing entity query must not block the others.
            continue
        for row in rows:
            key = tuple(row)
            if key not in seen:
                seen.add(key)
                collected.append(row)

    if not collected:
        # Broad fallback scan — still respects the temporal clause.
        try:
            raw = db.execute(
                f"(query [:find ?e ?a ?v {temporal_clauses} :where [?e ?a ?v]])"
            )
            collected = json.loads(raw).get("results", [])[:scan_limit]
        except (MiniGrafError, json.JSONDecodeError):
            pass

    if not collected:
        return ""

    block = _format_facts(collected)
    return f"Relevant memory context:\n{block}"
|
|
1611
|
+
|
|
1612
|
+
|
|
1613
|
+
def handle_memory_prepare_turn(user_message: str) -> str:
    """Build the memory-context block for *user_message*.

    When rank_bm25 is importable, the cached ``FactIndex`` is queried and at
    most ``MINIGRAF_PREPARE_SCAN_LIMIT`` (default 50) ranked facts are
    formatted. Otherwise the work is delegated to
    ``_handle_memory_prepare_turn_heuristic``.

    Returns:
        A context string meant for injection as additionalContext, or ``""``
        when no index has been built yet or nothing matched.
    """
    if not _BM25_AVAILABLE:
        return _handle_memory_prepare_turn_heuristic(user_message)

    limit = int(os.environ.get("MINIGRAF_PREPARE_SCAN_LIMIT", "50"))
    index = _index_cache.get()
    hits = index.query(user_message, top_n=limit) if index is not None else []
    if not hits:
        return ""
    return f"Relevant memory context:\n{_format_facts(hits)}"
|
|
1634
|
+
|
|
1635
|
+
|
|
1636
|
+
# ---------------------------------------------------------------------------
|
|
1637
|
+
# Fact extraction — heuristic strategy
|
|
1638
|
+
# ---------------------------------------------------------------------------
|
|
1639
|
+
|
|
1640
|
+
# Signal-phrase table consumed by heuristic_extract. Each entry is
# (regex, entity_type, attribute, reason_prefix): the regex is searched
# case-insensitively and capture group 1 becomes the fact value.
_SIGNAL_PATTERNS = [
    # Each pattern captures a single token after the signal phrase. Articles ("a", "the", etc.)
    # will match first if present (e.g. "depends on the auth-service" → captures "the"), but
    # the stop-word filter below drops them, producing zero facts for that phrase. Users should
    # write "depends on auth-service" (no article) to ensure capture.
    (r"we(?:'ll?|\s+will)\s+use\s+([\w\-]+)", "decision", ":description", "chosen technology or approach"),
    (r"going\s+with\s+([\w\-]+)", "decision", ":description", "chosen approach"),
    (r"decided\s+(?:to\s+)?(?:use\s+)?([\w\-]+)", "decision", ":description", "decided approach"),
    (r"we\s+chose\s+([\w\-]+)", "decision", ":description", "chosen option"),
    (r"I\s+prefer\s+([\w\-]+)", "preference", ":description", "stated preference"),
    (r"I\s+don'?t\s+like\s+([\w\-]+)", "preference", ":description", "stated dislike"),
    (r"always\s+use\s+([\w\-]+)", "preference", ":description", "always-use preference"),
    (r"never\s+use\s+([\w\-]+)", "preference", ":description", "never-use preference"),
    (r"prioritize\s+([\w\-]+)", "preference", ":description", "priority preference"),
    (r"must\s+be\s+([\w\-]+)", "constraint", ":description", "hard constraint"),
    (r"can'?t\s+use\s+([\w\-]+)", "constraint", ":description", "exclusion constraint"),
    (r"depends\s+on\s+([\w\-]+)", "dependency", ":description", "dependency relationship"),
    (r"requires?\s+([\w\-]+)", "dependency", ":description", "required dependency"),
]
|
|
1659
|
+
|
|
1660
|
+
|
|
1661
|
+
def heuristic_extract(text: str) -> List[Dict[str, str]]:
    """Turn signal phrases found in *text* into fact dicts.

    Every regex in ``_SIGNAL_PATTERNS`` is searched case-insensitively.
    The captured token is skipped when it is shorter than two characters,
    is a stop word, or was already produced for the same entity type
    (compared case-insensitively).

    Returns:
        A list of dicts with the keys ``entity``, ``entity_type``,
        ``attribute``, ``value`` and ``reason``, in match order.
    """
    extracted = []
    already_seen: set = set()

    for regex, kind, attr, why in _SIGNAL_PATTERNS:
        for hit in re.finditer(regex, text, re.IGNORECASE):
            token = hit.group(1).strip()
            lowered = token.lower()
            if len(token) < 2 or lowered in _STOP_WORDS:
                continue

            dedup_key = (kind, lowered)
            if dedup_key in already_seen:
                continue
            already_seen.add(dedup_key)

            extracted.append({
                "entity": _canonical_ident(kind, token),
                "entity_type": kind,
                "attribute": attr,
                "value": token,
                "reason": f"{why} — extracted by heuristic strategy",
            })

    return extracted
|
|
1688
|
+
|
|
1689
|
+
|
|
1690
|
+
def _transact_extracted_facts(facts: List[Dict[str, str]], valid_from: Optional[str] = None) -> int:
    """Write extracted fact dicts to the graph and return how many were stored.

    Each fact is validated on its own via ``_validate_facts``; a fact with
    violations is skipped, as is one whose transact raises ``MiniGrafError``.
    Every write carries ``:valid-from`` so valid-time is recorded.

    Args:
        facts: dicts with ``entity``, ``attribute``, ``value`` and optionally
            ``entity_type``.
        valid_from: ISO 8601 timestamp to use as ``:valid-from``. When falsy,
            the current UTC ms timestamp is used for each fact. Pass a past
            date to backdate facts (e.g. from a '; valid-at: YYYY-MM-DD' hint).

    Returns:
        Number of facts whose transact succeeded. When at least one did, the
        DB is checkpointed and ``_update_mtime()`` is called.
    """
    _refresh_if_stale()
    db = get_db()
    written = 0

    for fact in facts:
        ident = fact["entity"]
        kind = fact.get("entity_type", "")
        attr = fact["attribute"]
        val = fact["value"]

        # Schema validation — closed-world: invalid facts are dropped.
        if _validate_facts([fact]):
            continue

        timestamp = valid_from or _now_utc_ms()

        # NOTE(review): `val` is embedded in a string literal without escaping
        # here — confirm upstream producers never emit quotes/backslashes, or
        # escape at this point.
        parts = [f'[{ident} {attr} "{val}"]']
        if kind:
            # The main fact, its :entity-type tag and its :ident string go in
            # a single (transact [...]) so they land in one transaction.
            # :ident keeps the keyword ident available as a plain string value.
            parts.append(f'[{ident} :entity-type :type/{kind}]')
            parts.append(f'[{ident} :ident "{ident}"]')
        triples = " ".join(parts)

        try:
            db.execute(f'(transact [{triples}] {{:valid-from "{timestamp}"}})')
        except MiniGrafError:
            continue
        written += 1

    if written:
        db.checkpoint()
        _update_mtime()
    return written
|
|
1737
|
+
|
|
1738
|
+
|
|
1739
|
+
# ---------------------------------------------------------------------------
|
|
1740
|
+
# Fact extraction — llm strategy
|
|
1741
|
+
# ---------------------------------------------------------------------------
|
|
1742
|
+
|
|
1743
|
+
_LLM_EXTRACTION_PROMPT = """You are a memory extraction assistant for a bi-temporal graph database. Review the conversation below and identify any decisions, preferences, constraints, or dependencies that should be stored in long-term memory.
|
|
1744
|
+
|
|
1745
|
+
Return ONLY a Datalog transact expression — a list of triples in this exact format:
|
|
1746
|
+
[[:entity/ident :attribute "value"]
|
|
1747
|
+
[:entity/ident :attribute "value"]]
|
|
1748
|
+
|
|
1749
|
+
If nothing worth storing was found, return an empty list: []
|
|
1750
|
+
|
|
1751
|
+
Allowed entity type prefixes: :decision/ :preference/ :constraint/ :dependency/
|
|
1752
|
+
Canonical ident form: lowercase, hyphens only — :decision/redis not :decision/Redis_cache.
|
|
1753
|
+
{canonical_entities_section}
|
|
1754
|
+
Use these attributes: :description (required), :rationale (optional), :date (optional), :alias (optional).
|
|
1755
|
+
No other attributes are valid.
|
|
1756
|
+
|
|
1757
|
+
IMPORTANT — entity resolution: if a reference matches an existing canonical ident or alias above,
|
|
1758
|
+
reuse that exact ident. Only mint a new ident if the entity is genuinely new.
|
|
1759
|
+
|
|
1760
|
+
IMPORTANT — bi-temporality: this database is bi-temporal. Facts have both a transaction time
|
|
1761
|
+
(when they were recorded) and a valid time (when they were true in the world). When the conversation
|
|
1762
|
+
mentions that something was decided or true at a specific past date, note that date alongside the
|
|
1763
|
+
fact so the caller can set :valid-at accordingly. Wrap such facts in a comment line:
|
|
1764
|
+
; valid-at: 2024-03-15
|
|
1765
|
+
[[:entity/ident :attribute "value"]]
|
|
1766
|
+
|
|
1767
|
+
For point-in-time historical queries, always use :as-of N and :valid-at "date" TOGETHER —
|
|
1768
|
+
using only one gives a partial view.
|
|
1769
|
+
|
|
1770
|
+
Conversation:
|
|
1771
|
+
{conversation}"""
|
|
1772
|
+
|
|
1773
|
+
|
|
1774
|
+
def _get_anthropic_client():
|
|
1775
|
+
"""Return an Anthropic client. Raises if anthropic package or API key is missing."""
|
|
1776
|
+
try:
|
|
1777
|
+
import anthropic
|
|
1778
|
+
except ImportError:
|
|
1779
|
+
raise RuntimeError("anthropic package not installed — pip install anthropic")
|
|
1780
|
+
api_key = os.environ.get("ANTHROPIC_API_KEY")
|
|
1781
|
+
if not api_key:
|
|
1782
|
+
raise RuntimeError("ANTHROPIC_API_KEY not set")
|
|
1783
|
+
return anthropic.Anthropic(api_key=api_key)
|
|
1784
|
+
|
|
1785
|
+
|
|
1786
|
+
_OPENAI_MODEL_PREFIXES = ("gpt-", "o1", "o3", "o4")
|
|
1787
|
+
|
|
1788
|
+
|
|
1789
|
+
def _is_openai_model(model: str) -> bool:
|
|
1790
|
+
return any(model.startswith(p) for p in _OPENAI_MODEL_PREFIXES)
|
|
1791
|
+
|
|
1792
|
+
|
|
1793
|
+
def _get_openai_client():
|
|
1794
|
+
"""Return an OpenAI client. Raises if openai package or API key is missing."""
|
|
1795
|
+
try:
|
|
1796
|
+
import openai
|
|
1797
|
+
except ImportError:
|
|
1798
|
+
raise RuntimeError("openai package not installed — pip install openai")
|
|
1799
|
+
api_key = os.environ.get("OPENAI_API_KEY")
|
|
1800
|
+
if not api_key:
|
|
1801
|
+
raise RuntimeError("OPENAI_API_KEY not set")
|
|
1802
|
+
return openai.OpenAI(api_key=api_key)
|
|
1803
|
+
|
|
1804
|
+
|
|
1805
|
+
def _strip_code_fences(text: str) -> str:
|
|
1806
|
+
"""Remove markdown code fences that LLMs sometimes wrap around Datalog output.
|
|
1807
|
+
|
|
1808
|
+
Handles both ``` and ```datalog (or any language tag). Returns the inner
|
|
1809
|
+
content, stripped. If no fences are present, returns the input unchanged.
|
|
1810
|
+
"""
|
|
1811
|
+
text = text.strip()
|
|
1812
|
+
if text.startswith("```"):
|
|
1813
|
+
# Drop the opening fence line (``` or ```datalog etc.)
|
|
1814
|
+
first_newline = text.find("\n")
|
|
1815
|
+
if first_newline != -1:
|
|
1816
|
+
text = text[first_newline + 1:]
|
|
1817
|
+
# Drop the closing fence if present
|
|
1818
|
+
if text.rstrip().endswith("```"):
|
|
1819
|
+
text = text.rstrip()[:-3]
|
|
1820
|
+
return text.strip()
|
|
1821
|
+
|
|
1822
|
+
|
|
1823
|
+
def _llm_missing_package_warning(error: str) -> str:
|
|
1824
|
+
"""Return a user-facing install instruction when the LLM package is absent.
|
|
1825
|
+
|
|
1826
|
+
Inspects the error string from _llm_extract_and_transact and maps it to
|
|
1827
|
+
the correct pip install command based on the configured model.
|
|
1828
|
+
Returns an empty string when the error is not a missing-package error.
|
|
1829
|
+
"""
|
|
1830
|
+
model = os.environ.get("MINIGRAF_LLM_MODEL", "claude-haiku-4-5-20251001")
|
|
1831
|
+
if "anthropic package not installed" in error:
|
|
1832
|
+
return (
|
|
1833
|
+
"ACTION REQUIRED: pip install anthropic\n"
|
|
1834
|
+
f" The configured model '{model}' requires the anthropic package.\n"
|
|
1835
|
+
" Set MINIGRAF_LLM_MODEL in .mcp.json if you want to use an OpenAI model instead."
|
|
1836
|
+
)
|
|
1837
|
+
if "openai package not installed" in error:
|
|
1838
|
+
return (
|
|
1839
|
+
"ACTION REQUIRED: pip install openai\n"
|
|
1840
|
+
f" The configured model '{model}' requires the openai package.\n"
|
|
1841
|
+
" Set MINIGRAF_LLM_MODEL in .mcp.json if you want to use an Anthropic model instead."
|
|
1842
|
+
)
|
|
1843
|
+
return ""
|
|
1844
|
+
|
|
1845
|
+
|
|
1846
|
+
def _call_llm(model: str, prompt: str) -> str:
    """Send *prompt* as a single user message and return the reply text.

    Model names accepted by ``_is_openai_model`` go to the OpenAI chat
    completions API; every other name goes to the Anthropic messages API.
    The reply is capped at 1024 output tokens.

    Returns:
        The reply text, or ``""`` when the provider returned no text
        (OpenAI ``content`` of None, or an Anthropic reply without a text
        block) — so callers always receive a ``str``.

    Raises:
        RuntimeError: from the client factories when the package or API key
            is missing. Provider SDK errors propagate unchanged.
    """
    messages = [{"role": "user", "content": prompt}]

    if _is_openai_model(model):
        client = _get_openai_client()
        # o-series reasoning models reject `max_tokens` and require
        # `max_completion_tokens`; other chat models keep the original kwarg.
        is_o_series = model.startswith(("o1", "o3", "o4"))
        limit_kwarg = "max_completion_tokens" if is_o_series else "max_tokens"
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            **{limit_kwarg: 1024},
        )
        # `content` is None for e.g. refusals or tool-only replies.
        return response.choices[0].message.content or ""

    client = _get_anthropic_client()
    message = client.messages.create(
        model=model,
        max_tokens=1024,
        messages=messages,
    )
    # Return the first block that carries text; the content list may be
    # empty or start with a non-text block.
    for block in message.content:
        text = getattr(block, "text", None)
        if text is not None:
            return text
    return ""
|
|
1864
|
+
|
|
1865
|
+
|
|
1866
|
+
def _parse_valid_at_hint(raw: str):
|
|
1867
|
+
"""Extract optional '; valid-at: YYYY-MM-DD' comment from model output.
|
|
1868
|
+
|
|
1869
|
+
Returns (valid_at, cleaned_datalog) where valid_at defaults to the current
|
|
1870
|
+
UTC ms timestamp if no hint is present.
|
|
1871
|
+
"""
|
|
1872
|
+
valid_at = _now_utc_ms()
|
|
1873
|
+
kept = []
|
|
1874
|
+
for line in raw.splitlines():
|
|
1875
|
+
stripped = line.strip()
|
|
1876
|
+
if stripped.startswith("; valid-at:"):
|
|
1877
|
+
date_str = stripped[len("; valid-at:"):].strip()
|
|
1878
|
+
if re.match(r"^\d{4}-\d{2}-\d{2}$", date_str):
|
|
1879
|
+
valid_at = date_str
|
|
1880
|
+
else:
|
|
1881
|
+
kept.append(line)
|
|
1882
|
+
return valid_at, "\n".join(kept).strip()
|
|
1883
|
+
|
|
1884
|
+
|
|
1885
|
+
def _llm_extract_and_transact(conversation_delta: str) -> Dict[str, Any]:
    """Extract facts with the configured LLM and store them.

    The model named by ``MINIGRAF_LLM_MODEL`` (default
    ``claude-haiku-4-5-20251001``) is prompted with the conversation delta and
    the currently known canonical entities. Its reply is stripped of code
    fences, split into an optional valid-at hint plus Datalog, parsed, and
    handed to ``_transact_extracted_facts``.

    Returns:
        ``{"ok": True, "stored_count": n, "strategy": "llm"}`` on success
        (n is 0 when the model returned nothing to store), or
        ``{"ok": False, "error": str, "strategy": "llm"}`` if any step raised.
    """

    def _success(count: int) -> Dict[str, Any]:
        return {"ok": True, "stored_count": count, "strategy": "llm"}

    try:
        model = os.environ.get("MINIGRAF_LLM_MODEL", "claude-haiku-4-5-20251001")

        known = _query_canonical_entities()
        section = ""
        if known:
            section = (
                "\nExisting canonical entities (reuse these idents — do not invent synonyms):\n"
                + known
            )

        prompt = _LLM_EXTRACTION_PROMPT.format(
            conversation=conversation_delta,
            canonical_entities_section=section,
        )
        reply = _strip_code_fences(_call_llm(model, prompt))
        if not reply or reply == "[]":
            return _success(0)

        valid_at, datalog = _parse_valid_at_hint(reply)
        if not datalog or datalog == "[]":
            return _success(0)

        # Going through _transact_extracted_facts gives every fact schema
        # validation and an :entity-type tag, like the heuristic strategy.
        parsed = _parse_transact_facts(datalog)
        return _success(_transact_extracted_facts(parsed, valid_from=valid_at))
    except Exception as e:
        return {"ok": False, "error": str(e), "strategy": "llm"}
|
|
1914
|
+
|
|
1915
|
+
|
|
1916
|
+
# ---------------------------------------------------------------------------
|
|
1917
|
+
# Fact extraction — agent (MCP sampling) strategy
|
|
1918
|
+
# ---------------------------------------------------------------------------
|
|
1919
|
+
|
|
1920
|
+
_AGENT_SAMPLING_PROMPT = """Review this conversation turn and output ONLY a Datalog transact expression for any decisions, preferences, constraints, or dependencies worth storing in long-term memory.
|
|
1921
|
+
|
|
1922
|
+
Allowed entity type prefixes: :decision/ :preference/ :constraint/ :dependency/
|
|
1923
|
+
Canonical ident form: lowercase, hyphens only — :decision/redis not :decision/Redis_cache.
|
|
1924
|
+
{canonical_entities_section}
|
|
1925
|
+
Use these attributes: :description (required), :rationale (optional), :date (optional), :alias (optional).
|
|
1926
|
+
No other attributes are valid. If an entity matches an existing ident or alias, reuse it exactly.
|
|
1927
|
+
|
|
1928
|
+
Format:
|
|
1929
|
+
[[:entity/ident :attribute "value"]]
|
|
1930
|
+
|
|
1931
|
+
Return [] if nothing is worth storing.
|
|
1932
|
+
|
|
1933
|
+
{conversation}"""
|
|
1934
|
+
|
|
1935
|
+
|
|
1936
|
+
async def _request_agent_memory_block_async(conversation_delta: str, canonical_entities_section: str = "") -> str:
    """Ask the connected agent, via MCP sampling, for a memory block.

    Formats ``_AGENT_SAMPLING_PROMPT`` with the two arguments, sends it as a
    single user message (max 512 tokens) through the current request's
    session, and returns the reply's text — or ``str(content)`` when the
    reply content has no ``text`` attribute.

    Raises:
        RuntimeError: if the module-level server reference has not been set.
    """
    if _server_ref is None:
        raise RuntimeError("Server reference not set")
    from mcp.types import SamplingMessage, TextContent as TC

    prompt = _AGENT_SAMPLING_PROMPT.format(
        conversation=conversation_delta,
        canonical_entities_section=canonical_entities_section,
    )
    request = SamplingMessage(role="user", content=TC(type="text", text=prompt))
    session = _server_ref.request_context.session
    reply = await session.create_message(messages=[request], max_tokens=512)

    content = reply.content
    if hasattr(content, "text"):
        return content.text
    return str(content)
|
|
1950
|
+
|
|
1951
|
+
|
|
1952
|
+
async def _agent_extract_and_transact(conversation_delta: str) -> Dict[str, Any]:
    """Request a memory block from the agent via MCP sampling and store it.

    The reply is stripped of code fences, split into an optional valid-at
    hint plus Datalog, parsed with ``_parse_transact_facts`` and written with
    ``_transact_extracted_facts`` — the same path the llm strategy uses. Model
    output is therefore never interpolated verbatim into a transact command,
    every fact is schema-validated and tagged, and ``stored_count`` is the
    real number of stored facts rather than an estimate.

    Returns:
        ``{"ok": True, "stored_count": n, "strategy": "agent"}`` on success
        (n is 0 when the agent returned nothing to store), or
        ``{"ok": False, "error": str, "strategy": "agent"}`` if any step raised.
    """
    try:
        canonical = _query_canonical_entities()
        if canonical:
            canonical_entities_section = (
                "\nExisting canonical entities (reuse these idents — do not invent synonyms):\n"
                + canonical
            )
        else:
            canonical_entities_section = ""

        raw_facts = _strip_code_fences(
            await _request_agent_memory_block_async(conversation_delta, canonical_entities_section)
        )
        if not raw_facts or raw_facts == "[]":
            return {"ok": True, "stored_count": 0, "strategy": "agent"}

        valid_at, datalog = _parse_valid_at_hint(raw_facts)
        if not datalog or datalog == "[]":
            return {"ok": True, "stored_count": 0, "strategy": "agent"}

        # _transact_extracted_facts refreshes the DB handle, validates each
        # fact, checkpoints and updates the mtime itself.
        parsed = _parse_transact_facts(datalog)
        stored_count = _transact_extracted_facts(parsed, valid_from=valid_at)
        return {"ok": True, "stored_count": stored_count, "strategy": "agent"}
    except Exception as e:
        return {"ok": False, "error": str(e), "strategy": "agent"}
|
|
1979
|
+
|
|
1980
|
+
|
|
1981
|
+
# ---------------------------------------------------------------------------
|
|
1982
|
+
# memory_finalize_turn — dispatcher
|
|
1983
|
+
# ---------------------------------------------------------------------------
|
|
1984
|
+
|
|
1985
|
+
async def handle_memory_finalize_turn(conversation_delta: str) -> Dict[str, Any]:
    """Extract facts from *conversation_delta* and store them.

    The strategy comes from ``MINIGRAF_EXTRACTION_STRATEGY`` (default
    ``heuristic``):

    * ``heuristic`` — regex extraction, then transact.
    * ``llm`` — LLM extraction; if it reports failure, the heuristic runs
      instead and the result carries a ``warning`` describing the LLM error.
    * ``agent`` — extraction via MCP sampling.

    Any other value yields ``{"ok": False, "error": "Unknown strategy: ..."}``.
    """
    strategy = os.environ.get("MINIGRAF_EXTRACTION_STRATEGY", "heuristic")

    def _run_heuristic() -> int:
        return _transact_extracted_facts(heuristic_extract(conversation_delta))

    if strategy == "heuristic":
        return {"ok": True, "stored_count": _run_heuristic(), "strategy": "heuristic"}

    if strategy == "agent":
        return await _agent_extract_and_transact(conversation_delta)

    if strategy != "llm":
        return {"ok": False, "error": f"Unknown strategy: {strategy}"}

    outcome = _llm_extract_and_transact(conversation_delta)
    if outcome["ok"]:
        return outcome

    # LLM failed — fall back to the heuristic and surface a warning so the
    # user can see what went wrong (e.g. missing package, bad API key).
    llm_error = outcome.get("error", "")
    warning = _llm_missing_package_warning(llm_error)
    response: Dict[str, Any] = {
        "ok": True,
        "stored_count": _run_heuristic(),
        "strategy": "heuristic (llm fallback)",
    }
    if warning:
        response["warning"] = warning
    elif llm_error:
        response["warning"] = f"LLM extraction failed ({llm_error}); fell back to heuristic."
    return response
|
|
2022
|
+
|
|
2023
|
+
|
|
2024
|
+
def _build_code_triples(
    file_path: str,
    extracted: Dict[str, List[str]],
    commit_ts_iso: str,
    entity_valid_from: Dict[str, str],
    entity_descriptions: Dict[str, str],
    file_entities: Dict[str, List[str]],
    commit_ident: str,
) -> List[str]:
    """Build the Datalog triple strings for one file touched by one commit.

    An entity (the file's module, each name in ``extracted["functions"]`` and
    ``extracted["classes"]``) that is not yet a key of ``entity_valid_from``
    gets its stable attributes written once: :entity-type, :ident,
    :description, :path (module) or :file (function/class), :contains from
    the module, and :introduced-by. An entity that is already known only gets
    a :modified-in edge to ``commit_ident``. Writing stable attributes once
    avoids N re-assertions joining into N² result rows.

    Side effects: newly seen idents are recorded in ``entity_valid_from``
    (with ``commit_ts_iso``), ``entity_descriptions`` and
    ``file_entities[file_path]``.

    :depends-on edges are not produced here; they are written in the commit
    loop by _run_ingestion.
    """
    triples: List[str] = []
    module_ident = _code_ident("module", file_path)
    module_known = module_ident in entity_valid_from
    # All idents belonging to this file (used for deletion cleanup).
    tracked = file_entities.setdefault(file_path, [])

    def _remember(ident: str, label: str) -> None:
        """Record a first-seen ident in the three bookkeeping structures."""
        if ident not in tracked:
            tracked.append(ident)
        entity_valid_from[ident] = commit_ts_iso
        entity_descriptions[ident] = label

    if module_known:
        # Existing module: only note that this commit touched it.
        triples.append(f"[{module_ident} :modified-in {commit_ident}]")
    else:
        triples.extend([
            f"[{module_ident} :entity-type :type/module]",
            f'[{module_ident} :ident "{module_ident}"]',
            f'[{module_ident} :description "{_edn_escape(file_path)}"]',
            f'[{module_ident} :path "{_edn_escape(file_path)}"]',
            f"[{module_ident} :introduced-by {commit_ident}]",
        ])
        _remember(module_ident, file_path)

    # Functions first, then classes — same triple order as one loop per kind.
    for kind, bucket in (("function", "functions"), ("class", "classes")):
        for name in extracted[bucket]:
            ident = _code_ident(kind, file_path, name)
            if ident in entity_valid_from:
                triples.append(f"[{ident} :modified-in {commit_ident}]")
                continue
            triples.extend([
                f"[{ident} :entity-type :type/{kind}]",
                f'[{ident} :ident "{ident}"]',
                f'[{ident} :description "{_edn_escape(name)}"]',
                f'[{ident} :file "{_edn_escape(file_path)}"]',
                f"[{module_ident} :contains {ident}]",
                f"[{ident} :introduced-by {commit_ident}]",
            ])
            _remember(ident, name)

    return triples
|
|
2110
|
+
|
|
2111
|
+
|
|
2112
|
+
def _preload_known_entities(db: Any, repo_path: str) -> tuple:
    """Build the in-memory entity maps used by the commit walk.

    Two sources are combined:

    * ``git ls-files --full-name`` run in ``repo_path``: every listed file
      whose suffix is a key of ``_EXT_TO_LANG`` gets an (initially empty)
      entry in the file map, so that _resolve_module_import can find any
      module file even while early commits — made before that file was
      introduced — are being processed.
    * The graph itself: one query per entity type (module, function, class)
      returning ident, path, description and the ``:date`` of the commit the
      entity was ``:introduced-by``.

    Both steps are best-effort; a failure in either leaves whatever was
    collected so far in place.

    Returns (entity_valid_from, entity_descriptions, file_entities).
    entity_valid_from maps ident → git commit timestamp of first introduction.
    entity_descriptions maps ident → human-readable name (function/class/file).
    file_entities maps file path → list of idents recorded for that file.
    """
    valid_from_by_ident: Dict[str, str] = {}
    name_by_ident: Dict[str, str] = {}
    idents_by_file: Dict[str, List[str]] = {}

    # Step 1: seed the file map from the files git currently tracks.
    try:
        listing = _subprocess.run(
            ["git", "ls-files", "--full-name"],
            cwd=repo_path,
            capture_output=True,
            text=True,
            timeout=30,
        )
        for tracked_path in listing.stdout.strip().splitlines():
            if Path(tracked_path).suffix.lower() not in _EXT_TO_LANG:
                continue
            idents_by_file.setdefault(tracked_path, [])
    except Exception:
        pass

    # Step 2: load entities already stored in the graph. Modules keep their
    # location under :path, functions and classes under :file.
    path_attr_by_type = {"module": "path", "function": "file", "class": "file"}
    for entity_type, path_attr in path_attr_by_type.items():
        query = (
            "(query [:find ?ident ?path ?desc ?date "
            f":where [?e :entity-type :type/{entity_type}] "
            "[?e :ident ?ident] "
            f"[?e :{path_attr} ?path] "
            "[?e :description ?desc] "
            "[?e :introduced-by ?c] "
            "[?c :date ?date]])"
        )
        try:
            payload = json.loads(db.execute(query))
            for ident, path, desc, date in payload.get("results", []):
                valid_from_by_ident[ident] = date
                name_by_ident[ident] = desc
                known_for_path = idents_by_file.setdefault(path, [])
                if ident not in known_for_path:
                    known_for_path.append(ident)
        except Exception:
            pass

    return valid_from_by_ident, name_by_ident, idents_by_file
|
|
2163
|
+
|
|
2164
|
+
|
|
2165
|
+
def _ingest_tags(db: Any, repo_path: str, run_ts_iso: str) -> None:
    """Write one ``:tag/<slug>`` entity per git tag, linked to its commit.

    The slug is the lower-cased tag name with every run of characters outside
    ``a-z0-9`` collapsed to ``-`` and leading/trailing ``-`` stripped. Each
    tag is stored in its own transaction whose ``:valid-from`` is
    ``run_ts_iso``.

    Intended to run once after the commit walk. Every tag is written again on
    each run, so tags created after their commit was ingested are still
    picked up.

    Failures are non-fatal: if the tag list cannot be read nothing is
    written, and a tag whose transaction fails is skipped.
    """
    try:
        tag_rows = _git_tags(repo_path)
    except Exception:
        return  # no tag list available; nothing to do

    for tag_name, commit_hash, date_raw in tag_rows:
        try:
            slug = re.sub(r"[^a-z0-9]+", "-", tag_name.lower()).strip("-")
            tag = ":tag/" + slug
            safe_name = _edn_escape(tag_name)
            facts = [
                f"[{tag} :entity-type :type/tag]",
                f'[{tag} :name "{safe_name}"]',
                f'[{tag} :ident "{tag}"]',
                f'[{tag} :description "git tag {safe_name}"]',
                # Commit idents use the first 12 characters of the hash.
                f"[{tag} :tagged-commit :commit/{commit_hash[:12]}]",
            ]
            if date_raw:
                facts.append(f'[{tag} :date "{_edn_escape(date_raw)}"]')
            joined = " ".join(facts)
            db.execute(f'(transact [{joined}] {{:valid-from "{run_ts_iso}"}})')
        except Exception:
            pass  # skip this tag, keep going with the rest
|
|
2194
|
+
|
|
2195
|
+
|
|
2196
|
+
async def _run_ingestion(repo_path: str, branch: str) -> None:
    """Background coroutine: walk git history and ingest code structure.

    The commits selected by ``_git_commits`` (from the stored watermark and
    ``branch``) are processed one at a time. For each commit the changed
    files are parsed, new facts are transacted, facts belonging to deleted
    files, removed entities and dropped imports are closed, ``:parent`` edges
    are written best-effort, the watermark is advanced and the DB is
    checkpointed. After the walk, tags and the last-run record are written.

    The module-level ``_db`` handle is reset to ``None`` before every point
    where the coroutine yields or enumerates commits, so the DB file lock is
    not held across those points.

    Progress is published through the module-level ``_ingest_progress`` dict.
    Exceptions do not propagate: they end the run with ``status == "error"``
    and the message stored under ``"error"``.

    Args:
        repo_path: Working directory for every git invocation.
        branch: Ref whose history is walked; also used for the total count.
    """
    global _db, _ingest_progress
    try:
        # Read watermark and pre-load known entities before releasing DB
        db = get_db()
        watermark = _watermark_query(db)
        prior_ingested = _total_ingested_query(db)
        entity_valid_from, entity_descriptions, file_entities = _preload_known_entities(db, repo_path)
        file_deps: Dict[str, set] = {}  # file_path -> set of dep module idents
        dep_valid_from: Dict[tuple, str] = {}  # (src_ident, dep_ident) -> intro commit ts
        _db = None  # release file lock while enumerating commits

        commits = _git_commits(repo_path, watermark, branch)

        # Count commits on the ref that is actually walked, so "processed"
        # and "total" describe the same history. Fall back to the number of
        # pending commits when git fails or prints something non-numeric.
        count_proc = _subprocess.run(
            ["git", "rev-list", "--count", branch or "HEAD"],
            cwd=repo_path, capture_output=True, text=True,
        )
        try:
            repo_total = (
                int(count_proc.stdout.strip())
                if count_proc.returncode == 0
                else len(commits)
            )
        except ValueError:
            repo_total = len(commits)
        _ingest_progress["total"] = repo_total
        _ingest_progress["status"] = "running"
        _ingest_progress["processed"] = prior_ingested

        last_hash = watermark or ""

        for commit_hash, commit_ts_iso, author, subject in commits:
            last_hash = commit_hash
            _ingest_progress["current_commit"] = commit_hash
            reason = f"git:{commit_hash} {author}: {subject}"

            # Build commit entity ident from first 12 chars of hash
            commit_ident = f":commit/{commit_hash[:12]}"

            # Acquire DB fresh each commit — never hold across yield
            db = get_db()
            try:
                changed = _git_changed_files(repo_path, commit_hash)
                add_triples: List[str] = [
                    f"[{commit_ident} :entity-type :type/commit]",
                    f'[{commit_ident} :ident "{commit_ident}"]',
                    f'[{commit_ident} :description "{_edn_escape(subject[:120])}"]',
                    f'[{commit_ident} :hash "{commit_hash}"]',
                    f'[{commit_ident} :author "{_edn_escape(author)}"]',
                    f'[{commit_ident} :subject "{_edn_escape(subject[:200])}"]',
                    f'[{commit_ident} :date "{commit_ts_iso}"]',
                ]
                close_items: List[tuple] = []  # (triples, original_ts_iso)
                dep_add_triples: List[str] = []  # :depends-on triples to transact individually

                for status, file_path in changed:
                    parser = _get_parser(file_path)
                    if parser is None:
                        continue

                    if status == "D":
                        # Close module and all known child entities for this file
                        module_ident = _code_ident("module", file_path)
                        idents = file_entities.get(file_path, [module_ident])
                        for ident in idents:
                            orig_ts = entity_valid_from.get(ident, commit_ts_iso)
                            desc = entity_descriptions.get(ident, "")
                            close_items.append(
                                (_build_close_triples(ident, desc, module_ident), orig_ts)
                            )
                            # Forget the closed entity. _build_code_triples
                            # treats any ident still present in
                            # entity_valid_from as pre-existing, so without
                            # this a file that is deleted and later re-added
                            # would never have its entities written again.
                            entity_valid_from.pop(ident, None)
                            entity_descriptions.pop(ident, None)
                        # Keep the path key (import resolution reads it) but
                        # drop the idents that were just closed.
                        tracked = file_entities.get(file_path)
                        if tracked is not None:
                            tracked.clear()
                        # Close all :depends-on edges for the deleted module
                        for dep_ident in file_deps.get(file_path, set()):
                            orig_ts = dep_valid_from.get((module_ident, dep_ident), commit_ts_iso)
                            close_items.append(
                                ([f"[{module_ident} :depends-on {dep_ident}]"], orig_ts)
                            )
                        file_deps.pop(file_path, None)
                    else:  # A or M
                        previous_idents = set(file_entities.get(file_path, []))
                        try:
                            content = _git_file_content(repo_path, commit_hash, file_path)
                        except Exception:
                            continue
                        extracted = _extract_from_source(content, parser, file_path)
                        triples = _build_code_triples(
                            file_path, extracted, commit_ts_iso, entity_valid_from,
                            entity_descriptions, file_entities, commit_ident,
                        )
                        add_triples.extend(triples)
                        module_ident = _code_ident("module", file_path)
                        # Detect entities removed from a modified file.
                        # _build_code_triples only appends to file_entities, never removes.
                        # Compare previous idents against the idents derivable from the
                        # current extraction to find what was deleted.
                        if status == "M":
                            current_extracted_idents: set = {module_ident}
                            for fn_name in extracted.get("functions", []):
                                current_extracted_idents.add(_code_ident("function", file_path, fn_name))
                            for cls_name in extracted.get("classes", []):
                                current_extracted_idents.add(_code_ident("class", file_path, cls_name))
                            removed_idents = previous_idents - current_extracted_idents
                            for ident in removed_idents:
                                orig_ts = entity_valid_from.get(ident, commit_ts_iso)
                                desc = entity_descriptions.get(ident, "")
                                close_items.append(
                                    (_build_close_triples(ident, desc, module_ident), orig_ts)
                                )
                                entity_valid_from.pop(ident, None)
                                entity_descriptions.pop(ident, None)
                            if removed_idents:
                                # Stop tracking removed idents so later
                                # modifications of this file do not close
                                # them again with a later end timestamp.
                                tracked = file_entities.get(file_path)
                                if tracked is not None:
                                    tracked[:] = [i for i in tracked if i not in removed_idents]
                        # Compute dep edges for this file and diff against previous
                        current_deps: set = set()
                        for import_name in set(extracted.get("imports", [])):
                            dep_ident = _resolve_module_import(import_name, file_entities)
                            if dep_ident != module_ident:
                                current_deps.add(dep_ident)
                        previous_deps = file_deps.get(file_path, set())
                        for dep_ident in current_deps - previous_deps:
                            dep_add_triples.append(f"[{module_ident} :depends-on {dep_ident}]")
                            dep_valid_from[(module_ident, dep_ident)] = commit_ts_iso
                        if status == "M":
                            for dep_ident in previous_deps - current_deps:
                                orig_ts = dep_valid_from.get((module_ident, dep_ident), commit_ts_iso)
                                close_items.append(
                                    ([f"[{module_ident} :depends-on {dep_ident}]"], orig_ts)
                                )
                        file_deps[file_path] = current_deps

                # Split :contains triples out before batching. Minigraf's EAVT
                # pending index lacks value bytes in the key, so batching multiple
                # [module :contains fn] facts in one transact silently drops all
                # but the last. Each :contains triple gets its own transact so
                # they receive distinct tx_counts and avoid the index collision.
                contains_triples = [t for t in add_triples if ":contains" in t]
                other_triples = [t for t in add_triples if ":contains" not in t]
                _ingest_transact(db, other_triples, commit_ts_iso, reason)
                for ct in contains_triples:
                    _ingest_transact(db, [ct], commit_ts_iso, reason)
                # :depends-on triples transacted individually — same EAVT collision risk
                # as :contains when multiple deps share the same source module
                for dt in dep_add_triples:
                    _ingest_transact(db, [dt], commit_ts_iso, reason)
                for close_triples, orig_ts in close_items:
                    _ingest_close(db, close_triples, orig_ts, commit_ts_iso, reason)

                # Ingest :parent edges — one transact per parent to avoid EAVT
                # collision for merge commits (which have two parent hashes).
                try:
                    for parent_hash in _git_parent_hashes(repo_path, commit_hash):
                        parent_ident = f":commit/{parent_hash[:12]}"
                        db.execute(
                            f'(transact [[{commit_ident} :parent {parent_ident}]] '
                            f'{{:valid-from "{commit_ts_iso}"}})'
                        )
                except Exception:
                    pass  # non-fatal; parent edges are best-effort

                _watermark_update(db, commit_hash, commit_ts_iso, reason)
                db.checkpoint()

            finally:
                _db = None  # release file lock between commits

            _ingest_progress["processed"] += 1
            await asyncio.sleep(0)  # yield to event loop

        # UTC timestamp with millisecond precision and a trailing "Z".
        now = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
        db = get_db()
        try:
            _ingest_tags(db, repo_path, now)
            _last_run_write(db, last_hash, now, _ingest_progress["processed"])
            db.checkpoint()
        finally:
            _db = None

        _ingest_progress["status"] = "complete"
        _index_cache.invalidate()

    except Exception as e:
        _ingest_progress["status"] = "error"
        _ingest_progress["error"] = str(e)
        _db = None
|
|
2370
|
+
|
|
2371
|
+
|
|
2372
|
+
async def handle_minigraf_ingest_git(
    repo_path: Optional[str] = None,
    branch: str = "HEAD",
) -> Dict[str, Any]:
    """Schedule a background git ingestion and return without waiting for it.

    Args:
        repo_path: Directory to ingest; the current working directory is used
            when this is ``None`` or empty.
        branch: Ref handed to the ingestion coroutine.

    Returns:
        ``{"ok": True, "job_id": "git-ingest", "message": ...}`` once the task
        has been created, or ``{"ok": False, "error": ...}`` when a previous
        ingestion task has not finished or ``git rev-parse --git-dir`` does
        not succeed in the target directory.
    """
    global _ingest_task, _ingest_progress

    active_task = _ingest_task
    if active_task and not active_task.done():
        return {"ok": False, "error": "ingestion already in progress"}

    repo = repo_path or str(Path.cwd())

    # Probe the directory with git itself; an OSError covers both a missing
    # git executable and an unusable working directory.
    try:
        probe = _subprocess.run(
            ["git", "rev-parse", "--git-dir"],
            cwd=repo,
            capture_output=True,
            text=True,
        )
    except OSError:
        is_repo = False
    else:
        is_repo = probe.returncode == 0

    if not is_repo:
        return {
            "ok": False,
            "error": f"Not a git repository (or git not found): {repo}",
        }

    # Start from a clean progress record before the new task begins.
    _ingest_progress = {
        "status": "idle",
        "processed": 0,
        "total": 0,
        "current_commit": "",
        "error": None,
    }
    _ingest_task = asyncio.create_task(_run_ingestion(repo, branch))
    return {"ok": True, "job_id": "git-ingest", "message": f"Ingestion started for {repo}"}
|
|
2400
|
+
|
|
2401
|
+
|
|
2402
|
+
def handle_minigraf_ingest_status() -> Dict[str, Any]:
    """Report ingestion progress, plus last-run details read from the graph.

    The result always contains ``"ok": True`` and a copy of every key in the
    module-level ``_ingest_progress``. While the status is ``"running"``
    nothing else is added and the DB is not touched. Otherwise three keys are
    added: ``last_run_at`` and ``last_commit`` (taken from the first row of
    the ``:ingestion/last-run-at`` query) and ``total_ingested`` (the value
    of ``_total_ingested_query`` when it is positive). Each is ``None`` when
    unavailable, and all three are ``None`` if any DB step raises.
    """
    report: Dict[str, Any] = {"ok": True, **_ingest_progress}
    if _ingest_progress["status"] == "running":
        return report

    last_run_at = None
    last_commit = None
    total_ingested = None
    try:
        db = get_db()
        raw = db.execute(
            "(query [:find ?t ?h :any-valid-time "
            ":where [:ingestion/last-run-at :last-run-at ?t] "
            "[:ingestion/last-run-at :last-commit ?h]])"
        )
        rows = json.loads(raw).get("results", [])
        if rows:
            # NOTE(review): only the first returned row is used; confirm the
            # query cannot return several rows in an unspecified order.
            first = rows[0]
            last_run_at, last_commit = first[0], first[1]
        count = _total_ingested_query(db)
        total_ingested = count if count > 0 else None
    except Exception:
        # Any failure invalidates all three graph-backed fields together.
        last_run_at = None
        last_commit = None
        total_ingested = None

    report["last_run_at"] = last_run_at
    report["last_commit"] = last_commit
    report["total_ingested"] = total_ingested
    return report
|
|
2427
|
+
|
|
2428
|
+
|
|
2429
|
+
# ---------------------------------------------------------------------------
|
|
2430
|
+
# MCP server
|
|
2431
|
+
# ---------------------------------------------------------------------------
|
|
2432
|
+
|
|
2433
|
+
from mcp.types import Tool, TextContent # noqa: E402
|
|
2434
|
+
|
|
2435
|
+
server = Server("temporal-reasoning")


def _str_prop(description: str) -> Dict[str, Any]:
    """Build the JSON-schema fragment for a string-typed tool argument."""
    return {"type": "string", "description": description}


def _tool_entry(
    name: str,
    description: str,
    properties: Dict[str, Any],
    required: List[str],
) -> Tool:
    """Build a Tool whose input schema is a JSON object with these properties."""
    return Tool(
        name=name,
        description=description,
        inputSchema={
            "type": "object",
            "properties": properties,
            "required": required,
        },
    )


# Static catalogue of every tool this server advertises to MCP clients.
_TOOLS: List[Tool] = [
    _tool_entry(
        "minigraf_query",
        (
            "Query Minigraf's persistent bi-temporal graph memory using Datalog. "
            "Call this BEFORE answering anything about past decisions, architecture, "
            "dependencies, or preferences. Supports :as-of for temporal queries to see "
            "what the graph contained at a past transaction time."
        ),
        {
            "datalog": _str_prop(
                "A valid Datalog query, e.g. [:find ?name :where [?e :component/name ?name]]"
            ),
        },
        ["datalog"],
    ),
    _tool_entry(
        "minigraf_transact",
        (
            "Store a durable fact in Minigraf's graph memory. Only call this for decisions, "
            "architecture, dependencies, constraints, or preferences — NOT for transient "
            "observations or intermediate reasoning."
        ),
        {
            "facts": _str_prop(
                'A Datalog transact block, e.g. [[:decision/cache-strategy '
                ':decision/description "use Redis"]]'
            ),
            "reason": _str_prop(
                "Why this fact deserves long-term storage. "
                "Forces you to justify writes — only store facts worth remembering."
            ),
        },
        ["facts", "reason"],
    ),
    _tool_entry(
        "minigraf_retract",
        (
            "Retract a fact from Minigraf's graph memory. Retraction records a new fact with "
            "asserted=false — the original stays in history for bi-temporal auditing."
        ),
        {
            "facts": _str_prop(
                "A Datalog retract block, e.g. [[:component/auth :calls :component/jwt]]"
            ),
            "reason": _str_prop(
                "Why this fact is being retracted. Forces you to justify the removal."
            ),
        },
        ["facts", "reason"],
    ),
    _tool_entry(
        "minigraf_rule",
        (
            "Register a Datalog rule for use in subsequent queries. "
            "Rules enable recursive graph traversal (e.g. ancestor, reachable). "
            "A rule persists for the server session — re-register after a server restart. "
            "Syntax: [(rule-name ?arg ...) body-clause ...] — omit the outer (rule ...) wrapper."
        ),
        {
            "rule": _str_prop(
                "Rule vector, e.g. [(ancestor ?a ?d) [?a :parent ?d]] "
                "or [(ancestor ?a ?d) [?a :parent ?m] (ancestor ?m ?d)]"
            ),
        },
        ["rule"],
    ),
    _tool_entry(
        "minigraf_report_issue",
        (
            "Report an issue with Minigraf query or transact operations. "
            "Use this when Minigraf returns errors to file a GitHub issue for tracking."
        ),
        {
            "issue_type": {
                "type": "string",
                "description": "Type of issue to report",
                "enum": ["invalid_query", "transact_failure", "parse_error", "minigraf_bug"],
            },
            "description": _str_prop("Human-readable description of the issue"),
            "datalog": _str_prop("Optional Datalog query or transact that failed"),
            "error": _str_prop("Optional error message returned by Minigraf"),
        },
        ["issue_type", "description"],
    ),
    _tool_entry(
        "memory_prepare_turn",
        (
            "Retrieve relevant memory context for the current user message. "
            "Call this at the START of every turn, before reading the user's message. "
            "Returns a context block string to prepend to your working context."
        ),
        {
            "user_message": _str_prop("The user's message for this turn"),
        },
        ["user_message"],
    ),
    _tool_entry(
        "memory_finalize_turn",
        (
            "Extract and store memorable facts from the completed conversation turn. "
            "Call this at the END of every turn, after composing your response. "
            "Pass the full user+agent exchange for this turn."
        ),
        {
            "conversation_delta": _str_prop(
                "The user message and agent response for this turn"
            ),
        },
        ["conversation_delta"],
    ),
    _tool_entry(
        "minigraf_audit",
        (
            "Audit all graph entities against the built-in schema. "
            "Retracts entities with schema violations (missing required attributes, "
            "unknown types, unknown attributes). Run periodically or after heavy write sessions. "
            "Pass as_of (transaction number) for a read-only point-in-time audit without retractions."
        ),
        {
            "as_of": {
                "type": "integer",
                "description": "Optional transaction number for point-in-time audit (read-only, no retractions)",
            },
        },
        [],
    ),
    _tool_entry(
        "minigraf_ingest_git",
        (
            "Ingest code structure from git history into the bi-temporal graph. "
            "Starts a background task and returns immediately. "
            "Call minigraf_ingest_status to poll progress."
        ),
        {
            "repo_path": _str_prop("Absolute path to the git repo root. Defaults to cwd."),
            "branch": _str_prop("Branch or ref to walk. Defaults to HEAD."),
        },
        [],
    ),
    _tool_entry(
        "minigraf_ingest_status",
        (
            "Return the current git ingestion progress. "
            "status is one of: idle, running, complete, error."
        ),
        {},
        [],
    ),
]
|
|
2644
|
+
|
|
2645
|
+
|
|
2646
|
+
@server.list_tools()
async def list_tools() -> List[Tool]:
    """Return the module-level ``_TOOLS`` catalogue unchanged."""
    return _TOOLS
|
|
2649
|
+
|
|
2650
|
+
|
|
2651
|
+
@server.call_tool()
async def call_tool(name: str, arguments: Dict[str, Any]) -> List[TextContent]:
    """Route a tool invocation to its handler and wrap the result as text.

    Every handler result is serialized with ``json.dumps``, except for
    ``memory_prepare_turn`` whose return value is sent as the text unchanged.

    Raises:
        KeyError: a required argument is absent from ``arguments``.
        ValueError: ``name`` is not one of the known tools.
    """
    global _db

    def _json_reply(payload: Any) -> List[TextContent]:
        return [TextContent(type="text", text=json.dumps(payload))]

    try:
        # Tools that need special handling: one returns raw text, two are
        # coroutines that must be awaited.
        if name == "memory_prepare_turn":
            context_block = handle_memory_prepare_turn(arguments["user_message"])
            return [TextContent(type="text", text=context_block)]

        if name == "memory_finalize_turn":
            outcome = await handle_memory_finalize_turn(arguments["conversation_delta"])
            return _json_reply(outcome)

        if name == "minigraf_ingest_git":
            outcome = await handle_minigraf_ingest_git(
                repo_path=arguments.get("repo_path"),
                branch=arguments.get("branch", "HEAD"),
            )
            return _json_reply(outcome)

        # Remaining tools are plain synchronous calls returning JSON-able
        # data. Arguments are read lazily, only for the tool selected.
        sync_calls = {
            "minigraf_query": lambda: handle_minigraf_query(arguments["datalog"]),
            "minigraf_transact": lambda: handle_minigraf_transact(
                arguments["facts"], arguments["reason"]
            ),
            "minigraf_retract": lambda: handle_minigraf_retract(
                arguments["facts"], arguments["reason"]
            ),
            "minigraf_rule": lambda: handle_minigraf_rule(arguments["rule"]),
            "minigraf_report_issue": lambda: handle_minigraf_report_issue(
                arguments["issue_type"],
                arguments["description"],
                datalog=arguments.get("datalog"),
                error=arguments.get("error"),
            ),
            "minigraf_audit": lambda: handle_minigraf_audit(as_of=arguments.get("as_of")),
            "minigraf_ingest_status": lambda: handle_minigraf_ingest_status(),
        }
        invoke = sync_calls.get(name)
        if invoke is None:
            raise ValueError(f"Unknown tool: {name}")
        return _json_reply(invoke())
    finally:
        # Release the file lock after every tool call so that the prepare_hook
        # subprocess can open the DB between turns. get_db() re-opens on demand.
        _db = None
|
|
2710
|
+
|
|
2711
|
+
|
|
2712
|
+
async def main() -> None:
    """Start the server on stdio, optionally kicking off git ingestion first.

    The progress record is reset to ``idle``. Unless the environment variable
    ``MINIGRAF_NO_AUTO_INGEST`` is set to a non-empty value (used by eval
    sandboxes), an incremental ingestion of the current working directory at
    ``HEAD`` is scheduled as a background asyncio task so it starts without
    waiting for a user prompt. The server then runs over the stdio streams.
    """
    global _server_ref, _ingest_task, _ingest_progress
    _server_ref = server

    _ingest_progress = dict(
        status="idle",
        processed=0,
        total=0,
        current_commit="",
        error=None,
    )

    skip_auto_ingest = os.environ.get("MINIGRAF_NO_AUTO_INGEST")
    if not skip_auto_ingest:
        # Runs as a background task — never blocks the message loop.
        _ingest_task = asyncio.create_task(_run_ingestion(str(Path.cwd()), "HEAD"))

    async with stdio_server() as (incoming, outgoing):
        init_options = server.create_initialization_options()
        await server.run(incoming, outgoing, init_options)
|
|
2731
|
+
|
|
2732
|
+
|
|
2733
|
+
if __name__ == "__main__":
    # Executed as a script: drive the async entry point to completion.
    asyncio.run(main())
|