mneme-graph 3.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,88 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ share/python-wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+ .pytest_cache/
25
+ .coverage
26
+ .coverage.*
27
+ .cache
28
+ nosetests.xml
29
+ coverage.xml
30
+ *.cover
31
+ .hypothesis/
32
+ htmlcov/
33
+ .mypy_cache/
34
+ .ruff_cache/
35
+
36
+ # Virtual envs
37
+ .env
38
+ .venv
39
+ env/
40
+ venv/
41
+ ENV/
42
+ env.bak/
43
+ venv.bak/
44
+
45
+ # Node
46
+ node_modules/
47
+ npm-debug.log*
48
+ yarn-debug.log*
49
+ yarn-error.log*
50
+ pnpm-debug.log*
51
+ .pnpm-store/
52
+ *.tsbuildinfo
53
+
54
+ # TypeScript
55
+ *.js.map
56
+ *.d.ts.map
57
+
58
+ # IDE
59
+ .vscode/
60
+ .idea/
61
+ *.swp
62
+ *.swo
63
+ .DS_Store
64
+
65
+ # OS
66
+ Thumbs.db
67
+ desktop.ini
68
+
69
+ # mneme local
70
+ .mneme/
71
+ *.local.json
72
+ vault-test/
73
+ benchmarks/results/
74
+ benchmarks/_runs/
75
+ benchmarks/*/output/
76
+ benchmarks/*/result.json
77
+ benchmarks/*/hardware.json
78
+
79
+ # Secrets
80
+ .env.local
81
+ .env.*.local
82
+ secrets/
83
+ *.key
84
+ *.pem
85
+
86
+ # Build artifacts
87
+ *.tgz
88
+ *.whl
@@ -0,0 +1,58 @@
1
+ Metadata-Version: 2.4
2
+ Name: mneme-graph
3
+ Version: 3.1.0
4
+ Summary: Local code knowledge graph for mneme: tree-sitter extraction, atomic GraphStore, confidence-labelled nodes.
5
+ Author: Onour Impram
6
+ License: Apache-2.0
7
+ Keywords: ast,code-graph,knowledge-graph,mneme,tree-sitter
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Programming Language :: Python :: 3.14
15
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
16
+ Requires-Python: >=3.11
17
+ Requires-Dist: mneme-core<4,>=3.0.0
18
+ Requires-Dist: tree-sitter-javascript>=0.21
19
+ Requires-Dist: tree-sitter-python>=0.21
20
+ Requires-Dist: tree-sitter-typescript>=0.21
21
+ Requires-Dist: tree-sitter>=0.21
22
+ Provides-Extra: dev
23
+ Requires-Dist: mypy>=1.10; extra == 'dev'
24
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
25
+ Requires-Dist: pytest>=8.2; extra == 'dev'
26
+ Requires-Dist: ruff>=0.4.7; extra == 'dev'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # mneme-graph
30
+
31
+ Local code knowledge graph for mneme. Extracts GraphNode/GraphEdge from Python
32
+ source files via tree-sitter. Derived and rebuildable — the ground truth is
33
+ always the source files; `graph.json` is a derived artifact.
34
+
35
+ Part of the [mneme](https://github.com/TheGoatPsy/mneme) memory engine.
36
+
37
+ ## Scope (v1)
38
+
39
+ This package is deliberately small and honest about its limits.
40
+
41
+ - **Python only.** Extraction uses tree-sitter for Python source. TypeScript,
42
+ JavaScript, Rust, and other languages are not yet supported.
43
+ - **`calls` resolution is heuristic.** A call edge resolves to a local
44
+ function/method by unqualified name within the same vault, with `INFERRED`
45
+ confidence. There is no cross-file binding or precise symbol resolution; a
46
+ call with no local name match points at an `<external>` node (`EXTRACTED`).
47
+ - **Derived, never source of truth.** `graph.json` is rebuilt from source on
48
+ every `build`; the source files remain the ground truth.
49
+
50
+ ### Deferred (not implemented yet)
51
+
52
+ - Community detection / clustering.
53
+ - Pull-request impact analysis.
54
+ - Entity canonicalization and a merge queue (to avoid ghost-duplicate nodes
55
+ across renames and aliases).
56
+ - Multi-language extraction.
57
+
58
+ These are roadmap items, not present capabilities.
@@ -0,0 +1,30 @@
1
+ # mneme-graph
2
+
3
+ Local code knowledge graph for mneme. Extracts GraphNode/GraphEdge from Python
4
+ source files via tree-sitter. Derived and rebuildable — the ground truth is
5
+ always the source files; `graph.json` is a derived artifact.
6
+
7
+ Part of the [mneme](https://github.com/TheGoatPsy/mneme) memory engine.
8
+
9
+ ## Scope (v1)
10
+
11
+ This package is deliberately small and honest about its limits.
12
+
13
+ - **Python only.** Extraction uses tree-sitter for Python source. TypeScript,
14
+ JavaScript, Rust, and other languages are not yet supported.
15
+ - **`calls` resolution is heuristic.** A call edge resolves to a local
16
+ function/method by unqualified name within the same vault, with `INFERRED`
17
+ confidence. There is no cross-file binding or precise symbol resolution; a
18
+ call with no local name match points at an `<external>` node (`EXTRACTED`).
19
+ - **Derived, never source of truth.** `graph.json` is rebuilt from source on
20
+ every `build`; the source files remain the ground truth.
21
+
22
+ ### Deferred (not implemented yet)
23
+
24
+ - Community detection / clustering.
25
+ - Pull-request impact analysis.
26
+ - Entity canonicalization and a merge queue (to avoid ghost-duplicate nodes
27
+ across renames and aliases).
28
+ - Multi-language extraction.
29
+
30
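+ These are roadmap items for the supported extraction pipeline, not present capabilities. Early pure-function helpers for community detection, PR impact, and merge candidates do ship in this package's analytics code, but they are not listed in the public surface.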
@@ -0,0 +1,15 @@
1
+ """mneme-graph: local code knowledge graph with confidence-labelled nodes.
2
+
3
+ Derives a rebuildable graph from source files via tree-sitter extraction.
4
+ The ground truth is always the source files; graph.json is a derived artifact
5
+ that can be deleted and rebuilt identically from the same source.
6
+
7
+ Public surface:
8
+
9
+ from mneme_graph.schema import GraphNode, GraphEdge, ConfidenceLabel
10
+ from mneme_graph.store import GraphStore
11
+ from mneme_graph.extractor.python_extractor import extract_file
12
+ """
13
+
14
# Keep this in step with the distribution version declared in the package
# metadata (3.1.0); the previous literal "0.2.0" was stale.
__version__ = "3.1.0"
__all__ = ["__version__"]
@@ -0,0 +1,372 @@
1
+ """Derived and inferred analytics over the extracted mneme code graph.
2
+
3
+ All functions in this module are PURE: they operate on list[GraphNode] and
4
+ list[GraphEdge], never mutate their inputs, perform no IO, make no network
5
+ calls, and are fully deterministic.
6
+
7
+ Confidence-label invariant
8
+ --------------------------
9
+ Nodes and edges stored in the graph carry ``confidence="EXTRACTED"`` because
10
+ they were directly observed from source-code ASTs. The outputs of this
11
+ module are one level removed:
12
+
13
+ * ``Community`` objects are **INFERRED** — connected-component membership is
14
+ derived from structural patterns, not directly observable in any single
15
+ source location.
16
+ * ``MergeCandidate`` objects are **AMBIGUOUS** — duplicate detection is based
17
+ on heuristics (same name/kind/path, different ``line_start``); the evidence
18
+ is present but may be contradictory or incomplete.
19
+ * ``ImpactResult`` and ``apply_merge`` return plain node/edge data; callers
20
+ retain whatever confidence labels the underlying objects carry.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ from collections import deque
26
+ from collections.abc import Iterable
27
+ from dataclasses import dataclass, field
28
+
29
+ from .schema import ConfidenceLabel, GraphEdge, GraphNode
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Community detection (connected components)
33
+ # ---------------------------------------------------------------------------
34
+
35
# Edge kinds that are allowed to link two nodes into the same community.
_COMMUNITY_EDGE_KINDS = frozenset({"calls", "inherits", "imports"})


@dataclass(frozen=True)
class Community:
    """One connected component of the undirected projection of the graph.

    Attributes:
        community_id: The lexicographically smallest ``node_id`` in the
            component.
        node_ids: Every ``node_id`` in the component, as a sorted tuple.
        confidence: Defaults to ``"INFERRED"`` because membership is derived
            from graph structure rather than observed at a single source
            location.
    """

    community_id: str
    node_ids: tuple[str, ...]
    confidence: ConfidenceLabel = "INFERRED"
52
+
53
+
54
def detect_communities(
    nodes: list[GraphNode],
    edges: list[GraphEdge],
) -> list[Community]:
    """Group *nodes* into connected components of the undirected graph.

    Two nodes share a community when a path of ``"calls"``, ``"inherits"`` or
    ``"imports"`` edges joins them, ignoring edge direction. Edges of any
    other kind (for example ``"defines"``) and edges with an endpoint missing
    from *nodes* do not contribute. A node with no usable edge forms a
    community of its own.

    The result is deterministic: communities are ordered by ``community_id``
    and each ``node_ids`` tuple is sorted.
    """
    known = {node.node_id for node in nodes}

    # Symmetric neighbour sets restricted to the known node universe.
    neighbours: dict[str, set[str]] = {nid: set() for nid in known}
    for edge in edges:
        if (
            edge.kind in _COMMUNITY_EDGE_KINDS
            and edge.src_id in known
            and edge.dst_id in known
        ):
            neighbours[edge.src_id].add(edge.dst_id)
            neighbours[edge.dst_id].add(edge.src_id)

    unassigned = set(known)
    found: list[Community] = []

    # Seeds are visited in sorted order so the walk is reproducible.
    for seed in sorted(known):
        if seed not in unassigned:
            continue
        unassigned.discard(seed)
        members = [seed]
        frontier: deque[str] = deque([seed])
        while frontier:
            here = frontier.pop()
            for other in neighbours[here]:
                if other in unassigned:
                    unassigned.discard(other)
                    members.append(other)
                    frontier.append(other)
        members.sort()
        found.append(Community(community_id=members[0], node_ids=tuple(members)))

    found.sort(key=lambda community: community.community_id)
    return found
107
+
108
+
109
+ # ---------------------------------------------------------------------------
110
+ # PR impact analysis (reverse BFS)
111
+ # ---------------------------------------------------------------------------
112
+
113
+
114
@dataclass(frozen=True)
class ImpactResult:
    """Outcome of a PR impact query.

    Attributes:
        changed: Sorted tuple of the requested changed ids that exist in the
            node list; ids that are not known are left out.
        affected: ``(node_id, distance)`` pairs for nodes reachable upstream
            of the changed set, not including the changed nodes themselves,
            ordered by ``(distance, node_id)``.
    """

    changed: tuple[str, ...]
    affected: tuple[tuple[str, int], ...]
128
+
129
+
130
def pr_impact(
    nodes: list[GraphNode],
    edges: list[GraphEdge],
    changed_node_ids: Iterable[str],
    *,
    max_depth: int | None = None,
) -> ImpactResult:
    """Find every node that depends, directly or transitively, on a changed node.

    An edge ``src -> dst`` is read as "``src`` depends on ``dst``", so a change
    to ``dst`` propagates to ``src``. The search therefore walks edges
    backwards, from ``dst`` to ``src``, starting at the changed nodes.

    Args:
        nodes: The complete node list; it defines which ids are valid.
        edges: The complete edge list. Edges with an endpoint outside *nodes*
            are skipped.
        changed_node_ids: Starting ids. Ids absent from *nodes* are dropped.
        max_depth: Largest hop distance to report; ``None`` means no limit.

    Returns:
        An ``ImpactResult`` holding the valid changed ids and the affected
        ``(node_id, distance)`` pairs.
    """
    known = {node.node_id for node in nodes}
    seeds = sorted({nid for nid in changed_node_ids if nid in known})

    # dependents[x] lists the nodes that have an edge pointing at x.
    dependents: dict[str, list[str]] = {nid: [] for nid in known}
    for edge in edges:
        if edge.src_id in known and edge.dst_id in known:
            dependents[edge.dst_id].append(edge.src_id)

    # Expand one hop at a time; the first level that reaches a node fixes its
    # distance, which is the shortest distance from any seed.
    hops: dict[str, int] = dict.fromkeys(seeds, 0)
    frontier = seeds
    depth = 0
    while frontier and (max_depth is None or depth < max_depth):
        depth += 1
        reached: list[str] = []
        for nid in frontier:
            for dependant in dependents[nid]:
                if dependant not in hops:
                    hops[dependant] = depth
                    reached.append(dependant)
        frontier = reached

    # Only the seeds sit at distance 0, so filtering on distance excludes them.
    affected = sorted(
        ((nid, dist) for nid, dist in hops.items() if dist > 0),
        key=lambda pair: (pair[1], pair[0]),
    )
    return ImpactResult(changed=tuple(seeds), affected=tuple(affected))
189
+
190
+
191
def changed_nodes_for_paths(
    nodes: list[GraphNode],
    paths: Iterable[str],
) -> list[str]:
    """Return sorted, deduplicated ``node_id`` values for nodes in *paths*.

    Args:
        nodes: Full node list to search.
        paths: ``source_path`` strings to match against, compared by exact
            string equality. A single bare string is accepted and treated as
            one path.

    Returns:
        Sorted list of the ``node_id`` values whose ``source_path`` is one of
        *paths*; empty when nothing matches.
    """
    # A bare str is itself iterable: set("a.py") would yield single characters
    # and silently match nothing, so wrap a lone string as one path instead.
    path_set: set[str] = {paths} if isinstance(paths, str) else set(paths)
    return sorted({n.node_id for n in nodes if n.source_path in path_set})
206
+
207
+
208
+ # ---------------------------------------------------------------------------
209
+ # Ghost-duplicate / merge candidate detection
210
+ # ---------------------------------------------------------------------------
211
+
212
+
213
@dataclass(frozen=True)
class MergeCandidate:
    """A group of nodes that look like the same logical symbol.

    Such "ghost duplicates" appear when a shift in ``line_start`` makes the
    extractor hand out a fresh ``node_id`` for a symbol that has not really
    changed. Only local nodes (``source_path != "<external>"``) are grouped.

    Attributes:
        key: The ``(source_path, name, kind)`` triple the nodes share.
        node_ids: Sorted tuple of the duplicate ``node_id`` values (two or
            more).
        confidence: Defaults to ``"AMBIGUOUS"`` because the grouping is a
            heuristic and may be wrong.
    """

    key: tuple[str, str, str]
    node_ids: tuple[str, ...]
    confidence: ConfidenceLabel = "AMBIGUOUS"
230
+
231
+
232
def find_merge_candidates(nodes: list[GraphNode]) -> list[MergeCandidate]:
    """Report local nodes that share ``(source_path, name, kind)`` across versions.

    Nodes whose ``source_path`` is ``"<external>"`` are skipped.

    A group is reported only when its nodes carry at least two different
    ``content_hash`` values and at least two different ``node_id`` values.
    Nodes that share the key and also share one ``content_hash`` are treated
    as legitimately separate symbols (for example two methods with the same
    name in different classes) and are not reported.

    Returns:
        Candidates ordered by ``key``, each with a sorted ``node_ids`` tuple.
    """
    ids_by_key: dict[tuple[str, str, str], set[str]] = {}
    hashes_by_key: dict[tuple[str, str, str], set[str]] = {}

    for node in nodes:
        if node.source_path == "<external>":
            continue
        key = (node.source_path, node.name, str(node.kind))
        ids_by_key.setdefault(key, set()).add(node.node_id)
        hashes_by_key.setdefault(key, set()).add(node.content_hash)

    # Keys are unique per group, so walking them in sorted order yields the
    # candidates already ordered by key.
    return [
        MergeCandidate(key=key, node_ids=tuple(sorted(ids_by_key[key])))
        for key in sorted(ids_by_key)
        if len(hashes_by_key[key]) >= 2 and len(ids_by_key[key]) >= 2
    ]
276
+
277
+
278
+ # ---------------------------------------------------------------------------
279
+ # Merge application
280
+ # ---------------------------------------------------------------------------
281
+
282
+
283
+ def _resolve_canonical(node_id: str, canonical_map: dict[str, str]) -> str:
284
+ """Follow canonical_map transitively until fixpoint, handling cycles.
285
+
286
+ For a simple chain (no cycle) the terminal id (not in canonical_map) is
287
+ returned. For a cycle (or a rho-shaped chain whose tail enters a cycle)
288
+ the lexicographically smallest id among the cycle members is returned so
289
+ that exactly one node in the cycle survives apply_merge.
290
+ """
291
+ seen_list: list[str] = []
292
+ seen_set: set[str] = set()
293
+ current = node_id
294
+ while current in canonical_map:
295
+ if current in seen_set:
296
+ # Cycle detected — current is the entry point we've looped back to.
297
+ # The cycle members are the suffix of seen_list starting from current.
298
+ cycle_start = seen_list.index(current)
299
+ cycle_members = seen_list[cycle_start:]
300
+ return min(cycle_members)
301
+ seen_list.append(current)
302
+ seen_set.add(current)
303
+ current = canonical_map[current]
304
+ # No cycle — current is the terminal (not in canonical_map).
305
+ return current
306
+
307
+
308
def apply_merge(
    nodes: list[GraphNode],
    edges: list[GraphEdge],
    canonical_map: dict[str, str],
) -> tuple[list[GraphNode], list[GraphEdge]]:
    """Apply a duplicate-to-canonical mapping, returning new (nodes, edges).

    *canonical_map* maps ``duplicate_node_id -> canonical_node_id``.

    Rules:
    * Every key of *canonical_map* is resolved transitively: with ``a -> b``
      and ``b -> c``, ``a`` resolves to ``c``.
    * A cycle in the map collapses onto the lexicographically smallest id on
      that cycle (see ``_resolve_canonical``); that id survives and the other
      members are treated as duplicates of it.
    * Nodes whose ``node_id`` resolves to a different id are dropped. This
      happens even when the resolved target is not present in *nodes*.
    * Every edge's ``src_id`` and ``dst_id`` are rewritten through the
      resolved map.
    * Self-loops produced by the rewrite (``src_id == dst_id``) are dropped.
    * Edges whose rewritten endpoint is not among the surviving nodes are
      dropped, so the output never references a missing node.
    * Each kept edge is rebuilt via ``GraphEdge.make`` from ``src_id``,
      ``dst_id``, ``kind``, ``confidence`` and ``valid_at``; edges that end up
      with the same ``edge_id`` are deduplicated (last writer wins).
    * The input *nodes* and *edges* lists are never mutated.
    * Output lists are sorted (nodes by ``node_id``, edges by ``edge_id``).

    Args:
        nodes: Full node list.
        edges: Full edge list.
        canonical_map: Mapping of duplicate id to canonical id.

    Returns:
        A ``(new_nodes, new_edges)`` tuple.
    """
    # Collapse every chain up front so each lookup below is a single hop.
    resolved: dict[str, str] = {
        dup: _resolve_canonical(dup, canonical_map) for dup in canonical_map
    }

    # A key that resolves to itself (the survivor of a cycle) is not a
    # duplicate and must be kept.
    non_canonical: set[str] = {dup for dup, canon in resolved.items() if dup != canon}

    new_nodes: list[GraphNode] = sorted(
        (n for n in nodes if n.node_id not in non_canonical),
        key=lambda n: n.node_id,
    )

    # Only ids of nodes that are actually in the output may anchor an edge.
    surviving_node_ids: set[str] = {n.node_id for n in new_nodes}

    # Keyed by edge_id so a later edge with the same id replaces an earlier one.
    rewritten: dict[str, GraphEdge] = {}
    for edge in edges:
        new_src = resolved.get(edge.src_id, edge.src_id)
        new_dst = resolved.get(edge.dst_id, edge.dst_id)
        if new_src == new_dst:
            # Self-loop (typically both endpoints merged into one node).
            continue
        if new_src not in surviving_node_ids or new_dst not in surviving_node_ids:
            # Would dangle: the rewritten endpoint has no node in the output.
            continue
        new_edge = GraphEdge.make(
            src_id=new_src,
            dst_id=new_dst,
            kind=edge.kind,
            confidence=edge.confidence,
            valid_at=edge.valid_at,
        )
        rewritten[new_edge.edge_id] = new_edge

    new_edges: list[GraphEdge] = sorted(rewritten.values(), key=lambda e: e.edge_id)

    return new_nodes, new_edges