mneme-graph 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mneme_graph/__init__.py +15 -0
- mneme_graph/analytics.py +372 -0
- mneme_graph/cli.py +363 -0
- mneme_graph/extractor/__init__.py +83 -0
- mneme_graph/extractor/javascript_extractor.py +753 -0
- mneme_graph/extractor/python_extractor.py +823 -0
- mneme_graph/schema.py +203 -0
- mneme_graph/store.py +130 -0
- mneme_graph-3.1.0.dist-info/METADATA +58 -0
- mneme_graph-3.1.0.dist-info/RECORD +12 -0
- mneme_graph-3.1.0.dist-info/WHEEL +4 -0
- mneme_graph-3.1.0.dist-info/entry_points.txt +2 -0
mneme_graph/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""mneme-graph: local code knowledge graph with confidence-labelled nodes.
|
|
2
|
+
|
|
3
|
+
Derives a rebuildable graph from source files via tree-sitter extraction.
|
|
4
|
+
The ground truth is always the source files; graph.json is a derived artifact
|
|
5
|
+
that can be deleted and rebuilt identically from the same source.
|
|
6
|
+
|
|
7
|
+
Public surface:
|
|
8
|
+
|
|
9
|
+
from mneme_graph.schema import GraphNode, GraphEdge, ConfidenceLabel
|
|
10
|
+
from mneme_graph.store import GraphStore
|
|
11
|
+
from mneme_graph.extractor.python_extractor import extract_file
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
# Version string exposed at runtime.  It has to agree with the distribution
# metadata: this module ships inside the mneme-graph 3.1.0 wheel, so the
# previous value ("0.2.0") was stale and misreported the installed release.
__version__ = "3.1.0"
# Only the version is re-exported from the package root; the remaining public
# names are imported from their submodules.
__all__ = ["__version__"]
|
mneme_graph/analytics.py
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
"""Derived and inferred analytics over the extracted mneme code graph.
|
|
2
|
+
|
|
3
|
+
All functions in this module are PURE: they operate on list[GraphNode] and
|
|
4
|
+
list[GraphEdge], never mutate their inputs, perform no IO, make no network
|
|
5
|
+
calls, and are fully deterministic.
|
|
6
|
+
|
|
7
|
+
Confidence-label invariant
|
|
8
|
+
--------------------------
|
|
9
|
+
Nodes and edges stored in the graph carry ``confidence="EXTRACTED"`` because
|
|
10
|
+
they were directly observed from source-code ASTs. The outputs of this
|
|
11
|
+
module are one level removed:
|
|
12
|
+
|
|
13
|
+
* ``Community`` objects are **INFERRED** — connected-component membership is
|
|
14
|
+
derived from structural patterns, not directly observable in any single
|
|
15
|
+
source location.
|
|
16
|
+
* ``MergeCandidate`` objects are **AMBIGUOUS** — duplicate detection is based
|
|
17
|
+
on heuristics (same name/kind/path, different ``line_start``); the evidence
|
|
18
|
+
is present but may be contradictory or incomplete.
|
|
19
|
+
* ``ImpactResult`` and ``apply_merge`` return plain node/edge data; callers
|
|
20
|
+
retain whatever confidence labels the underlying objects carry.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from collections import deque
|
|
26
|
+
from collections.abc import Iterable
|
|
27
|
+
from dataclasses import dataclass, field
|
|
28
|
+
|
|
29
|
+
from .schema import ConfidenceLabel, GraphEdge, GraphNode
|
|
30
|
+
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
# Community detection (connected components)
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
# Edge kinds that count as a connection when grouping nodes into communities;
# edges of any other kind are skipped by the community detection below.
_COMMUNITY_EDGE_KINDS = frozenset({"calls", "inherits", "imports"})
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True)
class Community:
    """One connected component of the undirected graph projection.

    Instances are immutable.  The confidence label defaults to ``"INFERRED"``
    because component membership is derived from structure rather than read
    from a single source location.

    Attributes:
        community_id  Lexicographically smallest ``node_id`` in the component.
        node_ids      Sorted tuple holding every ``node_id`` of the component.
        confidence    Confidence label of the grouping; ``"INFERRED"`` unless
                      the caller overrides it.
    """

    community_id: str
    node_ids: tuple[str, ...]
    confidence: ConfidenceLabel = "INFERRED"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def detect_communities(
    nodes: list[GraphNode],
    edges: list[GraphEdge],
) -> list[Community]:
    """Group *nodes* into connected components of the undirected projection.

    An edge links its two endpoints only when its ``kind`` belongs to
    ``_COMMUNITY_EDGE_KINDS`` and both endpoints are ids found in *nodes*;
    every other edge is skipped without error.  Direction is ignored.  A node
    with no qualifying edge forms a community of its own.

    Args:
        nodes: Nodes to partition; duplicate ``node_id`` values collapse.
        edges: Candidate links between those nodes.

    Returns:
        Communities ordered by ``community_id``, each carrying its member ids
        as a sorted tuple.  The result depends only on the inputs, so repeated
        calls give identical output.
    """
    known = {node.node_id for node in nodes}

    # Symmetric neighbour lists, restricted to ids we actually know about.
    neighbours: dict[str, list[str]] = {nid: [] for nid in known}
    for link in edges:
        if (
            link.kind in _COMMUNITY_EDGE_KINDS
            and link.src_id in known
            and link.dst_id in known
        ):
            neighbours[link.src_id].append(link.dst_id)
            neighbours[link.dst_id].append(link.src_id)

    seen: set[str] = set()
    found: list[Community] = []

    # Seeds are taken in sorted order so the traversal is reproducible.
    for seed in sorted(known):
        if seed in seen:
            continue
        seen.add(seed)
        frontier: deque[str] = deque((seed,))
        members: list[str] = []
        # Breadth-first flood fill of the component containing ``seed``.
        while frontier:
            here = frontier.popleft()
            members.append(here)
            for other in neighbours[here]:
                if other in seen:
                    continue
                seen.add(other)
                frontier.append(other)
        members.sort()
        found.append(Community(community_id=members[0], node_ids=tuple(members)))

    found.sort(key=lambda community: community.community_id)
    return found
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# ---------------------------------------------------------------------------
|
|
110
|
+
# PR impact analysis (reverse BFS)
|
|
111
|
+
# ---------------------------------------------------------------------------
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@dataclass(frozen=True)
class ImpactResult:
    """Immutable outcome of a PR impact query.

    Attributes:
        changed   Sorted tuple of the seed ``node_id`` values that exist in the
                  queried node list; unknown ids are silently dropped.
        affected  Tuple of ``(node_id, distance)`` pairs for nodes reachable
                  upstream from the changed set, excluding the changed nodes
                  themselves.  Ordered by ``(distance, node_id)``.
    """

    changed: tuple[str, ...]
    affected: tuple[tuple[str, int], ...]
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def pr_impact(
    nodes: list[GraphNode],
    edges: list[GraphEdge],
    changed_node_ids: Iterable[str],
    *,
    max_depth: int | None = None,
) -> ImpactResult:
    """Return the nodes transitively affected by changes to *changed_node_ids*.

    Edge semantics: ``src -> dst`` means "``src`` depends on ``dst``".
    Changing ``dst`` therefore affects ``src``.  The traversal follows edges
    in the reverse direction (``dst -> src``), i.e. it finds all nodes that
    (directly or transitively) depend on any changed node.  Edges of every
    kind are followed; edges with an endpoint missing from *nodes* are
    ignored.

    Args:
        nodes: Full node list; defines the valid node universe.
        edges: Full edge list.
        changed_node_ids: Seed ids.  Ids not found in *nodes* are ignored.
            A single ``str`` is accepted and treated as one seed id rather
            than being iterated character by character.
        max_depth: Maximum hop distance to explore (``None`` = unbounded).
            With ``0`` (or a negative value) nothing beyond the seeds is
            visited, so ``affected`` is empty.

    Returns:
        ``ImpactResult`` whose ``changed`` holds the sorted, known seed ids
        and whose ``affected`` holds ``(node_id, distance)`` pairs sorted by
        ``(distance, node_id)``, seeds excluded.
    """
    # A bare ``str`` is itself an iterable of one-character strings, so a lone
    # id passed by mistake would be split letter by letter and silently match
    # nothing.  Interpret it as a single seed instead.
    if isinstance(changed_node_ids, str):
        changed_node_ids = (changed_node_ids,)

    node_ids: set[str] = {n.node_id for n in nodes}
    changed_set: set[str] = {nid for nid in changed_node_ids if nid in node_ids}

    # Build reverse adjacency: dst -> list[src] (who depends on dst?).
    rev_adj: dict[str, list[str]] = {nid: [] for nid in node_ids}
    for edge in edges:
        if edge.src_id in node_ids and edge.dst_id in node_ids:
            rev_adj[edge.dst_id].append(edge.src_id)

    # Multi-source BFS from the changed set over reverse edges.  ``distance``
    # doubles as the visited set; the first distance recorded is the shortest.
    distance: dict[str, int] = {}
    queue: deque[str] = deque()
    for nid in sorted(changed_set):  # deterministic seed order
        distance[nid] = 0
        queue.append(nid)

    while queue:
        current = queue.popleft()
        next_dist = distance[current] + 1
        # Do not expand a node whose dependants would exceed the depth limit.
        if max_depth is not None and next_dist > max_depth:
            continue
        for upstream in rev_adj[current]:
            if upstream not in distance:
                distance[upstream] = next_dist
                queue.append(upstream)

    affected: list[tuple[str, int]] = sorted(
        [(nid, dist) for nid, dist in distance.items() if nid not in changed_set],
        key=lambda t: (t[1], t[0]),
    )

    return ImpactResult(
        changed=tuple(sorted(changed_set)),
        affected=tuple(affected),
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def changed_nodes_for_paths(
    nodes: list[GraphNode],
    paths: Iterable[str],
) -> list[str]:
    """Return sorted, deduplicated ``node_id`` values for nodes in *paths*.

    A node matches when its ``source_path`` is exactly equal to one of the
    given paths; no normalisation is applied to either side.

    Args:
        nodes: Full node list to search.
        paths: Iterable of ``source_path`` strings to match against.  A single
            ``str`` is accepted and treated as one path rather than being
            iterated character by character.

    Returns:
        Sorted list of matching ``node_id`` values (empty when nothing
        matches).
    """
    # A bare ``str`` is itself an iterable of one-character strings;
    # ``set("a.py")`` would become {"a", ".", "p", "y"} and silently match no
    # node.  Interpret a lone string as a single path instead.
    path_set: set[str] = {paths} if isinstance(paths, str) else set(paths)
    return sorted({n.node_id for n in nodes if n.source_path in path_set})
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# ---------------------------------------------------------------------------
|
|
209
|
+
# Ghost-duplicate / merge candidate detection
|
|
210
|
+
# ---------------------------------------------------------------------------
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
@dataclass(frozen=True)
class MergeCandidate:
    """A group of nodes that look like the same logical symbol.

    Ghost duplicates appear when ``line_start`` drift makes the extractor
    hand out a fresh ``node_id`` for a symbol that has not meaningfully
    changed.  Only **local** nodes (``source_path != "<external>"``) are
    considered.  Instances are immutable.

    Attributes:
        key         Grouping key ``(source_path, name, kind)``.
        node_ids    Sorted tuple of the duplicate ``node_id`` values
                    (len >= 2).
        confidence  Confidence label; ``"AMBIGUOUS"`` unless overridden,
                    because the heuristic may be wrong.
    """

    key: tuple[str, str, str]
    node_ids: tuple[str, ...]
    confidence: ConfidenceLabel = "AMBIGUOUS"
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def find_merge_candidates(nodes: list[GraphNode]) -> list[MergeCandidate]:
    """Find ghost-duplicate local nodes sharing ``(source_path, name, kind)``.

    Nodes whose ``source_path`` is ``"<external>"`` are skipped.  The
    remaining nodes are bucketed by ``(source_path, name, str(kind))``.  A
    bucket is reported only when it satisfies both conditions:

    * it spans at least two distinct ``content_hash`` values - nodes sharing
      one hash come from the same file version and are treated as genuinely
      separate symbols (e.g. two methods named ``render`` in different
      classes);
    * it contains at least two distinct ``node_id`` values.

    Args:
        nodes: Nodes to inspect.

    Returns:
        Candidates ordered by ``key``; ``node_ids`` inside each candidate is
        sorted.
    """
    # Bucket (node_id, content_hash) pairs under their grouping key.
    buckets: dict[tuple[str, str, str], list[tuple[str, str]]] = {}
    for node in nodes:
        if node.source_path != "<external>":
            group_key = (node.source_path, node.name, str(node.kind))
            buckets.setdefault(group_key, []).append(
                (node.node_id, node.content_hash)
            )

    found: list[MergeCandidate] = []
    for group_key, pairs in buckets.items():
        # A single content hash means a single file version: not a ghost.
        if len({content_hash for _, content_hash in pairs}) >= 2:
            unique_ids = tuple(sorted({node_id for node_id, _ in pairs}))
            if len(unique_ids) > 1:
                found.append(MergeCandidate(key=group_key, node_ids=unique_ids))

    found.sort(key=lambda candidate: candidate.key)
    return found
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
# ---------------------------------------------------------------------------
|
|
279
|
+
# Merge application
|
|
280
|
+
# ---------------------------------------------------------------------------
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _resolve_canonical(node_id: str, canonical_map: dict[str, str]) -> str:
    """Walk *canonical_map* from *node_id* until the mapping stops or loops.

    Args:
        node_id: Id to resolve.
        canonical_map: ``duplicate_id -> canonical_id`` mapping; may contain
            chains and cycles.

    Returns:
        For a plain chain, the terminal id (the first id reached that is not
        a key of *canonical_map*).  When the walk runs into a cycle - directly
        or after a non-cyclic lead-in - the lexicographically smallest id
        among the cycle members, so every member of a cycle resolves to the
        same single id.
    """
    position: dict[str, int] = {}  # visited id -> index along the walk
    walk: list[str] = []
    cursor = node_id
    while cursor in canonical_map:
        first_seen = position.get(cursor)
        if first_seen is not None:
            # Back at an id already visited: everything from its first
            # occurrence onwards forms the cycle.
            return min(walk[first_seen:])
        position[cursor] = len(walk)
        walk.append(cursor)
        cursor = canonical_map[cursor]
    # Fell off the mapping: ``cursor`` is the terminal id.
    return cursor
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def apply_merge(
    nodes: list[GraphNode],
    edges: list[GraphEdge],
    canonical_map: dict[str, str],
) -> tuple[list[GraphNode], list[GraphEdge]]:
    """Collapse duplicate nodes onto their canonical ids.

    *canonical_map* maps ``duplicate_node_id -> canonical_node_id``.  Each key
    is resolved transitively with ``_resolve_canonical``: ``a -> b`` plus
    ``b -> c`` sends ``a`` to ``c``, and the members of a cycle all resolve to
    the lexicographically smallest id in that cycle.

    Rules:
      * A node is dropped when its ``node_id`` is a key of the map and
        resolves to a different id.
      * Each edge has ``src_id`` and ``dst_id`` rewritten to their resolved
        ids.
      * An edge is dropped when the rewrite turns it into a self-loop, or
        when either rewritten endpoint is not among the surviving nodes.
      * Remaining edges are rebuilt through ``GraphEdge.make`` (keeping
        ``kind``, ``confidence`` and ``valid_at``) and deduplicated by
        ``edge_id``; the last edge producing a given ``edge_id`` wins.
      * The input lists are left untouched; new lists are returned.

    Args:
        nodes: Current node list.
        edges: Current edge list.
        canonical_map: Duplicate-to-canonical id mapping.

    Returns:
        ``(new_nodes, new_edges)`` with nodes sorted by ``node_id`` and edges
        sorted by ``edge_id``.
    """
    # Final target of every key after following chains and cycles.
    final_target: dict[str, str] = {}
    for dup_id in canonical_map:
        final_target[dup_id] = _resolve_canonical(dup_id, canonical_map)

    # Ids that are merged away, i.e. keys that do not resolve to themselves.
    merged_away = {
        dup_id for dup_id, target in final_target.items() if target != dup_id
    }

    kept_nodes = [node for node in nodes if node.node_id not in merged_away]
    kept_nodes.sort(key=lambda node: node.node_id)
    kept_ids = {node.node_id for node in kept_nodes}

    # Keyed by edge_id so a later edge with the same id replaces an earlier one.
    rebuilt: dict[str, GraphEdge] = {}
    for edge in edges:
        src = final_target.get(edge.src_id, edge.src_id)
        dst = final_target.get(edge.dst_id, edge.dst_id)
        # Keep only real links between two surviving nodes: no self-loops
        # created by the merge, no endpoint pointing at a missing node.
        if src != dst and src in kept_ids and dst in kept_ids:
            replacement = GraphEdge.make(
                src_id=src,
                dst_id=dst,
                kind=edge.kind,
                confidence=edge.confidence,
                valid_at=edge.valid_at,
            )
            rebuilt[replacement.edge_id] = replacement

    kept_edges = sorted(rebuilt.values(), key=lambda item: item.edge_id)
    return kept_nodes, kept_edges
|