graphmemory 1.1.2__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {graphmemory-1.1.2 → graphmemory-1.2.0}/PKG-INFO +4 -2
- {graphmemory-1.1.2 → graphmemory-1.2.0}/README.md +3 -1
- graphmemory-1.2.0/examples/test_ingest.py +147 -0
- graphmemory-1.2.0/graphmemory/__init__.py +4 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/graphmemory/database.py +319 -26
- {graphmemory-1.1.2 → graphmemory-1.2.0}/graphmemory/extraction.py +11 -3
- {graphmemory-1.1.2 → graphmemory-1.2.0}/graphmemory/models.py +6 -0
- graphmemory-1.2.0/input/Genetic Programming1.txt +32173 -0
- graphmemory-1.2.0/input/Genetic Programming2.txt +34325 -0
- graphmemory-1.2.0/input/Genetic Programming3.txt +45720 -0
- graphmemory-1.2.0/input/Genetic Programming4.txt +21664 -0
- graphmemory-1.2.0/input/aimav4.txt +34094 -0
- graphmemory-1.2.0/input/reading_in_plannings.txt +67565 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/pyproject.toml +1 -1
- {graphmemory-1.1.2 → graphmemory-1.2.0}/tests/tests.py +163 -1
- graphmemory-1.2.0/video/public/banner.png +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/GraphMemoryShowcase.tsx +2 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/Root.tsx +1 -1
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/components/CodeBlock.tsx +4 -4
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/components/FeaturePill.tsx +2 -2
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/components/GraphViz.tsx +18 -18
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/components/SectionTitle.tsx +4 -4
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/AlgorithmsScene.tsx +3 -3
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/ExportScene.tsx +2 -2
- graphmemory-1.2.0/video/src/scenes/IntroScene.tsx +97 -0
- graphmemory-1.2.0/video/src/scenes/MergeScene.tsx +234 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/OutroScene.tsx +7 -7
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/RetrievalScene.tsx +2 -2
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/VectorSearchScene.tsx +3 -3
- graphmemory-1.1.2/graphmemory/__init__.py +0 -4
- graphmemory-1.1.2/video/src/scenes/IntroScene.tsx +0 -135
- {graphmemory-1.1.2 → graphmemory-1.2.0}/.gitignore +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/LICENSE +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/examples/dspy_example_typed_pred.py +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/examples/lexical_graph.py +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/examples/openai_example.py +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/graphmemory/algorithms.py +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/requirements.txt +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/package-lock.json +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/package.json +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/components/Background.tsx +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/index.ts +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/ExtractionScene.tsx +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/NodeEdgeScene.tsx +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/QueryBuilderScene.tsx +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/theme.ts +0 -0
- {graphmemory-1.1.2 → graphmemory-1.2.0}/video/tsconfig.json +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: graphmemory
|
|
3
|
-
Version: 1.1.2
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: Graph-based memory system using DuckDB
|
|
5
5
|
Project-URL: Homepage, https://github.com/bradAGI/GraphMemory
|
|
6
6
|
Project-URL: Repository, https://github.com/bradAGI/GraphMemory
|
|
@@ -27,7 +27,9 @@ Description-Content-Type: text/markdown
|
|
|
27
27
|
|
|
28
28
|
[Discord](https://discord.gg/DSS3DmStV8)
|
|
29
29
|
|
|
30
|
-
# GraphMemory
|
|
30
|
+
# GraphMemory - GraphRAG Database
|
|
31
|
+
|
|
32
|
+

|
|
31
33
|
|
|
32
34
|
An embedded graph database for RAG and knowledge graph applications, powered by [DuckDB](https://duckdb.org/). Vector similarity search, full-text search, hybrid search, merge/upsert, graph traversal, and a full GraphRAG retrieval pipeline — all in a single Python package.
|
|
33
35
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
[Discord](https://discord.gg/DSS3DmStV8)
|
|
2
2
|
|
|
3
|
-
# GraphMemory
|
|
3
|
+
# GraphMemory - GraphRAG Database
|
|
4
|
+
|
|
5
|
+

|
|
4
6
|
|
|
5
7
|
An embedded graph database for RAG and knowledge graph applications, powered by [DuckDB](https://duckdb.org/). Vector similarity search, full-text search, hybrid search, merge/upsert, graph traversal, and a full GraphRAG retrieval pipeline — all in a single Python package.
|
|
6
8
|
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""End-to-end test: ingest aimav4.txt using real LLM extraction via DSPy."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
|
9
|
+
|
|
10
|
+
import dspy
|
|
11
|
+
from graphmemory import GraphMemory, MergeStrategy
|
|
12
|
+
from graphmemory.extraction import extract_and_merge
|
|
13
|
+
|
|
14
|
+
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
# --- Configure DSPy with gpt-5-nano ---
|
|
18
|
+
lm = dspy.LM("openai/gpt-5-nano")
|
|
19
|
+
dspy.configure(lm=lm)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def chunk_text(text: str, max_chars: int = 3000) -> list[str]:
    """Split text into paragraph-aware chunks of at most ``max_chars`` characters.

    Paragraphs are the non-blank pieces of ``text`` separated by blank lines
    (``"\\n\\n"``). Consecutive paragraphs are packed into one chunk, re-joined
    with a blank line, for as long as the joined result fits the budget.

    Args:
        text: The text to split.
        max_chars: Upper bound on the length of every returned chunk.

    Returns:
        The chunks in document order; an empty list when ``text`` contains no
        non-blank paragraph.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")

    separator = "\n\n"
    chunks: list[str] = []
    current: list[str] = []
    current_len = 0

    for raw in text.split(separator):
        paragraph = raw.strip()
        if not paragraph:
            continue
        # A paragraph longer than the budget can never fit in one chunk, so it
        # is hard-split into max_chars-sized slices instead of being emitted
        # as an oversized chunk.
        for start in range(0, len(paragraph), max_chars):
            piece = paragraph[start:start + max_chars]
            # Count the separator that join() will insert before this piece;
            # ignoring it lets the joined chunk overshoot max_chars.
            added = len(piece) + (len(separator) if current else 0)
            if current and current_len + added > max_chars:
                chunks.append(separator.join(current))
                current = []
                current_len = 0
                added = len(piece)
            current.append(piece)
            current_len += added

    if current:
        chunks.append(separator.join(current))
    return chunks
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def main():
    """Run the end-to-end ingest demo against ``input/aimav4.txt``.

    Reads the first 100,000 characters of the input file, strips HTML
    comments, splits the text into chunks, runs ``extract_and_merge`` once per
    chunk against an in-memory ``GraphMemory``, resolves remaining duplicates,
    and prints the resulting graph plus a summary. A chunk whose extraction
    fails is logged and skipped so the remaining chunks are still processed.
    """
    input_path = os.path.join(os.path.dirname(__file__), "..", "input", "aimav4.txt")
    # Explicit encoding so decoding does not depend on the platform default.
    # NOTE(review): assumes the input file is UTF-8 — confirm.
    with open(input_path, encoding="utf-8") as f:
        text = f.read(100_000)

    # DOTALL lets the pattern also remove comments that span several lines.
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    chunks = chunk_text(text, max_chars=4000)

    print("=" * 60)
    print("GraphMemory — Real LLM Extraction Test")
    print("=" * 60)
    print(f"Source: aimav4.txt ({len(text)} chars)")
    print(f"Chunks: {len(chunks)}")
    print("LLM: gpt-5-nano via DSPy")

    db = GraphMemory(database=":memory:", vector_length=3)

    print("\n--- Extracting entities & relationships ---")
    total_nodes = 0
    total_edges = 0
    total_merged_nodes = 0
    total_merged_edges = 0

    for i, chunk in enumerate(chunks):
        print(f"\n Chunk {i + 1}/{len(chunks)} ({len(chunk)} chars)...")
        try:
            # Pass each chunk as a single "sentence" to avoid per-sentence LLM calls
            node_results, edge_results = extract_and_merge(
                db,
                chunk,
                match_keys=["name"],
                match_type=True,
                similarity_threshold=0.88,
                sentences=[chunk],  # single LLM call per chunk
            )
            created_n = sum(1 for r in node_results if r.created)
            merged_n = sum(1 for r in node_results if not r.created)
            created_e = sum(1 for r in edge_results if r.created)
            merged_e = sum(1 for r in edge_results if not r.created)

            total_nodes += created_n
            total_merged_nodes += merged_n
            total_edges += created_e
            total_merged_edges += merged_e

            print(f" Nodes: {created_n} new, {merged_n} merged")
            print(f" Edges: {created_e} new, {merged_e} merged")
        except Exception as e:
            # Best-effort demo: keep going, but keep the traceback so a failed
            # chunk can actually be diagnosed.
            logger.warning(" Chunk %d failed: %s", i + 1, e, exc_info=True)

    # --- Post-extraction dedupe ---
    print("\n--- Post-extraction duplicate resolution ---")
    clusters = db.resolve_duplicates(
        match_keys=["name"],
        match_type=True,
        similarity_threshold=0.90,
    )
    if clusters:
        for c in clusters:
            merged_names = [m.properties.get("name", "?") for m in c.merged]
            print(f" Merged: '{c.survivor.properties.get('name')}' <- {merged_names}")
    else:
        print(" No additional duplicates found.")

    # --- Results ---
    all_nodes = db.nodes_to_json()
    all_edges = db.edges_to_json()

    print("\n--- Final Graph ---")
    print(f" Nodes: {len(all_nodes)}")
    print(f" Edges: {len(all_edges)}")

    type_counts = {}
    for n in all_nodes:
        t = n.get("type", "Unknown")
        type_counts[t] = type_counts.get(t, 0) + 1
    print(f" Types: {type_counts}")

    print("\n--- Extracted Entities ---")
    for n in sorted(all_nodes, key=lambda x: (x.get("type", ""), x.get("properties", {}).get("name", ""))):
        props = n.get("properties", {})
        print(f" [{n.get('type', '?'):15}] {props.get('name', props)}")

    print("\n--- Extracted Relationships ---")
    # Index nodes by id so edge endpoints can be shown by name when available.
    node_id_map = {n["id"]: n for n in all_nodes}
    for e in all_edges:
        src = node_id_map.get(e["source_id"], {}).get("properties", {}).get("name", e["source_id"])
        tgt = node_id_map.get(e["target_id"], {}).get("properties", {}).get("name", e["target_id"])
        print(f" {src} --[{e['relation']}]--> {tgt}")

    print("\n--- Full-text search: 'deep learning' ---")
    results = db.search_nodes("deep learning", limit=5)
    for sr in results:
        print(f" [{sr.node.type}] {sr.node.properties.get('name', '?')} (score={sr.score:.3f})")

    print("\n--- Summary ---")
    print(f" Extracted: {total_nodes} nodes, {total_edges} edges")
    print(f" Fuzzy-merged during ingest: {total_merged_nodes} nodes, {total_merged_edges} edges")
    print(f" Post-dedupe clusters: {len(clusters)}")
    print(f" Final graph: {len(all_nodes)} nodes, {len(all_edges)} edges")

    print("\n" + "=" * 60)
    print("Done!")
    print("=" * 60)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
if __name__ == "__main__":
|
|
147
|
+
main()
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
from .database import GraphMemory, QueryBuilder
|
|
2
|
+
from .models import DuplicateCluster, Edge, EdgeMergeResult, MergeResult, MergeStrategy, NearestNode, Node, RetrievalContext, RetrievalResult, SearchResult, TraversalResult
|
|
3
|
+
|
|
4
|
+
__all__ = ["DuplicateCluster", "Edge", "EdgeMergeResult", "GraphMemory", "MergeResult", "MergeStrategy", "NearestNode", "Node", "QueryBuilder", "RetrievalContext", "RetrievalResult", "SearchResult", "TraversalResult", "algorithms", "extraction"]
|
|
@@ -13,7 +13,7 @@ import xml.etree.ElementTree as ET
|
|
|
13
13
|
from contextlib import contextmanager
|
|
14
14
|
from typing import Any, Dict, List, Union, List
|
|
15
15
|
|
|
16
|
-
from graphmemory.models import Edge, EdgeMergeResult, MergeResult, MergeStrategy, NearestNode, Node, RetrievalContext, RetrievalResult, SearchResult, TraversalResult
|
|
16
|
+
from graphmemory.models import DuplicateCluster, Edge, EdgeMergeResult, MergeResult, MergeStrategy, NearestNode, Node, RetrievalContext, RetrievalResult, SearchResult, TraversalResult
|
|
17
17
|
|
|
18
18
|
logger = logging.getLogger(__name__)
|
|
19
19
|
|
|
@@ -338,25 +338,71 @@ class GraphMemory:
|
|
|
338
338
|
except duckdb.Error as e:
|
|
339
339
|
logger.error(f"Error during bulk delete edges: {e}")
|
|
340
340
|
|
|
341
|
-
def _find_matching_node(
|
|
342
|
-
|
|
341
|
+
def _find_matching_node(
|
|
342
|
+
self, cur, node: Node, match_keys: list[str], match_type: bool,
|
|
343
|
+
similarity_threshold: float = 1.0,
|
|
344
|
+
vector_threshold: float | None = None,
|
|
345
|
+
) -> Node | None:
|
|
346
|
+
"""Find an existing node matching the given property keys and optional type.
|
|
347
|
+
|
|
348
|
+
When ``similarity_threshold`` is 1.0 (default), matching is exact.
|
|
349
|
+
Lower values enable fuzzy matching via DuckDB's ``jaro_winkler_similarity``.
|
|
350
|
+
When ``vector_threshold`` is set and the node has a vector, candidates must
|
|
351
|
+
also have a cosine distance within that threshold.
|
|
352
|
+
"""
|
|
353
|
+
fuzzy = similarity_threshold < 1.0
|
|
354
|
+
|
|
355
|
+
# Separate param lists for SELECT expressions vs WHERE clauses,
|
|
356
|
+
# since DuckDB binds positional params in statement order.
|
|
357
|
+
select_extra: list[str] = []
|
|
358
|
+
select_params: list = []
|
|
343
359
|
where_parts: list[str] = []
|
|
344
|
-
|
|
360
|
+
where_params: list = []
|
|
361
|
+
|
|
345
362
|
if match_type and node.type is not None:
|
|
346
363
|
where_parts.append("type = ?")
|
|
347
|
-
|
|
364
|
+
where_params.append(node.type)
|
|
365
|
+
|
|
348
366
|
for key in match_keys:
|
|
349
367
|
if not self._VALID_ATTRIBUTE_RE.match(key):
|
|
350
368
|
raise ValueError(f"Invalid match key: {key!r}")
|
|
351
369
|
value = (node.properties or {}).get(key)
|
|
352
370
|
if value is None:
|
|
353
371
|
where_parts.append(f"json_extract(properties, '$.{key}') IS NULL")
|
|
372
|
+
elif fuzzy and isinstance(value, str):
|
|
373
|
+
alias = f"sim_{key}"
|
|
374
|
+
select_extra.append(
|
|
375
|
+
f"jaro_winkler_similarity(json_extract_string(properties, '$.{key}'), ?) AS {alias}"
|
|
376
|
+
)
|
|
377
|
+
select_params.append(value)
|
|
378
|
+
where_parts.append(f"{alias} >= ?")
|
|
379
|
+
where_params.append(similarity_threshold)
|
|
354
380
|
else:
|
|
355
381
|
where_parts.append(f"json_extract(properties, '$.{key}') = ?")
|
|
356
|
-
|
|
357
|
-
|
|
382
|
+
where_params.append(json.dumps(value))
|
|
383
|
+
|
|
384
|
+
if vector_threshold is not None and node.vector:
|
|
385
|
+
where_parts.append(f"array_cosine_distance(vector, CAST(? AS FLOAT[{self.vector_length}])) <= ?")
|
|
386
|
+
where_params.extend([node.vector, vector_threshold])
|
|
387
|
+
|
|
388
|
+
if not where_parts and not select_extra:
|
|
358
389
|
return None
|
|
359
|
-
|
|
390
|
+
|
|
391
|
+
select_cols = "id, type, properties, vector"
|
|
392
|
+
if select_extra:
|
|
393
|
+
select_cols += ", " + ", ".join(select_extra)
|
|
394
|
+
|
|
395
|
+
where_clause = " AND ".join(where_parts) if where_parts else "TRUE"
|
|
396
|
+
|
|
397
|
+
order_clause = ""
|
|
398
|
+
if fuzzy:
|
|
399
|
+
sim_cols = [f"sim_{k}" for k in match_keys
|
|
400
|
+
if isinstance((node.properties or {}).get(k), str)]
|
|
401
|
+
if sim_cols:
|
|
402
|
+
order_clause = " ORDER BY " + " + ".join(sim_cols) + " DESC"
|
|
403
|
+
|
|
404
|
+
query = f"SELECT {select_cols} FROM nodes WHERE {where_clause}{order_clause} LIMIT 1;"
|
|
405
|
+
params = select_params + where_params
|
|
360
406
|
row = cur.execute(query, params).fetchone()
|
|
361
407
|
if row:
|
|
362
408
|
return Node(id=row[0], type=row[1], properties=json.loads(row[2]), vector=row[3])
|
|
@@ -373,11 +419,53 @@ class GraphMemory:
|
|
|
373
419
|
return existing or {}
|
|
374
420
|
return incoming or {}
|
|
375
421
|
|
|
422
|
+
def _safe_update_node(self, cur, node_id: str, node_type, properties: dict, vector) -> None:
    """Write new type/properties/vector to a node, tolerating FK errors.

    A plain ``UPDATE`` is attempted first. If DuckDB rejects it with a
    constraint error (it implements UPDATE as delete + reinsert, which trips
    the foreign keys held by edges touching the node), the node's edges are
    stashed in memory, deleted, the update is retried, and the edges are
    re-inserted unchanged.

    Args:
        cur: Open database cursor used for every statement.
        node_id: Id of the node row to update.
        node_type: New value for the ``type`` column.
        properties: New properties; stored as a JSON string.
        vector: New value for the ``vector`` column.
    """
    update_sql = "UPDATE nodes SET type = ?, properties = ?, vector = ? WHERE id = ?;"
    stash_sql = (
        "SELECT id, source_id, target_id, relation, weight FROM edges "
        "WHERE source_id = ? OR target_id = ?;"
    )
    restore_sql = (
        "INSERT INTO edges (id, source_id, target_id, relation, weight) "
        "VALUES (?, ?, ?, ?, ?);"
    )

    def write_node() -> None:
        cur.execute(update_sql, (node_type, json.dumps(properties), vector, node_id))

    try:
        write_node()
    except duckdb.ConstraintException:
        # Stash edges, update node, restore edges
        stashed = cur.execute(stash_sql, (node_id, node_id)).fetchall()
        for edge_row in stashed:
            cur.execute("DELETE FROM edges WHERE id = ?;", (edge_row[0],))
        write_node()
        for edge_id, source, target, relation, weight in stashed:
            cur.execute(restore_sql, (edge_id, source, target, relation, weight))
|
|
453
|
+
|
|
454
|
+
@staticmethod
def normalize_relation(relation: str) -> str:
    """Return *relation* in canonical form: lowercase, underscore-separated.

    Surrounding whitespace is stripped, the text is lowercased, every run of
    whitespace, hyphens, dots and underscores becomes a single underscore, and
    leading/trailing underscores are removed.
    """
    lowered = relation.strip().lower()
    # One pass over separators and underscores together gives the same result
    # as replacing separators first and collapsing repeated underscores after.
    collapsed = re.sub(r'[\s\-\._]+', '_', lowered)
    return collapsed.strip('_')
|
|
461
|
+
|
|
376
462
|
@with_retry()
|
|
377
463
|
def merge_node(self, node: Node, match_keys: list[str],
|
|
378
464
|
match_type: bool = True,
|
|
379
465
|
strategy: MergeStrategy = MergeStrategy.UPDATE,
|
|
380
|
-
update_vector: bool = True
|
|
466
|
+
update_vector: bool = True,
|
|
467
|
+
similarity_threshold: float = 1.0,
|
|
468
|
+
vector_threshold: float | None = None) -> MergeResult:
|
|
381
469
|
"""Insert a node or update it if a match is found by property keys.
|
|
382
470
|
|
|
383
471
|
Args:
|
|
@@ -401,15 +489,16 @@ class GraphMemory:
|
|
|
401
489
|
try:
|
|
402
490
|
with self.transaction():
|
|
403
491
|
cur = self.cursor()
|
|
404
|
-
existing = self._find_matching_node(
|
|
492
|
+
existing = self._find_matching_node(
|
|
493
|
+
cur, node, match_keys, match_type,
|
|
494
|
+
similarity_threshold=similarity_threshold,
|
|
495
|
+
vector_threshold=vector_threshold,
|
|
496
|
+
)
|
|
405
497
|
if existing:
|
|
406
498
|
merged_props = self._merge_properties(existing.properties, node.properties, strategy)
|
|
407
499
|
vector = node.vector if update_vector and node.vector else existing.vector
|
|
408
500
|
node_type = node.type if node.type is not None else existing.type
|
|
409
|
-
cur.
|
|
410
|
-
"UPDATE nodes SET type = ?, properties = ?, vector = ? WHERE id = ?;",
|
|
411
|
-
(node_type, json.dumps(merged_props), vector, str(existing.id))
|
|
412
|
-
)
|
|
501
|
+
self._safe_update_node(cur, str(existing.id), node_type, merged_props, vector)
|
|
413
502
|
self._fts_dirty = True
|
|
414
503
|
result_node = Node(id=existing.id, type=node_type, properties=merged_props, vector=vector)
|
|
415
504
|
return MergeResult(node=result_node, created=False)
|
|
@@ -429,7 +518,9 @@ class GraphMemory:
|
|
|
429
518
|
def bulk_merge_nodes(self, nodes: list[Node], match_keys: list[str],
|
|
430
519
|
match_type: bool = True,
|
|
431
520
|
strategy: MergeStrategy = MergeStrategy.UPDATE,
|
|
432
|
-
update_vector: bool = True
|
|
521
|
+
update_vector: bool = True,
|
|
522
|
+
similarity_threshold: float = 1.0,
|
|
523
|
+
vector_threshold: float | None = None) -> list[MergeResult]:
|
|
433
524
|
"""Merge multiple nodes, inserting new ones and updating matches.
|
|
434
525
|
|
|
435
526
|
Runs in a single transaction for atomicity.
|
|
@@ -448,15 +539,16 @@ class GraphMemory:
|
|
|
448
539
|
if node.vector and not self._validate_vector(node.vector):
|
|
449
540
|
logger.error(f"Invalid vector for node, skipping: {node.id}")
|
|
450
541
|
continue
|
|
451
|
-
existing = self._find_matching_node(
|
|
542
|
+
existing = self._find_matching_node(
|
|
543
|
+
cur, node, match_keys, match_type,
|
|
544
|
+
similarity_threshold=similarity_threshold,
|
|
545
|
+
vector_threshold=vector_threshold,
|
|
546
|
+
)
|
|
452
547
|
if existing:
|
|
453
548
|
merged_props = self._merge_properties(existing.properties, node.properties, strategy)
|
|
454
549
|
vector = node.vector if update_vector and node.vector else existing.vector
|
|
455
550
|
node_type = node.type if node.type is not None else existing.type
|
|
456
|
-
cur.
|
|
457
|
-
"UPDATE nodes SET type = ?, properties = ?, vector = ? WHERE id = ?;",
|
|
458
|
-
(node_type, json.dumps(merged_props), vector, str(existing.id))
|
|
459
|
-
)
|
|
551
|
+
self._safe_update_node(cur, str(existing.id), node_type, merged_props, vector)
|
|
460
552
|
result_node = Node(id=existing.id, type=node_type, properties=merged_props, vector=vector)
|
|
461
553
|
results.append(MergeResult(node=result_node, created=False))
|
|
462
554
|
else:
|
|
@@ -474,10 +566,14 @@ class GraphMemory:
|
|
|
474
566
|
raise
|
|
475
567
|
|
|
476
568
|
def _find_matching_edge(self, cur, edge: Edge) -> Edge | None:
|
|
477
|
-
"""Find an existing edge matching (source_id, target_id, relation).
|
|
569
|
+
"""Find an existing edge matching (source_id, target_id, relation).
|
|
570
|
+
|
|
571
|
+
Relations are compared in normalized form (lowercase, underscored).
|
|
572
|
+
"""
|
|
573
|
+
normalized = self.normalize_relation(edge.relation)
|
|
478
574
|
row = cur.execute(
|
|
479
575
|
"SELECT id, source_id, target_id, relation, weight FROM edges WHERE source_id = ? AND target_id = ? AND relation = ? LIMIT 1;",
|
|
480
|
-
(str(edge.source_id), str(edge.target_id),
|
|
576
|
+
(str(edge.source_id), str(edge.target_id), normalized)
|
|
481
577
|
).fetchone()
|
|
482
578
|
if row:
|
|
483
579
|
return Edge(id=row[0], source_id=row[1], target_id=row[2], relation=row[3], weight=row[4])
|
|
@@ -511,11 +607,13 @@ class GraphMemory:
|
|
|
511
607
|
result_edge = existing
|
|
512
608
|
return EdgeMergeResult(edge=result_edge, created=False)
|
|
513
609
|
else:
|
|
610
|
+
normalized = self.normalize_relation(edge.relation)
|
|
514
611
|
cur.execute(
|
|
515
612
|
"INSERT INTO edges (id, source_id, target_id, relation, weight) VALUES (?, ?, ?, ?, ?);",
|
|
516
|
-
(str(edge.id), str(edge.source_id), str(edge.target_id),
|
|
613
|
+
(str(edge.id), str(edge.source_id), str(edge.target_id), normalized, edge.weight)
|
|
517
614
|
)
|
|
518
|
-
|
|
615
|
+
result_edge = Edge(id=edge.id, source_id=edge.source_id, target_id=edge.target_id, relation=normalized, weight=edge.weight)
|
|
616
|
+
return EdgeMergeResult(edge=result_edge, created=True)
|
|
519
617
|
except duckdb.Error as e:
|
|
520
618
|
logger.error(f"Error during merge edge: {e}")
|
|
521
619
|
raise
|
|
@@ -545,16 +643,211 @@ class GraphMemory:
|
|
|
545
643
|
result_edge = existing
|
|
546
644
|
results.append(EdgeMergeResult(edge=result_edge, created=False))
|
|
547
645
|
else:
|
|
646
|
+
normalized = self.normalize_relation(edge.relation)
|
|
548
647
|
cur.execute(
|
|
549
648
|
"INSERT INTO edges (id, source_id, target_id, relation, weight) VALUES (?, ?, ?, ?, ?);",
|
|
550
|
-
(str(edge.id), str(edge.source_id), str(edge.target_id),
|
|
649
|
+
(str(edge.id), str(edge.source_id), str(edge.target_id), normalized, edge.weight)
|
|
551
650
|
)
|
|
552
|
-
|
|
651
|
+
result_edge = Edge(id=edge.id, source_id=edge.source_id, target_id=edge.target_id, relation=normalized, weight=edge.weight)
|
|
652
|
+
results.append(EdgeMergeResult(edge=result_edge, created=True))
|
|
553
653
|
return results
|
|
554
654
|
except duckdb.Error as e:
|
|
555
655
|
logger.error(f"Error during bulk merge edges: {e}")
|
|
556
656
|
raise
|
|
557
657
|
|
|
658
|
+
@with_retry()
def resolve_duplicates(
    self,
    match_keys: list[str] | None = None,
    match_type: bool = True,
    similarity_threshold: float = 0.9,
    vector_threshold: float | None = None,
    strategy: MergeStrategy = MergeStrategy.UPDATE,
) -> list[DuplicateCluster]:
    """Collapse clusters of likely-duplicate nodes into one survivor each.

    Nodes are visited in id order. For every node not yet handled, the
    remaining nodes are searched for fuzzy matches on the string-valued match
    keys. The visited node is the survivor: matching nodes have their
    properties merged into it, their edges re-pointed at it, and are deleted.

    Args:
        match_keys: Property names to compare (default ``["name"]``).
        match_type: Also require ``node.type`` to match (default ``True``).
        similarity_threshold: Jaro-Winkler threshold for string properties.
        vector_threshold: Max cosine distance for vector similarity (optional).
        strategy: How to merge properties from duplicates into the survivor.

    Returns:
        One :class:`~graphmemory.models.DuplicateCluster` per survivor that
        absorbed at least one duplicate.

    Raises:
        ValueError: If a match key fails ``_VALID_ATTRIBUTE_RE``.
        duckdb.Error: Logged and re-raised on any database failure.
    """
    keys = ["name"] if match_keys is None else match_keys
    for key in keys:
        if not self._VALID_ATTRIBUTE_RE.match(key):
            raise ValueError(f"Invalid match key: {key!r}")

    edges_of_node_sql = (
        "SELECT id, source_id, target_id, relation, weight FROM edges "
        "WHERE source_id = ? OR target_id = ?;"
    )
    insert_edge_sql = (
        "INSERT INTO edges (id, source_id, target_id, relation, weight) "
        "VALUES (?, ?, ?, ?, ?);"
    )
    node_exists_sql = "SELECT 1 FROM nodes WHERE id = ?"

    clusters: list[DuplicateCluster] = []
    try:
        cur = self.cursor()
        snapshot = [
            Node(id=rec[0], type=rec[1], properties=json.loads(rec[2]), vector=rec[3])
            for rec in cur.execute(
                "SELECT id, type, properties, vector FROM nodes ORDER BY id;"
            ).fetchall()
        ]

        seen: set[str] = set()
        for node in snapshot:
            nid = str(node.id)
            if nid in seen:
                continue
            seen.add(nid)

            # SELECT-expression params and WHERE params are collected
            # separately and concatenated in statement order when executing.
            node_props = node.properties or {}
            sim_exprs: list[str] = []
            sim_values: list = []
            conditions: list[str] = ["id != ?"]
            condition_values: list = [nid]

            if match_type and node.type is not None:
                conditions.append("type = ?")
                condition_values.append(node.type)

            for key in keys:
                value = node_props.get(key)
                # Only string values take part in fuzzy matching; missing and
                # non-string values add no condition.
                if not isinstance(value, str):
                    continue
                alias = f"sim_{key}"
                sim_exprs.append(
                    f"jaro_winkler_similarity(json_extract_string(properties, '$.{key}'), ?) AS {alias}"
                )
                sim_values.append(value)
                conditions.append(f"{alias} >= ?")
                condition_values.append(similarity_threshold)

            if vector_threshold is not None and node.vector:
                conditions.append(
                    f"array_cosine_distance(vector, CAST(? AS FLOAT[{self.vector_length}])) <= ?"
                )
                condition_values.extend([node.vector, vector_threshold])

            # Without at least one fuzzy string comparison there is nothing
            # to match on for this node.
            if not sim_exprs:
                continue

            # Exclude already-processed nodes
            handled = [other for other in seen if other != nid]
            if handled:
                marks = ", ".join("?" * len(handled))
                conditions.append(f"id NOT IN ({marks})")
                condition_values.extend(handled)

            select_cols = "id, type, properties, vector, " + ", ".join(sim_exprs)
            query = f"SELECT {select_cols} FROM nodes WHERE {' AND '.join(conditions)};"
            dup_rows = cur.execute(query, sim_values + condition_values).fetchall()

            if not dup_rows:
                continue

            duplicates: list[Node] = []
            survivor_props = dict(node_props)
            survivor_vector = node.vector
            survivor_type = node.type
            rewired: list[tuple] = []

            for rec in dup_rows:
                dup = Node(id=rec[0], type=rec[1], properties=json.loads(rec[2]), vector=rec[3])
                dup_id = str(dup.id)
                seen.add(dup_id)
                duplicates.append(dup)

                survivor_props = self._merge_properties(survivor_props, dup.properties, strategy)
                # The survivor only inherits a vector/type it does not have.
                if not survivor_vector and dup.vector:
                    survivor_vector = dup.vector
                if not survivor_type and dup.type:
                    survivor_type = dup.type

                # Remember this duplicate's edges with the duplicate endpoint
                # replaced by the survivor.
                for eid, src, tgt, rel, wt in cur.execute(edges_of_node_sql, (dup_id, dup_id)).fetchall():
                    rewired.append((
                        eid,
                        nid if src == dup_id else src,
                        nid if tgt == dup_id else tgt,
                        rel,
                        wt,
                    ))

            # Delete edges referencing duplicates
            for dup in duplicates:
                cur.execute(
                    "DELETE FROM edges WHERE source_id = ? OR target_id = ?;",
                    (str(dup.id), str(dup.id))
                )

            # Also temporarily remove edges referencing survivor (DuckDB
            # internally does delete+reinsert on UPDATE, triggering FK checks)
            survivor_edges = cur.execute(edges_of_node_sql, (nid, nid)).fetchall()
            for edge_row in survivor_edges:
                cur.execute("DELETE FROM edges WHERE id = ?;", (edge_row[0],))

            # Delete duplicate nodes
            for dup in duplicates:
                cur.execute("DELETE FROM nodes WHERE id = ?;", (str(dup.id),))

            # Update survivor with merged properties (safe now, no FK refs)
            cur.execute(
                "UPDATE nodes SET type = ?, properties = ?, vector = ? WHERE id = ?;",
                (survivor_type, json.dumps(survivor_props), survivor_vector, nid)
            )

            # Re-insert all edges, verifying both endpoints still exist.
            # Re-pointed edges come first (self-loops dropped), then the
            # survivor's own edges that were not already re-pointed.
            rewired_ids = {edge[0] for edge in rewired}
            pending = [edge for edge in rewired if edge[1] != edge[2]]
            pending.extend(
                (eid, src, tgt, rel, wt)
                for eid, src, tgt, rel, wt in survivor_edges
                if eid not in rewired_ids
            )

            for eid, src, tgt, rel, wt in pending:
                src_exists = cur.execute(node_exists_sql, (str(src),)).fetchone()
                tgt_exists = cur.execute(node_exists_sql, (str(tgt),)).fetchone()
                if src_exists and tgt_exists:
                    cur.execute(insert_edge_sql, (eid, src, tgt, rel, wt))

            clusters.append(
                DuplicateCluster(
                    survivor=Node(
                        id=node.id,
                        type=survivor_type,
                        properties=survivor_props,
                        vector=survivor_vector,
                    ),
                    merged=duplicates,
                )
            )

        if clusters:
            self._fts_dirty = True

        logger.info(
            "Resolved %d duplicate clusters (%d nodes merged).",
            len(clusters),
            sum(len(cluster.merged) for cluster in clusters),
        )
        return clusters
    except duckdb.Error as e:
        logger.error(f"Error during resolve_duplicates: {e}")
        raise
|
|
850
|
+
|
|
558
851
|
@with_retry()
|
|
559
852
|
def delete_edge(self, source_id: uuid.UUID, target_id: uuid.UUID):
|
|
560
853
|
try:
|
|
@@ -115,8 +115,13 @@ def _get_signatures():
|
|
|
115
115
|
# ---------------------------------------------------------------------------
|
|
116
116
|
|
|
117
117
|
|
|
118
|
+
def _make_predictor(dspy, signature):
|
|
119
|
+
"""Create a DSPy v3 predictor from a Signature."""
|
|
120
|
+
return dspy.Predict(signature)
|
|
121
|
+
|
|
122
|
+
|
|
118
123
|
def extract_nodes(text: str, sentences: list[str] | None = None) -> list[Node]:
|
|
119
|
-
"""Extract entity nodes from text using a DSPy
|
|
124
|
+
"""Extract entity nodes from text using a DSPy predictor.
|
|
120
125
|
|
|
121
126
|
Args:
|
|
122
127
|
text: Full text to extract from (used when *sentences* is ``None``).
|
|
@@ -129,7 +134,7 @@ def extract_nodes(text: str, sentences: list[str] | None = None) -> list[Node]:
|
|
|
129
134
|
"""
|
|
130
135
|
dspy = _require_dspy()
|
|
131
136
|
NodeSig, _ = _get_signatures()
|
|
132
|
-
predictor = dspy
|
|
137
|
+
predictor = _make_predictor(dspy, NodeSig)
|
|
133
138
|
|
|
134
139
|
if sentences is None:
|
|
135
140
|
sentences = [s.strip() for s in text.split(".") if s.strip()]
|
|
@@ -166,7 +171,7 @@ def extract_edges(
|
|
|
166
171
|
"""
|
|
167
172
|
dspy = _require_dspy()
|
|
168
173
|
_, EdgeSig = _get_signatures()
|
|
169
|
-
predictor = dspy
|
|
174
|
+
predictor = _make_predictor(dspy, EdgeSig)
|
|
170
175
|
|
|
171
176
|
if sentences is None:
|
|
172
177
|
sentences = [s.strip() for s in text.split(".") if s.strip()]
|
|
@@ -253,6 +258,8 @@ def extract_and_merge(
|
|
|
253
258
|
match_type: bool = True,
|
|
254
259
|
strategy: MergeStrategy = MergeStrategy.UPDATE,
|
|
255
260
|
sentences: list[str] | None = None,
|
|
261
|
+
similarity_threshold: float = 1.0,
|
|
262
|
+
vector_threshold: float | None = None,
|
|
256
263
|
) -> tuple[list[MergeResult], list[EdgeMergeResult]]:
|
|
257
264
|
"""Extract nodes and edges from text, merging with existing graph data.
|
|
258
265
|
|
|
@@ -278,6 +285,7 @@ def extract_and_merge(
|
|
|
278
285
|
|
|
279
286
|
node_results = graph.bulk_merge_nodes(
|
|
280
287
|
nodes, match_keys=match_keys, match_type=match_type, strategy=strategy,
|
|
288
|
+
similarity_threshold=similarity_threshold, vector_threshold=vector_threshold,
|
|
281
289
|
) if nodes else []
|
|
282
290
|
|
|
283
291
|
edge_results = graph.bulk_merge_edges(edges) if edges else []
|