graphmemory 1.1.2__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. {graphmemory-1.1.2 → graphmemory-1.2.0}/PKG-INFO +4 -2
  2. {graphmemory-1.1.2 → graphmemory-1.2.0}/README.md +3 -1
  3. graphmemory-1.2.0/examples/test_ingest.py +147 -0
  4. graphmemory-1.2.0/graphmemory/__init__.py +4 -0
  5. {graphmemory-1.1.2 → graphmemory-1.2.0}/graphmemory/database.py +319 -26
  6. {graphmemory-1.1.2 → graphmemory-1.2.0}/graphmemory/extraction.py +11 -3
  7. {graphmemory-1.1.2 → graphmemory-1.2.0}/graphmemory/models.py +6 -0
  8. graphmemory-1.2.0/input/Genetic Programming1.txt +32173 -0
  9. graphmemory-1.2.0/input/Genetic Programming2.txt +34325 -0
  10. graphmemory-1.2.0/input/Genetic Programming3.txt +45720 -0
  11. graphmemory-1.2.0/input/Genetic Programming4.txt +21664 -0
  12. graphmemory-1.2.0/input/aimav4.txt +34094 -0
  13. graphmemory-1.2.0/input/reading_in_plannings.txt +67565 -0
  14. {graphmemory-1.1.2 → graphmemory-1.2.0}/pyproject.toml +1 -1
  15. {graphmemory-1.1.2 → graphmemory-1.2.0}/tests/tests.py +163 -1
  16. graphmemory-1.2.0/video/public/banner.png +0 -0
  17. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/GraphMemoryShowcase.tsx +2 -0
  18. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/Root.tsx +1 -1
  19. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/components/CodeBlock.tsx +4 -4
  20. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/components/FeaturePill.tsx +2 -2
  21. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/components/GraphViz.tsx +18 -18
  22. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/components/SectionTitle.tsx +4 -4
  23. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/AlgorithmsScene.tsx +3 -3
  24. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/ExportScene.tsx +2 -2
  25. graphmemory-1.2.0/video/src/scenes/IntroScene.tsx +97 -0
  26. graphmemory-1.2.0/video/src/scenes/MergeScene.tsx +234 -0
  27. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/OutroScene.tsx +7 -7
  28. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/RetrievalScene.tsx +2 -2
  29. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/VectorSearchScene.tsx +3 -3
  30. graphmemory-1.1.2/graphmemory/__init__.py +0 -4
  31. graphmemory-1.1.2/video/src/scenes/IntroScene.tsx +0 -135
  32. {graphmemory-1.1.2 → graphmemory-1.2.0}/.gitignore +0 -0
  33. {graphmemory-1.1.2 → graphmemory-1.2.0}/LICENSE +0 -0
  34. {graphmemory-1.1.2 → graphmemory-1.2.0}/examples/dspy_example_typed_pred.py +0 -0
  35. {graphmemory-1.1.2 → graphmemory-1.2.0}/examples/lexical_graph.py +0 -0
  36. {graphmemory-1.1.2 → graphmemory-1.2.0}/examples/openai_example.py +0 -0
  37. {graphmemory-1.1.2 → graphmemory-1.2.0}/graphmemory/algorithms.py +0 -0
  38. {graphmemory-1.1.2 → graphmemory-1.2.0}/requirements.txt +0 -0
  39. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/package-lock.json +0 -0
  40. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/package.json +0 -0
  41. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/components/Background.tsx +0 -0
  42. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/index.ts +0 -0
  43. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/ExtractionScene.tsx +0 -0
  44. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/NodeEdgeScene.tsx +0 -0
  45. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/scenes/QueryBuilderScene.tsx +0 -0
  46. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/src/theme.ts +0 -0
  47. {graphmemory-1.1.2 → graphmemory-1.2.0}/video/tsconfig.json +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: graphmemory
3
- Version: 1.1.2
3
+ Version: 1.2.0
4
4
  Summary: Graph-based memory system using DuckDB
5
5
  Project-URL: Homepage, https://github.com/bradAGI/GraphMemory
6
6
  Project-URL: Repository, https://github.com/bradAGI/GraphMemory
@@ -27,7 +27,9 @@ Description-Content-Type: text/markdown
27
27
 
28
28
  [![](https://dcbadge.limes.pink/api/server/https://discord.gg/DSS3DmStV8)](https://discord.gg/DSS3DmStV8)
29
29
 
30
- # GraphMemory
30
+ # GraphMemory - GraphRAG Database
31
+
32
+ ![GraphMemory](https://github.com/bradAGI/GraphMemory/assets/46579244/9897dc2a-46c9-42e0-a8d3-2dcb1d93e6ae)
31
33
 
32
34
  An embedded graph database for RAG and knowledge graph applications, powered by [DuckDB](https://duckdb.org/). Vector similarity search, full-text search, hybrid search, merge/upsert, graph traversal, and a full GraphRAG retrieval pipeline — all in a single Python package.
33
35
 
@@ -1,6 +1,8 @@
1
1
  [![](https://dcbadge.limes.pink/api/server/https://discord.gg/DSS3DmStV8)](https://discord.gg/DSS3DmStV8)
2
2
 
3
- # GraphMemory
3
+ # GraphMemory - GraphRAG Database
4
+
5
+ ![GraphMemory](https://github.com/bradAGI/GraphMemory/assets/46579244/9897dc2a-46c9-42e0-a8d3-2dcb1d93e6ae)
4
6
 
5
7
  An embedded graph database for RAG and knowledge graph applications, powered by [DuckDB](https://duckdb.org/). Vector similarity search, full-text search, hybrid search, merge/upsert, graph traversal, and a full GraphRAG retrieval pipeline — all in a single Python package.
6
8
 
@@ -0,0 +1,147 @@
1
+ """End-to-end test: ingest aimav4.txt using real LLM extraction via DSPy."""
2
+
3
+ import sys
4
+ import os
5
+ import re
6
+ import logging
7
+
8
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
9
+
10
+ import dspy
11
+ from graphmemory import GraphMemory, MergeStrategy
12
+ from graphmemory.extraction import extract_and_merge
13
+
14
+ logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # --- Configure DSPy with gpt-5-nano ---
18
+ lm = dspy.LM("openai/gpt-5-nano")
19
+ dspy.configure(lm=lm)
20
+
21
+
22
def chunk_text(text: str, max_chars: int = 3000) -> list[str]:
    """Split *text* into paragraph-aware chunks of at most *max_chars* characters.

    Paragraphs are the pieces of *text* separated by a blank line. Each
    paragraph is stripped, empty paragraphs are dropped, and consecutive
    paragraphs are packed into a chunk (re-joined with a blank line) as long
    as the joined chunk stays within *max_chars*.

    A paragraph is never split: a single paragraph longer than *max_chars*
    is emitted as its own oversized chunk.

    Args:
        text: The raw text to split.
        max_chars: Upper bound on the length of each joined chunk.

    Returns:
        The list of chunks in document order; empty when *text* contains no
        non-blank paragraph.
    """
    separator = "\n\n"
    paragraphs = [p.strip() for p in text.split(separator) if p.strip()]
    chunks: list[str] = []
    current: list[str] = []
    current_len = 0
    for paragraph in paragraphs:
        # Length the joined chunk would have with this paragraph appended.
        # The separator must be counted, otherwise the joined chunk can end
        # up longer than max_chars.
        projected = current_len + len(paragraph)
        if current:
            projected += len(separator)
        if current and projected > max_chars:
            chunks.append(separator.join(current))
            current = []
            projected = len(paragraph)
        current.append(paragraph)
        current_len = projected
    if current:
        chunks.append(separator.join(current))
    return chunks
38
+
39
+
40
def main():
    """Ingest the start of ``input/aimav4.txt`` into an in-memory graph and print a report.

    Steps:
      1. Read the first 100,000 characters of the input file, strip HTML
         comments and split the text into chunks of up to 4000 characters.
      2. Run ``extract_and_merge`` once per chunk (fuzzy name matching at
         0.88); a failing chunk is logged and skipped so one bad LLM
         response does not abort the whole run.
      3. Run ``resolve_duplicates`` over the finished graph (threshold 0.90).
      4. Print the resulting nodes, edges, a sample full-text search and a
         summary of the counters collected along the way.
    """
    input_path = os.path.join(os.path.dirname(__file__), "..", "input", "aimav4.txt")
    # Explicit encoding: the default text encoding is platform-dependent.
    # NOTE(review): assumes the input file is UTF-8 — confirm.
    with open(input_path, encoding="utf-8") as f:
        text = f.read(100_000)

    # DOTALL so that comments spanning several lines are removed as well.
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    chunks = chunk_text(text, max_chars=4000)

    print("=" * 60)
    print("GraphMemory — Real LLM Extraction Test")
    print("=" * 60)
    print(f"Source: aimav4.txt ({len(text)} chars)")
    print(f"Chunks: {len(chunks)}")
    print("LLM: gpt-5-nano via DSPy")

    db = GraphMemory(database=":memory:", vector_length=3)

    print("\n--- Extracting entities & relationships ---")
    total_nodes = 0
    total_edges = 0
    total_merged_nodes = 0
    total_merged_edges = 0

    for i, chunk in enumerate(chunks):
        print(f"\n Chunk {i + 1}/{len(chunks)} ({len(chunk)} chars)...")
        try:
            # Pass each chunk as a single "sentence" to avoid per-sentence LLM calls
            node_results, edge_results = extract_and_merge(
                db,
                chunk,
                match_keys=["name"],
                match_type=True,
                similarity_threshold=0.88,
                sentences=[chunk],  # single LLM call per chunk
            )
            created_n = sum(1 for r in node_results if r.created)
            merged_n = sum(1 for r in node_results if not r.created)
            created_e = sum(1 for r in edge_results if r.created)
            merged_e = sum(1 for r in edge_results if not r.created)

            total_nodes += created_n
            total_merged_nodes += merged_n
            total_edges += created_e
            total_merged_edges += merged_e

            print(f" Nodes: {created_n} new, {merged_n} merged")
            print(f" Edges: {created_e} new, {merged_e} merged")
        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the next chunk.
            logger.warning(" Chunk %d failed: %s", i + 1, e)

    # --- Post-extraction dedupe ---
    print("\n--- Post-extraction duplicate resolution ---")
    clusters = db.resolve_duplicates(
        match_keys=["name"],
        match_type=True,
        similarity_threshold=0.90,
    )
    if clusters:
        for c in clusters:
            merged_names = [m.properties.get("name", "?") for m in c.merged]
            print(f" Merged: '{c.survivor.properties.get('name')}' <- {merged_names}")
    else:
        print(" No additional duplicates found.")

    # --- Results ---
    all_nodes = db.nodes_to_json()
    all_edges = db.edges_to_json()

    print("\n--- Final Graph ---")
    print(f" Nodes: {len(all_nodes)}")
    print(f" Edges: {len(all_edges)}")

    type_counts = {}
    for n in all_nodes:
        t = n.get("type", "Unknown")
        type_counts[t] = type_counts.get(t, 0) + 1
    print(f" Types: {type_counts}")

    print("\n--- Extracted Entities ---")

    def entity_sort_key(entry):
        # A node type may be None and a name may be missing or non-string;
        # normalise both to str so sorting never compares mixed types.
        return (
            entry.get("type") or "",
            str(entry.get("properties", {}).get("name", "")),
        )

    for n in sorted(all_nodes, key=entity_sort_key):
        props = n.get("properties", {})
        # `or '?'` also covers an explicit None type, which cannot take a width spec.
        print(f" [{(n.get('type') or '?'):15}] {props.get('name', props)}")

    print("\n--- Extracted Relationships ---")
    node_id_map = {n["id"]: n for n in all_nodes}
    for e in all_edges:
        # Fall back to the raw id when the endpoint is unknown or has no name.
        src = node_id_map.get(e["source_id"], {}).get("properties", {}).get("name", e["source_id"])
        tgt = node_id_map.get(e["target_id"], {}).get("properties", {}).get("name", e["target_id"])
        print(f" {src} --[{e['relation']}]--> {tgt}")

    print("\n--- Full-text search: 'deep learning' ---")
    results = db.search_nodes("deep learning", limit=5)
    for sr in results:
        print(f" [{sr.node.type}] {sr.node.properties.get('name', '?')} (score={sr.score:.3f})")

    print("\n--- Summary ---")
    print(f" Extracted: {total_nodes} nodes, {total_edges} edges")
    print(f" Fuzzy-merged during ingest: {total_merged_nodes} nodes, {total_merged_edges} edges")
    print(f" Post-dedupe clusters: {len(clusters)}")
    print(f" Final graph: {len(all_nodes)} nodes, {len(all_edges)} edges")

    print("\n" + "=" * 60)
    print("Done!")
    print("=" * 60)
144
+
145
+
146
+ if __name__ == "__main__":
147
+ main()
@@ -0,0 +1,4 @@
1
+ from .database import GraphMemory, QueryBuilder
2
+ from .models import DuplicateCluster, Edge, EdgeMergeResult, MergeResult, MergeStrategy, NearestNode, Node, RetrievalContext, RetrievalResult, SearchResult, TraversalResult
3
+
4
+ __all__ = ["DuplicateCluster", "Edge", "EdgeMergeResult", "GraphMemory", "MergeResult", "MergeStrategy", "NearestNode", "Node", "QueryBuilder", "RetrievalContext", "RetrievalResult", "SearchResult", "TraversalResult", "algorithms", "extraction"]
@@ -13,7 +13,7 @@ import xml.etree.ElementTree as ET
13
13
  from contextlib import contextmanager
14
14
  from typing import Any, Dict, List, Union, List
15
15
 
16
- from graphmemory.models import Edge, EdgeMergeResult, MergeResult, MergeStrategy, NearestNode, Node, RetrievalContext, RetrievalResult, SearchResult, TraversalResult
16
+ from graphmemory.models import DuplicateCluster, Edge, EdgeMergeResult, MergeResult, MergeStrategy, NearestNode, Node, RetrievalContext, RetrievalResult, SearchResult, TraversalResult
17
17
 
18
18
  logger = logging.getLogger(__name__)
19
19
 
@@ -338,25 +338,71 @@ class GraphMemory:
338
338
  except duckdb.Error as e:
339
339
  logger.error(f"Error during bulk delete edges: {e}")
340
340
 
341
- def _find_matching_node(self, cur, node: Node, match_keys: list[str], match_type: bool) -> Node | None:
342
- """Find an existing node matching the given property keys and optional type."""
341
+ def _find_matching_node(
342
+ self, cur, node: Node, match_keys: list[str], match_type: bool,
343
+ similarity_threshold: float = 1.0,
344
+ vector_threshold: float | None = None,
345
+ ) -> Node | None:
346
+ """Find an existing node matching the given property keys and optional type.
347
+
348
+ When ``similarity_threshold`` is 1.0 (default), matching is exact.
349
+ Lower values enable fuzzy matching via DuckDB's ``jaro_winkler_similarity``.
350
+ When ``vector_threshold`` is set and the node has a vector, candidates must
351
+ also have a cosine distance within that threshold.
352
+ """
353
+ fuzzy = similarity_threshold < 1.0
354
+
355
+ # Separate param lists for SELECT expressions vs WHERE clauses,
356
+ # since DuckDB binds positional params in statement order.
357
+ select_extra: list[str] = []
358
+ select_params: list = []
343
359
  where_parts: list[str] = []
344
- params: list = []
360
+ where_params: list = []
361
+
345
362
  if match_type and node.type is not None:
346
363
  where_parts.append("type = ?")
347
- params.append(node.type)
364
+ where_params.append(node.type)
365
+
348
366
  for key in match_keys:
349
367
  if not self._VALID_ATTRIBUTE_RE.match(key):
350
368
  raise ValueError(f"Invalid match key: {key!r}")
351
369
  value = (node.properties or {}).get(key)
352
370
  if value is None:
353
371
  where_parts.append(f"json_extract(properties, '$.{key}') IS NULL")
372
+ elif fuzzy and isinstance(value, str):
373
+ alias = f"sim_{key}"
374
+ select_extra.append(
375
+ f"jaro_winkler_similarity(json_extract_string(properties, '$.{key}'), ?) AS {alias}"
376
+ )
377
+ select_params.append(value)
378
+ where_parts.append(f"{alias} >= ?")
379
+ where_params.append(similarity_threshold)
354
380
  else:
355
381
  where_parts.append(f"json_extract(properties, '$.{key}') = ?")
356
- params.append(json.dumps(value))
357
- if not where_parts:
382
+ where_params.append(json.dumps(value))
383
+
384
+ if vector_threshold is not None and node.vector:
385
+ where_parts.append(f"array_cosine_distance(vector, CAST(? AS FLOAT[{self.vector_length}])) <= ?")
386
+ where_params.extend([node.vector, vector_threshold])
387
+
388
+ if not where_parts and not select_extra:
358
389
  return None
359
- query = "SELECT id, type, properties, vector FROM nodes WHERE " + " AND ".join(where_parts) + " LIMIT 1;"
390
+
391
+ select_cols = "id, type, properties, vector"
392
+ if select_extra:
393
+ select_cols += ", " + ", ".join(select_extra)
394
+
395
+ where_clause = " AND ".join(where_parts) if where_parts else "TRUE"
396
+
397
+ order_clause = ""
398
+ if fuzzy:
399
+ sim_cols = [f"sim_{k}" for k in match_keys
400
+ if isinstance((node.properties or {}).get(k), str)]
401
+ if sim_cols:
402
+ order_clause = " ORDER BY " + " + ".join(sim_cols) + " DESC"
403
+
404
+ query = f"SELECT {select_cols} FROM nodes WHERE {where_clause}{order_clause} LIMIT 1;"
405
+ params = select_params + where_params
360
406
  row = cur.execute(query, params).fetchone()
361
407
  if row:
362
408
  return Node(id=row[0], type=row[1], properties=json.loads(row[2]), vector=row[3])
@@ -373,11 +419,53 @@ class GraphMemory:
373
419
  return existing or {}
374
420
  return incoming or {}
375
421
 
422
def _safe_update_node(self, cur, node_id: str, node_type, properties: dict, vector) -> None:
    """Write new type/properties/vector to a node, tolerating FK errors from its edges.

    The plain ``UPDATE`` is tried first. If DuckDB raises a
    ``ConstraintException`` (according to the original author's note, DuckDB
    performs an UPDATE as delete+reinsert, which trips the foreign keys held
    by edges that reference the node), the edges touching the node are read,
    deleted, the update is repeated, and the edges are inserted again with
    their original ids and values.

    Args:
        cur: Cursor used for every statement.
        node_id: Id of the node to update.
        node_type: New value for the ``type`` column.
        properties: New properties; stored as a JSON string.
        vector: New value for the ``vector`` column.
    """
    update_sql = "UPDATE nodes SET type = ?, properties = ?, vector = ? WHERE id = ?;"

    def apply_update() -> None:
        cur.execute(update_sql, (node_type, json.dumps(properties), vector, node_id))

    try:
        apply_update()
    except duckdb.ConstraintException:
        # Remember every edge attached to the node before removing it.
        stashed = cur.execute(
            "SELECT id, source_id, target_id, relation, weight FROM edges "
            "WHERE source_id = ? OR target_id = ?;",
            (node_id, node_id)
        ).fetchall()
        for edge_row in stashed:
            cur.execute("DELETE FROM edges WHERE id = ?;", (edge_row[0],))

        # With no edge referencing the node any more, retry the update.
        apply_update()

        # Put the edges back exactly as they were.
        for edge_id, source, target, relation, weight in stashed:
            cur.execute(
                "INSERT INTO edges (id, source_id, target_id, relation, weight) "
                "VALUES (?, ?, ?, ?, ?);",
                (edge_id, source, target, relation, weight)
            )
453
+
454
+ @staticmethod
455
+ def normalize_relation(relation: str) -> str:
456
+ """Lowercase, strip, and collapse whitespace/separators to underscores."""
457
+ s = relation.strip().lower()
458
+ s = re.sub(r'[\s\-\.]+', '_', s)
459
+ s = re.sub(r'_+', '_', s)
460
+ return s.strip('_')
461
+
376
462
  @with_retry()
377
463
  def merge_node(self, node: Node, match_keys: list[str],
378
464
  match_type: bool = True,
379
465
  strategy: MergeStrategy = MergeStrategy.UPDATE,
380
- update_vector: bool = True) -> MergeResult:
466
+ update_vector: bool = True,
467
+ similarity_threshold: float = 1.0,
468
+ vector_threshold: float | None = None) -> MergeResult:
381
469
  """Insert a node or update it if a match is found by property keys.
382
470
 
383
471
  Args:
@@ -401,15 +489,16 @@ class GraphMemory:
401
489
  try:
402
490
  with self.transaction():
403
491
  cur = self.cursor()
404
- existing = self._find_matching_node(cur, node, match_keys, match_type)
492
+ existing = self._find_matching_node(
493
+ cur, node, match_keys, match_type,
494
+ similarity_threshold=similarity_threshold,
495
+ vector_threshold=vector_threshold,
496
+ )
405
497
  if existing:
406
498
  merged_props = self._merge_properties(existing.properties, node.properties, strategy)
407
499
  vector = node.vector if update_vector and node.vector else existing.vector
408
500
  node_type = node.type if node.type is not None else existing.type
409
- cur.execute(
410
- "UPDATE nodes SET type = ?, properties = ?, vector = ? WHERE id = ?;",
411
- (node_type, json.dumps(merged_props), vector, str(existing.id))
412
- )
501
+ self._safe_update_node(cur, str(existing.id), node_type, merged_props, vector)
413
502
  self._fts_dirty = True
414
503
  result_node = Node(id=existing.id, type=node_type, properties=merged_props, vector=vector)
415
504
  return MergeResult(node=result_node, created=False)
@@ -429,7 +518,9 @@ class GraphMemory:
429
518
  def bulk_merge_nodes(self, nodes: list[Node], match_keys: list[str],
430
519
  match_type: bool = True,
431
520
  strategy: MergeStrategy = MergeStrategy.UPDATE,
432
- update_vector: bool = True) -> list[MergeResult]:
521
+ update_vector: bool = True,
522
+ similarity_threshold: float = 1.0,
523
+ vector_threshold: float | None = None) -> list[MergeResult]:
433
524
  """Merge multiple nodes, inserting new ones and updating matches.
434
525
 
435
526
  Runs in a single transaction for atomicity.
@@ -448,15 +539,16 @@ class GraphMemory:
448
539
  if node.vector and not self._validate_vector(node.vector):
449
540
  logger.error(f"Invalid vector for node, skipping: {node.id}")
450
541
  continue
451
- existing = self._find_matching_node(cur, node, match_keys, match_type)
542
+ existing = self._find_matching_node(
543
+ cur, node, match_keys, match_type,
544
+ similarity_threshold=similarity_threshold,
545
+ vector_threshold=vector_threshold,
546
+ )
452
547
  if existing:
453
548
  merged_props = self._merge_properties(existing.properties, node.properties, strategy)
454
549
  vector = node.vector if update_vector and node.vector else existing.vector
455
550
  node_type = node.type if node.type is not None else existing.type
456
- cur.execute(
457
- "UPDATE nodes SET type = ?, properties = ?, vector = ? WHERE id = ?;",
458
- (node_type, json.dumps(merged_props), vector, str(existing.id))
459
- )
551
+ self._safe_update_node(cur, str(existing.id), node_type, merged_props, vector)
460
552
  result_node = Node(id=existing.id, type=node_type, properties=merged_props, vector=vector)
461
553
  results.append(MergeResult(node=result_node, created=False))
462
554
  else:
@@ -474,10 +566,14 @@ class GraphMemory:
474
566
  raise
475
567
 
476
568
  def _find_matching_edge(self, cur, edge: Edge) -> Edge | None:
477
- """Find an existing edge matching (source_id, target_id, relation)."""
569
+ """Find an existing edge matching (source_id, target_id, relation).
570
+
571
+ Relations are compared in normalized form (lowercase, underscored).
572
+ """
573
+ normalized = self.normalize_relation(edge.relation)
478
574
  row = cur.execute(
479
575
  "SELECT id, source_id, target_id, relation, weight FROM edges WHERE source_id = ? AND target_id = ? AND relation = ? LIMIT 1;",
480
- (str(edge.source_id), str(edge.target_id), edge.relation)
576
+ (str(edge.source_id), str(edge.target_id), normalized)
481
577
  ).fetchone()
482
578
  if row:
483
579
  return Edge(id=row[0], source_id=row[1], target_id=row[2], relation=row[3], weight=row[4])
@@ -511,11 +607,13 @@ class GraphMemory:
511
607
  result_edge = existing
512
608
  return EdgeMergeResult(edge=result_edge, created=False)
513
609
  else:
610
+ normalized = self.normalize_relation(edge.relation)
514
611
  cur.execute(
515
612
  "INSERT INTO edges (id, source_id, target_id, relation, weight) VALUES (?, ?, ?, ?, ?);",
516
- (str(edge.id), str(edge.source_id), str(edge.target_id), edge.relation, edge.weight)
613
+ (str(edge.id), str(edge.source_id), str(edge.target_id), normalized, edge.weight)
517
614
  )
518
- return EdgeMergeResult(edge=edge, created=True)
615
+ result_edge = Edge(id=edge.id, source_id=edge.source_id, target_id=edge.target_id, relation=normalized, weight=edge.weight)
616
+ return EdgeMergeResult(edge=result_edge, created=True)
519
617
  except duckdb.Error as e:
520
618
  logger.error(f"Error during merge edge: {e}")
521
619
  raise
@@ -545,16 +643,211 @@ class GraphMemory:
545
643
  result_edge = existing
546
644
  results.append(EdgeMergeResult(edge=result_edge, created=False))
547
645
  else:
646
+ normalized = self.normalize_relation(edge.relation)
548
647
  cur.execute(
549
648
  "INSERT INTO edges (id, source_id, target_id, relation, weight) VALUES (?, ?, ?, ?, ?);",
550
- (str(edge.id), str(edge.source_id), str(edge.target_id), edge.relation, edge.weight)
649
+ (str(edge.id), str(edge.source_id), str(edge.target_id), normalized, edge.weight)
551
650
  )
552
- results.append(EdgeMergeResult(edge=edge, created=True))
651
+ result_edge = Edge(id=edge.id, source_id=edge.source_id, target_id=edge.target_id, relation=normalized, weight=edge.weight)
652
+ results.append(EdgeMergeResult(edge=result_edge, created=True))
553
653
  return results
554
654
  except duckdb.Error as e:
555
655
  logger.error(f"Error during bulk merge edges: {e}")
556
656
  raise
557
657
 
658
@with_retry()
def resolve_duplicates(
    self,
    match_keys: list[str] | None = None,
    match_type: bool = True,
    similarity_threshold: float = 0.9,
    vector_threshold: float | None = None,
    strategy: MergeStrategy = MergeStrategy.UPDATE,
) -> list[DuplicateCluster]:
    """Scan all nodes and merge clusters of likely duplicates.

    Nodes are visited in ``id`` order. For each node not yet handled, the
    remaining nodes are searched for fuzzy matches. The visited node becomes
    the "survivor": properties of the matches are merged into it, edges of
    the matches are re-pointed at it (edges that would become self-loops are
    dropped), and the matched nodes are deleted.

    A node is only used as a search seed when at least one of its
    ``match_keys`` values is a string. Non-string, non-``None`` values must
    match exactly (compared as JSON, like ``_find_matching_node`` does);
    keys whose value is ``None`` on the seed node are not constrained.

    Args:
        match_keys: Property names to compare (default ``["name"]``).
        match_type: Also require ``node.type`` to match (default ``True``).
        similarity_threshold: Jaro-Winkler threshold for string properties.
        vector_threshold: Max cosine distance for vector similarity (optional).
        strategy: How to merge properties from duplicates into the survivor.

    Returns:
        List of :class:`~graphmemory.models.DuplicateCluster` results, one
        per survivor that absorbed at least one duplicate.

    Raises:
        ValueError: If a match key is not a valid attribute name.
        duckdb.Error: Re-raised after logging when a statement fails.
    """
    if match_keys is None:
        match_keys = ["name"]
    for key in match_keys:
        if not self._VALID_ATTRIBUTE_RE.match(key):
            raise ValueError(f"Invalid match key: {key!r}")

    edges_of_node_sql = (
        "SELECT id, source_id, target_id, relation, weight FROM edges "
        "WHERE source_id = ? OR target_id = ?;"
    )

    clusters: list[DuplicateCluster] = []
    try:
        # NOTE(review): unlike merge_node this is not wrapped in
        # self.transaction(); presumably because DuckDB checks foreign keys
        # eagerly inside a transaction, which would reject the
        # delete-edges-then-delete-node sequence below — confirm before
        # adding one.
        cur = self.cursor()
        all_rows = cur.execute(
            "SELECT id, type, properties, vector FROM nodes ORDER BY id;"
        ).fetchall()
        all_nodes = [
            Node(id=r[0], type=r[1], properties=json.loads(r[2]), vector=r[3])
            for r in all_rows
        ]

        seen: set[str] = set()
        for node in all_nodes:
            nid = str(node.id)
            if nid in seen:
                continue
            # Ids handled before this node (earlier survivors and merged
            # duplicates); snapshot once so placeholders and parameters
            # are guaranteed to line up.
            already_processed = list(seen)
            seen.add(nid)

            # Separate parameter lists: SELECT expressions are bound
            # before WHERE clauses, in statement order.
            select_extra: list[str] = []
            select_params: list = []
            where_parts: list[str] = ["id != ?"]
            where_params: list = [nid]

            if match_type and node.type is not None:
                where_parts.append("type = ?")
                where_params.append(node.type)

            for key in match_keys:
                value = (node.properties or {}).get(key)
                if value is None:
                    continue
                if isinstance(value, str):
                    alias = f"sim_{key}"
                    select_extra.append(
                        f"jaro_winkler_similarity(json_extract_string(properties, '$.{key}'), ?) AS {alias}"
                    )
                    select_params.append(value)
                    where_parts.append(f"{alias} >= ?")
                    where_params.append(similarity_threshold)
                else:
                    # Non-string values cannot be fuzzy-compared; require
                    # exact equality instead of silently ignoring the key.
                    where_parts.append(f"json_extract(properties, '$.{key}') = ?")
                    where_params.append(json.dumps(value))

            if vector_threshold is not None and node.vector:
                where_parts.append(
                    f"array_cosine_distance(vector, CAST(? AS FLOAT[{self.vector_length}])) <= ?"
                )
                where_params.extend([node.vector, vector_threshold])

            # Without at least one string key there is nothing to
            # fuzzy-match on, so this node cannot seed a cluster.
            if not select_extra:
                continue

            if already_processed:
                placeholders = ", ".join("?" for _unused in already_processed)
                where_parts.append(f"id NOT IN ({placeholders})")
                where_params.extend(already_processed)

            select_cols = "id, type, properties, vector, " + ", ".join(select_extra)
            query = f"SELECT {select_cols} FROM nodes WHERE {' AND '.join(where_parts)};"
            dup_rows = cur.execute(query, select_params + where_params).fetchall()

            if not dup_rows:
                continue

            duplicates: list[Node] = []
            survivor_props = dict(node.properties or {})
            survivor_vector = node.vector
            survivor_type = node.type
            edges_to_rewrite: list[tuple] = []

            for row in dup_rows:
                dup = Node(id=row[0], type=row[1], properties=json.loads(row[2]), vector=row[3])
                dup_id = str(dup.id)
                seen.add(dup_id)
                duplicates.append(dup)

                survivor_props = self._merge_properties(survivor_props, dup.properties, strategy)
                # The survivor only inherits vector/type when it has none.
                if not survivor_vector and dup.vector:
                    survivor_vector = dup.vector
                if not survivor_type and dup.type:
                    survivor_type = dup.type

                dup_edges = cur.execute(edges_of_node_sql, (dup_id, dup_id)).fetchall()
                for eid, src, tgt, rel, wt in dup_edges:
                    # Compare as strings so the redirect also works when
                    # the driver returns ids as non-str objects.
                    new_src = nid if str(src) == dup_id else src
                    new_tgt = nid if str(tgt) == dup_id else tgt
                    edges_to_rewrite.append((eid, new_src, new_tgt, rel, wt))

            # Delete edges referencing duplicates
            for dup in duplicates:
                cur.execute(
                    "DELETE FROM edges WHERE source_id = ? OR target_id = ?;",
                    (str(dup.id), str(dup.id))
                )
            # Also temporarily remove edges referencing survivor (DuckDB
            # internally does delete+reinsert on UPDATE, triggering FK checks)
            survivor_edges = cur.execute(edges_of_node_sql, (nid, nid)).fetchall()
            for eid, *_rest in survivor_edges:
                cur.execute("DELETE FROM edges WHERE id = ?;", (eid,))

            # Delete duplicate nodes
            for dup in duplicates:
                cur.execute("DELETE FROM nodes WHERE id = ?;", (str(dup.id),))

            # Update survivor with merged properties (safe now, no FK refs)
            cur.execute(
                "UPDATE nodes SET type = ?, properties = ?, vector = ? WHERE id = ?;",
                (survivor_type, json.dumps(survivor_props), survivor_vector, nid)
            )

            # Re-insert all edges, verifying both endpoints still exist
            rewritten_eids = {e[0] for e in edges_to_rewrite}
            all_edges_to_insert = [
                edge for edge in edges_to_rewrite
                if str(edge[1]) != str(edge[2])  # skip self-loops
            ]
            all_edges_to_insert.extend(
                edge for edge in survivor_edges if edge[0] not in rewritten_eids
            )

            for eid, src, tgt, rel, wt in all_edges_to_insert:
                src_exists = cur.execute(
                    "SELECT 1 FROM nodes WHERE id = ?", (str(src),)
                ).fetchone()
                tgt_exists = cur.execute(
                    "SELECT 1 FROM nodes WHERE id = ?", (str(tgt),)
                ).fetchone()
                # An edge whose other endpoint was deleted in this cluster
                # (e.g. an edge between two duplicates) is dropped.
                if src_exists and tgt_exists:
                    cur.execute(
                        "INSERT INTO edges (id, source_id, target_id, relation, weight) "
                        "VALUES (?, ?, ?, ?, ?);",
                        (eid, src, tgt, rel, wt)
                    )

            survivor = Node(id=node.id, type=survivor_type, properties=survivor_props, vector=survivor_vector)
            clusters.append(DuplicateCluster(survivor=survivor, merged=duplicates))

        if clusters:
            self._fts_dirty = True

        logger.info(
            "Resolved %d duplicate clusters (%d nodes merged).",
            len(clusters),
            sum(len(c.merged) for c in clusters),
        )
        return clusters
    except duckdb.Error as e:
        logger.error("Error during resolve_duplicates: %s", e)
        raise
850
+
558
851
  @with_retry()
559
852
  def delete_edge(self, source_id: uuid.UUID, target_id: uuid.UUID):
560
853
  try:
@@ -115,8 +115,13 @@ def _get_signatures():
115
115
  # ---------------------------------------------------------------------------
116
116
 
117
117
 
118
def _make_predictor(dspy, signature):
    """Build a ``dspy.Predict`` predictor for *signature*.

    Args:
        dspy: The imported ``dspy`` module (or any object exposing ``Predict``).
        signature: The signature handed to ``Predict``.

    Returns:
        Whatever ``dspy.Predict(signature)`` returns.
    """
    predictor_factory = dspy.Predict
    return predictor_factory(signature)
121
+
122
+
118
123
  def extract_nodes(text: str, sentences: list[str] | None = None) -> list[Node]:
119
- """Extract entity nodes from text using a DSPy typed predictor.
124
+ """Extract entity nodes from text using a DSPy predictor.
120
125
 
121
126
  Args:
122
127
  text: Full text to extract from (used when *sentences* is ``None``).
@@ -129,7 +134,7 @@ def extract_nodes(text: str, sentences: list[str] | None = None) -> list[Node]:
129
134
  """
130
135
  dspy = _require_dspy()
131
136
  NodeSig, _ = _get_signatures()
132
- predictor = dspy.TypedPredictor(NodeSig)
137
+ predictor = _make_predictor(dspy, NodeSig)
133
138
 
134
139
  if sentences is None:
135
140
  sentences = [s.strip() for s in text.split(".") if s.strip()]
@@ -166,7 +171,7 @@ def extract_edges(
166
171
  """
167
172
  dspy = _require_dspy()
168
173
  _, EdgeSig = _get_signatures()
169
- predictor = dspy.TypedPredictor(EdgeSig)
174
+ predictor = _make_predictor(dspy, EdgeSig)
170
175
 
171
176
  if sentences is None:
172
177
  sentences = [s.strip() for s in text.split(".") if s.strip()]
@@ -253,6 +258,8 @@ def extract_and_merge(
253
258
  match_type: bool = True,
254
259
  strategy: MergeStrategy = MergeStrategy.UPDATE,
255
260
  sentences: list[str] | None = None,
261
+ similarity_threshold: float = 1.0,
262
+ vector_threshold: float | None = None,
256
263
  ) -> tuple[list[MergeResult], list[EdgeMergeResult]]:
257
264
  """Extract nodes and edges from text, merging with existing graph data.
258
265
 
@@ -278,6 +285,7 @@ def extract_and_merge(
278
285
 
279
286
  node_results = graph.bulk_merge_nodes(
280
287
  nodes, match_keys=match_keys, match_type=match_type, strategy=strategy,
288
+ similarity_threshold=similarity_threshold, vector_threshold=vector_threshold,
281
289
  ) if nodes else []
282
290
 
283
291
  edge_results = graph.bulk_merge_edges(edges) if edges else []