graphmemory 1.2.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. {graphmemory-1.2.0 → graphmemory-1.3.0}/PKG-INFO +5 -4
  2. {graphmemory-1.2.0 → graphmemory-1.3.0}/README.md +4 -3
  3. graphmemory-1.3.0/examples/test_ingest.py +152 -0
  4. {graphmemory-1.2.0 → graphmemory-1.3.0}/graphmemory/database.py +59 -6
  5. {graphmemory-1.2.0 → graphmemory-1.3.0}/graphmemory/extraction.py +123 -1
  6. {graphmemory-1.2.0 → graphmemory-1.3.0}/pyproject.toml +1 -1
  7. {graphmemory-1.2.0 → graphmemory-1.3.0}/tests/tests.py +142 -0
  8. graphmemory-1.2.0/examples/test_ingest.py +0 -147
  9. {graphmemory-1.2.0 → graphmemory-1.3.0}/.gitignore +0 -0
  10. {graphmemory-1.2.0 → graphmemory-1.3.0}/LICENSE +0 -0
  11. {graphmemory-1.2.0 → graphmemory-1.3.0}/examples/dspy_example_typed_pred.py +0 -0
  12. {graphmemory-1.2.0 → graphmemory-1.3.0}/examples/lexical_graph.py +0 -0
  13. {graphmemory-1.2.0 → graphmemory-1.3.0}/examples/openai_example.py +0 -0
  14. {graphmemory-1.2.0 → graphmemory-1.3.0}/graphmemory/__init__.py +0 -0
  15. {graphmemory-1.2.0 → graphmemory-1.3.0}/graphmemory/algorithms.py +0 -0
  16. {graphmemory-1.2.0 → graphmemory-1.3.0}/graphmemory/models.py +0 -0
  17. {graphmemory-1.2.0 → graphmemory-1.3.0}/input/Genetic Programming1.txt +0 -0
  18. {graphmemory-1.2.0 → graphmemory-1.3.0}/input/Genetic Programming2.txt +0 -0
  19. {graphmemory-1.2.0 → graphmemory-1.3.0}/input/Genetic Programming3.txt +0 -0
  20. {graphmemory-1.2.0 → graphmemory-1.3.0}/input/Genetic Programming4.txt +0 -0
  21. {graphmemory-1.2.0 → graphmemory-1.3.0}/input/aimav4.txt +0 -0
  22. {graphmemory-1.2.0 → graphmemory-1.3.0}/input/reading_in_plannings.txt +0 -0
  23. {graphmemory-1.2.0 → graphmemory-1.3.0}/requirements.txt +0 -0
  24. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/package-lock.json +0 -0
  25. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/package.json +0 -0
  26. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/public/banner.png +0 -0
  27. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/GraphMemoryShowcase.tsx +0 -0
  28. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/Root.tsx +0 -0
  29. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/components/Background.tsx +0 -0
  30. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/components/CodeBlock.tsx +0 -0
  31. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/components/FeaturePill.tsx +0 -0
  32. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/components/GraphViz.tsx +0 -0
  33. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/components/SectionTitle.tsx +0 -0
  34. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/index.ts +0 -0
  35. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/AlgorithmsScene.tsx +0 -0
  36. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/ExportScene.tsx +0 -0
  37. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/ExtractionScene.tsx +0 -0
  38. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/IntroScene.tsx +0 -0
  39. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/MergeScene.tsx +0 -0
  40. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/NodeEdgeScene.tsx +0 -0
  41. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/OutroScene.tsx +0 -0
  42. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/QueryBuilderScene.tsx +0 -0
  43. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/RetrievalScene.tsx +0 -0
  44. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/VectorSearchScene.tsx +0 -0
  45. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/theme.ts +0 -0
  46. {graphmemory-1.2.0 → graphmemory-1.3.0}/video/tsconfig.json +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: graphmemory
3
- Version: 1.2.0
3
+ Version: 1.3.0
4
4
  Summary: Graph-based memory system using DuckDB
5
5
  Project-URL: Homepage, https://github.com/bradAGI/GraphMemory
6
6
  Project-URL: Repository, https://github.com/bradAGI/GraphMemory
@@ -224,7 +224,7 @@ All IDs are auto-generated UUIDs. All models are [Pydantic](https://docs.pydanti
224
224
 
225
225
  | Method | Description |
226
226
  |--------|-------------|
227
- | `GraphMemory(database=None, vector_length=3, distance_metric='l2')` | Initialize. `None` = in-memory. |
227
+ | `GraphMemory(database=None, vector_length=3, distance_metric='l2', hnsw_ef_construction=128, hnsw_ef_search=64, hnsw_m=16, auto_index=True)` | Initialize. `None` = in-memory. HNSW index auto-created. |
228
228
  | `close()` | Close connection (thread-safe, idempotent). |
229
229
  | `transaction()` | Context manager for atomic operations. |
230
230
 
@@ -262,7 +262,8 @@ All IDs are auto-generated UUIDs. All models are [Pydantic](https://docs.pydanti
262
262
  | `nearest_nodes(vector, limit) -> list[NearestNode]` | Vector similarity search. |
263
263
  | `search_nodes(query_text, limit=10) -> list[SearchResult]` | Full-text BM25 search. |
264
264
  | `hybrid_search(query_text, query_vector, ...) -> list[SearchResult]` | Combined text + vector search. |
265
- | `create_index()` | Create HNSW index for faster vector search. |
265
+ | `create_index(ef_construction=None, ef_search=None, m=None)` | Create/recreate HNSW index with tunable params. Auto-called on init. |
266
+ | `compact_index()` | Compact HNSW index to reclaim space after deletions. |
266
267
 
267
268
  ### Retrieval
268
269
 
@@ -295,7 +296,7 @@ See `examples/` for complete usage:
295
296
 
296
297
  ## Testing
297
298
 
298
- 265 tests covering all functionality.
299
+ 291 tests covering all functionality.
299
300
 
300
301
  ```sh
301
302
  python3 -m pytest tests/tests.py -v
@@ -197,7 +197,7 @@ All IDs are auto-generated UUIDs. All models are [Pydantic](https://docs.pydanti
197
197
 
198
198
  | Method | Description |
199
199
  |--------|-------------|
200
- | `GraphMemory(database=None, vector_length=3, distance_metric='l2')` | Initialize. `None` = in-memory. |
200
+ | `GraphMemory(database=None, vector_length=3, distance_metric='l2', hnsw_ef_construction=128, hnsw_ef_search=64, hnsw_m=16, auto_index=True)` | Initialize. `None` = in-memory. HNSW index auto-created. |
201
201
  | `close()` | Close connection (thread-safe, idempotent). |
202
202
  | `transaction()` | Context manager for atomic operations. |
203
203
 
@@ -235,7 +235,8 @@ All IDs are auto-generated UUIDs. All models are [Pydantic](https://docs.pydanti
235
235
  | `nearest_nodes(vector, limit) -> list[NearestNode]` | Vector similarity search. |
236
236
  | `search_nodes(query_text, limit=10) -> list[SearchResult]` | Full-text BM25 search. |
237
237
  | `hybrid_search(query_text, query_vector, ...) -> list[SearchResult]` | Combined text + vector search. |
238
- | `create_index()` | Create HNSW index for faster vector search. |
238
+ | `create_index(ef_construction=None, ef_search=None, m=None)` | Create/recreate HNSW index with tunable params. Auto-called on init. |
239
+ | `compact_index()` | Compact HNSW index to reclaim space after deletions. |
239
240
 
240
241
  ### Retrieval
241
242
 
@@ -268,7 +269,7 @@ See `examples/` for complete usage:
268
269
 
269
270
  ## Testing
270
271
 
271
- 265 tests covering all functionality.
272
+ 291 tests covering all functionality.
272
273
 
273
274
  ```sh
274
275
  python3 -m pytest tests/tests.py -v
@@ -0,0 +1,152 @@
1
+ """End-to-end test: ingest aimav4.txt using parallel LLM extraction via DSPy."""
2
+
3
+ import sys
4
+ import os
5
+ import re
6
+ import time
7
+ import logging
8
+
9
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
10
+
11
+ import dspy
12
+ from graphmemory import GraphMemory, MergeStrategy
13
+ from graphmemory.extraction import extract_and_merge_parallel
14
+
15
+ logging.basicConfig(level=logging.WARNING, format="%(levelname)s %(name)s: %(message)s")
16
+ logger = logging.getLogger(__name__)
17
+
18
+ # --- Configure DSPy with gpt-5-nano (10k RPM, 10M TPM) ---
19
+ lm = dspy.LM("openai/gpt-5-nano")
20
+ dspy.configure(lm=lm)
21
+
22
+ # With 10k RPM we can safely run 50+ concurrent requests
23
+ MAX_WORKERS = 50
24
+
25
+
26
def chunk_text(text: str, max_chars: int = 4000) -> list[str]:
    """Split *text* into paragraph-aligned chunks of at most *max_chars* characters.

    Paragraphs are the non-empty, stripped pieces of *text* separated by a
    blank line.  Consecutive paragraphs are packed into one chunk (re-joined
    with a blank line) for as long as the joined result fits in *max_chars*.

    A single paragraph that is itself longer than *max_chars* is never split;
    it is emitted whole as its own chunk.

    Args:
        text: Raw document text.
        max_chars: Upper bound on the length of a multi-paragraph chunk.

    Returns:
        The chunks in document order; an empty list for blank input.
    """
    separator = "\n\n"
    paragraphs = [p.strip() for p in text.split(separator) if p.strip()]

    chunks: list[str] = []
    current: list[str] = []
    current_len = 0  # length of separator.join(current)
    for paragraph in paragraphs:
        # Length of the chunk if this paragraph were appended; the separator
        # must be counted, otherwise chunks silently exceed max_chars.
        if current:
            joined_len = current_len + len(separator) + len(paragraph)
        else:
            joined_len = len(paragraph)

        if current and joined_len > max_chars:
            chunks.append(separator.join(current))
            current = []
            joined_len = len(paragraph)

        current.append(paragraph)
        current_len = joined_len

    if current:
        chunks.append(separator.join(current))
    return chunks
42
+
43
+
44
def on_progress(phase, done, total):
    """Draw a single-line text progress bar for one extraction phase on stdout.

    Args:
        phase: Short phase label (padded to five characters).
        done: Number of completed items.
        total: Number of items in the phase.

    The line is redrawn in place using a carriage return; a newline is
    emitted once ``done == total`` so later output starts on a fresh line.
    """
    bar_len = 30
    # An empty phase (total == 0) counts as complete instead of dividing by zero.
    fraction = done / total if total > 0 else 1.0
    # Clamp so an out-of-range `done` cannot produce a bar of the wrong width.
    filled = min(bar_len, max(0, int(bar_len * fraction)))
    bar = "█" * filled + "░" * (bar_len - filled)
    print(f"\r {phase:5s} [{bar}] {done}/{total}", end="", flush=True)
    if done == total:
        print()
51
+
52
+
53
def main():
    """Ingest the start of ``input/aimav4.txt`` into an in-memory graph and print a report.

    Steps:
      1. Read the first 200,000 characters, strip HTML comments, chunk the text.
      2. Run parallel node/edge extraction and merge the results into the graph.
      3. Resolve remaining duplicate nodes.
      4. Print graph statistics, sample entities/relationships and a
         full-text search for 'artificial intelligence'.
    """
    input_path = os.path.join(os.path.dirname(__file__), "..", "input", "aimav4.txt")
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(input_path, encoding="utf-8") as f:
        text = f.read(200_000)

    # DOTALL so that HTML comments spanning several lines are stripped too.
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    chunks = chunk_text(text, max_chars=4000)

    print("=" * 60)
    print("GraphMemory — Parallel LLM Extraction")
    print("=" * 60)
    print(f"Source: aimav4.txt ({len(text):,} chars)")
    print(f"Chunks: {len(chunks)} x ~4k chars")
    print(f"Workers: {MAX_WORKERS} concurrent LLM calls")
    print("LLM: gpt-5-nano via DSPy")

    db = GraphMemory(database=":memory:", vector_length=3)

    print("\n--- Phase 1: Node extraction (parallel) ---")
    print("--- Phase 2: Edge extraction (parallel) ---")
    t0 = time.time()

    node_results, edge_results = extract_and_merge_parallel(
        db,
        chunks,
        match_keys=["name"],
        match_type=True,
        similarity_threshold=0.88,
        max_workers=MAX_WORKERS,
        on_progress=on_progress,
    )

    elapsed = time.time() - t0
    created_n = sum(1 for r in node_results if r.created)
    merged_n = sum(1 for r in node_results if not r.created)
    created_e = sum(1 for r in edge_results if r.created)
    merged_e = sum(1 for r in edge_results if not r.created)

    # Two LLM calls per chunk: one for nodes, one for edges.
    print(f"\n Done in {elapsed:.1f}s ({len(chunks) * 2} LLM calls)")
    print(f" Nodes: {created_n} new, {merged_n} fuzzy-merged")
    print(f" Edges: {created_e} new, {merged_e} deduped")

    # --- Post-extraction dedupe ---
    print("\n--- Post-extraction duplicate resolution ---")
    t1 = time.time()
    clusters = db.resolve_duplicates(
        match_keys=["name"],
        match_type=True,
        similarity_threshold=0.90,
    )
    print(f" {len(clusters)} clusters resolved in {time.time() - t1:.1f}s")
    for c in clusters[:10]:
        merged_names = [m.properties.get("name", "?") for m in c.merged]
        print(f" '{c.survivor.properties.get('name')}' <- {merged_names}")
    if len(clusters) > 10:
        print(f" ... and {len(clusters) - 10} more")

    # --- Results ---
    all_nodes = db.nodes_to_json()
    all_edges = db.edges_to_json()

    type_counts = {}
    for n in all_nodes:
        t = n.get("type", "Unknown")
        type_counts[t] = type_counts.get(t, 0) + 1

    print("\n--- Final Graph ---")
    print(f" Nodes: {len(all_nodes)}")
    print(f" Edges: {len(all_edges)}")
    # Most frequent node types first.
    print(f" Types: {dict(sorted(type_counts.items(), key=lambda x: -x[1]))}")

    print("\n--- Sample Entities (first 30) ---")
    # `or ""` keeps the sort key comparable when type or name is None.
    sorted_nodes = sorted(
        all_nodes,
        key=lambda x: (x.get("type") or "", x.get("properties", {}).get("name") or ""),
    )
    for n in sorted_nodes[:30]:
        props = n.get("properties", {})
        print(f" [{n.get('type', '?'):15}] {props.get('name', props)}")
    if len(sorted_nodes) > 30:
        print(f" ... and {len(sorted_nodes) - 30} more")

    print("\n--- Sample Relationships (first 20) ---")
    node_id_map = {n["id"]: n for n in all_nodes}
    for e in all_edges[:20]:
        src = node_id_map.get(e["source_id"], {}).get("properties", {}).get("name", "?")
        tgt = node_id_map.get(e["target_id"], {}).get("properties", {}).get("name", "?")
        print(f" {src} --[{e['relation']}]--> {tgt}")
    if len(all_edges) > 20:
        print(f" ... and {len(all_edges) - 20} more")

    print("\n--- Search: 'artificial intelligence' ---")
    results = db.search_nodes("artificial intelligence", limit=5)
    for sr in results:
        print(f" [{sr.node.type}] {sr.node.properties.get('name', '?')} (score={sr.score:.3f})")

    print(f"\n{'=' * 60}")
    print(f"{len(all_nodes)} nodes, {len(all_edges)} edges from {len(text):,} chars in {elapsed:.1f}s")
    print(f"{'=' * 60}")
149
+
150
+
151
+ if __name__ == "__main__":
152
+ main()
@@ -86,7 +86,8 @@ class GraphMemory:
86
86
  'inner_product': {'function': 'array_negative_inner_product', 'hnsw_metric': 'ip'},
87
87
  }
88
88
 
89
- def __init__(self, database=None, vector_length=3, distance_metric='l2', max_retries=3, retry_base_delay=0.1):
89
+ def __init__(self, database=None, vector_length=3, distance_metric='l2', max_retries=3, retry_base_delay=0.1,
90
+ hnsw_ef_construction=128, hnsw_ef_search=64, hnsw_m=16, auto_index=True):
90
91
  if distance_metric not in self.DISTANCE_METRICS:
91
92
  raise ValueError(
92
93
  f"Invalid distance_metric '{distance_metric}'. "
@@ -97,9 +98,13 @@ class GraphMemory:
97
98
  self.distance_metric = distance_metric
98
99
  self.max_retries = max_retries
99
100
  self.retry_base_delay = retry_base_delay
101
+ self.hnsw_ef_construction = hnsw_ef_construction
102
+ self.hnsw_ef_search = hnsw_ef_search
103
+ self.hnsw_m = hnsw_m
100
104
  self._lock = threading.RLock()
101
105
  self._fts_initialized = False
102
106
  self._fts_dirty = True
107
+ self._hnsw_indexed = False
103
108
  self._closed = False
104
109
  self.conn = duckdb.connect(database=self.database)
105
110
  self._load_vss_extension()
@@ -116,6 +121,9 @@ class GraphMemory:
116
121
  self._create_tables()
117
122
  logger.info("Tables created or verified successfully.")
118
123
 
124
+ if auto_index:
125
+ self._ensure_hnsw_index()
126
+
119
127
  def cursor(self):
120
128
  """Return a new DuckDB cursor for individual operations.
121
129
 
@@ -150,6 +158,8 @@ class GraphMemory:
150
158
  self._configure_database()
151
159
  self._fts_initialized = False
152
160
  self._fts_dirty = True
161
+ self._hnsw_indexed = False
162
+ self._ensure_hnsw_index()
153
163
  logger.info("Reconnection successful.")
154
164
 
155
165
  def close(self):
@@ -191,6 +201,8 @@ class GraphMemory:
191
201
 
192
202
  def set_vector_length(self, vector_length):
193
203
  self.vector_length = vector_length
204
+ self._hnsw_indexed = False
205
+ self._ensure_hnsw_index()
194
206
  logger.info(f"Vector length set to: {self.vector_length}")
195
207
 
196
208
  def _create_tables(self):
@@ -303,6 +315,7 @@ class GraphMemory:
303
315
  cur.execute(
304
316
  "DELETE FROM nodes WHERE id = ?;", (str(node_id),))
305
317
  self._fts_dirty = True
318
+ self.compact_index()
306
319
  except duckdb.Error as e:
307
320
  logger.error(f"Error deleting node: {e}")
308
321
 
@@ -321,6 +334,7 @@ class GraphMemory:
321
334
  cur.execute(
322
335
  f"DELETE FROM nodes WHERE id IN ({placeholders});", id_strs)
323
336
  self._fts_dirty = True
337
+ self.compact_index()
324
338
  except duckdb.Error as e:
325
339
  logger.error(f"Error during bulk delete nodes: {e}")
326
340
 
@@ -920,15 +934,53 @@ class GraphMemory:
920
934
  logger.error(f"Error updating edge: {e}")
921
935
  return False
922
936
 
937
+ def _ensure_hnsw_index(self):
938
+ """Create HNSW index if not already present. Called automatically on init."""
939
+ if self._hnsw_indexed:
940
+ return
941
+ try:
942
+ nodes_exist = self.conn.execute(
943
+ "SELECT 1 FROM information_schema.tables WHERE table_name = 'nodes';"
944
+ ).fetchone()
945
+ if nodes_exist:
946
+ self.create_index()
947
+ except duckdb.Error:
948
+ pass
949
+
923
950
  @with_retry()
924
- def create_index(self):
951
def create_index(self, ef_construction: int | None = None, ef_search: int | None = None, m: int | None = None):
    """Create or recreate the HNSW vector index ``vss_idx`` on ``nodes.vector``.

    Any existing ``vss_idx`` is dropped first so that a changed metric or
    changed parameters take effect.  The metric comes from
    ``self.distance_metric``.

    Args:
        ef_construction: Candidate vertices during build (default from init).
        ef_search: Candidate vertices during search (default from init).
        m: Max neighbors per vertex (default from init).

    DuckDB errors are logged, not raised.  ``self._hnsw_indexed`` is True
    afterwards only if the index was actually created.
    """
    # A falsy argument (None or 0) falls back to the value given at init.
    ef_c = ef_construction or self.hnsw_ef_construction
    ef_s = ef_search or self.hnsw_ef_search
    m_val = m or self.hnsw_m
    hnsw_metric = self.DISTANCE_METRICS[self.distance_metric]['hnsw_metric']
    with self._lock:
        try:
            # Drop existing index first to allow metric/param changes
            self.conn.execute("DROP INDEX IF EXISTS vss_idx;")
            # The old index is gone now; if CREATE fails below the flag must
            # not keep claiming that an index exists.
            self._hnsw_indexed = False
            self.conn.execute(
                f"CREATE INDEX vss_idx ON nodes USING HNSW(vector) "
                f"WITH (metric = '{hnsw_metric}', ef_construction = {ef_c}, ef_search = {ef_s}, M = {m_val});"
            )
            self._hnsw_indexed = True
            logger.info(
                "HNSW index created (metric=%s, ef_construction=%s, ef_search=%s, M=%s).",
                hnsw_metric, ef_c, ef_s, m_val,
            )
        except duckdb.Error as e:
            logger.error("Error creating HNSW index: %s", e)
975
+
976
def compact_index(self):
    """Run DuckDB's ``hnsw_compact_index`` pragma on ``vss_idx``.

    Intended to reclaim index space after node deletions.  The call is made
    while holding ``self._lock``; a DuckDB error is logged and not re-raised.
    """
    pragma = "PRAGMA hnsw_compact_index('vss_idx');"
    with self._lock:
        try:
            self.conn.execute(pragma)
        except duckdb.Error as e:
            logger.error(f"Error compacting HNSW index: {e}")
        else:
            logger.info("HNSW index compacted.")
932
984
 
933
985
  @with_retry()
934
986
  def nearest_nodes(self, vector: list[float], limit: int) -> list[NearestNode]:
@@ -1334,9 +1386,10 @@ class GraphMemory:
1334
1386
 
1335
1387
  # Collect vector similarity results
1336
1388
  vss_results = {}
1389
+ dist_func = self.DISTANCE_METRICS[self.distance_metric]['function']
1337
1390
  vss_query = f"""
1338
1391
  SELECT id, type, properties, vector,
1339
- array_distance(vector, CAST(? AS FLOAT[{self.vector_length}])) AS distance
1392
+ {dist_func}(vector, CAST(? AS FLOAT[{self.vector_length}])) AS distance
1340
1393
  FROM nodes
1341
1394
  WHERE vector IS NOT NULL
1342
1395
  ORDER BY distance;
@@ -10,7 +10,8 @@ Requires the ``dspy`` optional dependency:
10
10
  from __future__ import annotations
11
11
 
12
12
  import logging
13
- from typing import TYPE_CHECKING, Any
13
+ from concurrent.futures import ThreadPoolExecutor, as_completed
14
+ from typing import TYPE_CHECKING, Any, Callable
14
15
 
15
16
  from pydantic import BaseModel, Field
16
17
 
@@ -296,3 +297,124 @@ def extract_and_merge(
296
297
  len(edge_results),
297
298
  )
298
299
  return node_results, edge_results
300
+
301
+
302
+ # ---------------------------------------------------------------------------
303
+ # Parallel extraction
304
+ # ---------------------------------------------------------------------------
305
+
306
+
307
def _extract_nodes_chunk(chunk: str) -> list[Node]:
    """Run ``extract_nodes`` on one chunk, passing the whole chunk as its only sentence.

    Worker for the parallel node phase; it does not touch the graph itself.
    """
    single_sentence = [chunk]
    return extract_nodes(chunk, sentences=single_sentence)
310
+
311
+
312
def _extract_edges_chunk(chunk: str, nodes: list[Node]) -> list[Edge]:
    """Run ``extract_edges`` on one chunk against the already-known *nodes*.

    Worker for the parallel edge phase; the whole chunk is passed as its only
    sentence.
    """
    single_sentence = [chunk]
    return extract_edges(chunk, nodes, sentences=single_sentence)
315
+
316
+
317
def _run_extraction_phase(worker, chunks, extra_args, phase, label, max_workers, on_progress):
    """Run ``worker(chunk, *extra_args)`` for every chunk on a thread pool.

    Returns a dict mapping chunk index to the worker's result.  A chunk whose
    worker raised is logged and mapped to ``[]`` so that one failed LLM call
    does not abort the phase.  ``on_progress(phase, completed, total)`` is
    invoked after each chunk finishes, in completion order.
    """
    total = len(chunks)
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        future_to_idx = {
            pool.submit(worker, chunk, *extra_args): i
            for i, chunk in enumerate(chunks)
        }
        for done, future in enumerate(as_completed(future_to_idx), start=1):
            idx = future_to_idx[future]
            try:
                results[idx] = future.result()
            except Exception as e:  # best-effort: keep the other chunks going
                logger.warning("%s extraction failed for chunk %d: %s", label, idx + 1, e)
                results[idx] = []
            if on_progress:
                on_progress(phase, done, total)
    return results


def extract_and_merge_parallel(
    graph: GraphMemory,
    chunks: list[str],
    match_keys: list[str] | None = None,
    match_type: bool = True,
    strategy: MergeStrategy = MergeStrategy.UPDATE,
    similarity_threshold: float = 1.0,
    vector_threshold: float | None = None,
    max_workers: int = 8,
    on_progress: Callable[[str, int, int], None] | None = None,
) -> tuple[list[MergeResult], list[EdgeMergeResult]]:
    """Extract from multiple text chunks in parallel, then merge sequentially.

    Runs in two parallel phases to maximize LLM throughput:
      1. Node extraction — all chunks concurrently.
      2. Edge extraction — all chunks concurrently, each given the nodes
         merged in phase 1 as context.
    After each phase the results are merged into the graph one chunk at a
    time, in chunk order.

    Args:
        graph: A :class:`~graphmemory.database.GraphMemory` instance.
        chunks: List of text chunks to process.
        match_keys: Property names to match nodes on (default ``["name"]``).
        match_type: Also require ``node.type`` to match.
        strategy: How to merge properties on match.
        similarity_threshold: Jaro-Winkler threshold for fuzzy matching.
        vector_threshold: Max cosine distance for vector similarity.
        max_workers: Max concurrent LLM calls (match your RPM headroom).
        on_progress: Optional callback ``(phase, completed, total)`` where
            phase is ``"nodes"`` or ``"edges"``.

    Returns:
        Aggregated ``(node_results, edge_results)`` across all chunks.
    """
    if match_keys is None:
        match_keys = ["name"]

    total = len(chunks)

    # Phase 1: Extract nodes from ALL chunks in parallel
    chunk_nodes = _run_extraction_phase(
        _extract_nodes_chunk, chunks, (), "nodes", "Node", max_workers, on_progress,
    )

    # Merge all nodes into DB sequentially to build the full node set
    all_node_results: list[MergeResult] = []
    for idx in range(total):
        nodes = chunk_nodes.get(idx, [])
        if nodes:
            results = graph.bulk_merge_nodes(
                nodes, match_keys=match_keys, match_type=match_type,
                strategy=strategy, similarity_threshold=similarity_threshold,
                vector_threshold=vector_threshold,
            )
            all_node_results.extend(results)

    # Build the node list for edge extraction context.  A node that several
    # chunks merged into appears once per merge result; keep one entry per id
    # (the most recent state) so the context sent with every edge-extraction
    # call is not bloated with duplicates.
    unique_nodes = {r.node.id: r.node for r in all_node_results}
    all_nodes = list(unique_nodes.values())
    logger.info("Phase 1 complete: %d nodes extracted and merged.", len(all_nodes))

    # Phase 2: Extract edges from ALL chunks in parallel (with full node context)
    chunk_edges = _run_extraction_phase(
        _extract_edges_chunk, chunks, (all_nodes,), "edges", "Edge", max_workers, on_progress,
    )

    # Merge all edges into DB sequentially
    all_edge_results: list[EdgeMergeResult] = []
    for idx in range(total):
        edges = chunk_edges.get(idx, [])
        if edges:
            results = graph.bulk_merge_edges(edges)
            all_edge_results.extend(results)

    logger.info(
        "Parallel extraction complete: %d chunks, %d nodes, %d edges.",
        total, len(all_node_results), len(all_edge_results),
    )
    return all_node_results, all_edge_results
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "graphmemory"
7
- version = "1.2.0"
7
+ version = "1.3.0"
8
8
  description = "Graph-based memory system using DuckDB"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -2729,5 +2729,147 @@ class TestFuzzyMatching(unittest.TestCase):
2729
2729
  self.assertEqual(remaining, 2)
2730
2730
 
2731
2731
 
2732
class TestHNSWIndex(unittest.TestCase):
    """HNSW index lifecycle: automatic creation, tuning parameters, rebuilds, compaction."""

    def _open(self, **kwargs):
        """Open an in-memory GraphMemory that is closed even if the test fails."""
        db = GraphMemory(database=':memory:', vector_length=3, **kwargs)
        self.addCleanup(db.close)
        return db

    def test_auto_index_on_init(self):
        db = self._open()
        self.assertTrue(db._hnsw_indexed)

    def test_auto_index_disabled(self):
        db = self._open(auto_index=False)
        self.assertFalse(db._hnsw_indexed)

    def test_create_index_with_custom_params(self):
        db = self._open(auto_index=False)
        db.create_index(ef_construction=64, ef_search=32, m=8)
        self.assertTrue(db._hnsw_indexed)

    def test_create_index_uses_configured_metric(self):
        for metric in ['l2', 'cosine', 'inner_product']:
            db = self._open(distance_metric=metric)
            self.assertTrue(db._hnsw_indexed)

    def test_create_index_idempotent_recreate(self):
        db = self._open()
        db.create_index()
        db.create_index()
        self.assertTrue(db._hnsw_indexed)

    def test_set_vector_length_rebuilds_index(self):
        db = self._open()
        self.assertTrue(db._hnsw_indexed)
        db._hnsw_indexed = False
        db.set_vector_length(5)
        self.assertTrue(db._hnsw_indexed)
        self.assertEqual(db.vector_length, 5)

    def test_compact_index_no_error(self):
        db = self._open()
        node = Node(type="Test", properties={"name": "A"}, vector=[1.0, 0.0, 0.0])
        db.insert_node(node)
        db.delete_node(node.id)
        db.compact_index()

    def test_reconnect_rebuilds_index(self):
        import tempfile
        # A private temporary directory replaces the deprecated, race-prone
        # tempfile.mktemp() and also removes any side files DuckDB creates.
        with tempfile.TemporaryDirectory() as tmp_dir:
            db = GraphMemory(database=os.path.join(tmp_dir, 'graph.db'), vector_length=3)
            try:
                db._hnsw_indexed = False
                db._reconnect()
                self.assertTrue(db._hnsw_indexed)
            finally:
                # Close before the directory is deleted.
                db.close()

    def test_hnsw_params_stored(self):
        db = self._open(hnsw_ef_construction=256, hnsw_ef_search=128, hnsw_m=32)
        self.assertEqual(db.hnsw_ef_construction, 256)
        self.assertEqual(db.hnsw_ef_search, 128)
        self.assertEqual(db.hnsw_m, 32)
2800
+
2801
+
2802
class TestHybridSearchMetric(unittest.TestCase):
    """hybrid_search must rank with the distance metric the database was configured with."""

    def setUp(self):
        self.db = GraphMemory(database=':memory:', vector_length=3, distance_metric='cosine')
        self.db.insert_node(Node(type="Doc", properties={"text": "machine learning"}, vector=[1.0, 0.0, 0.0]))
        self.db.insert_node(Node(type="Doc", properties={"text": "deep learning"}, vector=[0.9, 0.1, 0.0]))
        self.db.insert_node(Node(type="Doc", properties={"text": "cooking recipes"}, vector=[0.0, 0.0, 1.0]))

    def tearDown(self):
        self.db.close()

    def _open(self, distance_metric):
        """Open an extra in-memory database that is closed even if the test fails."""
        db = GraphMemory(database=':memory:', vector_length=3, distance_metric=distance_metric)
        self.addCleanup(db.close)
        return db

    def test_hybrid_search_uses_cosine_metric(self):
        results = self.db.hybrid_search(
            query_text="learning",
            query_vector=[1.0, 0.0, 0.0],
            limit=3
        )
        self.assertGreater(len(results), 0)
        # The learning docs should score higher than cooking
        names = [r.node.properties.get("text") for r in results]
        self.assertIn("machine learning", names[:2])

    def test_hybrid_search_inner_product(self):
        db = self._open('inner_product')
        db.insert_node(Node(type="Doc", properties={"text": "similar"}, vector=[1.0, 0.0, 0.0]))
        db.insert_node(Node(type="Doc", properties={"text": "different"}, vector=[0.0, 0.0, 1.0]))
        results = db.hybrid_search(
            query_text="similar",
            query_vector=[1.0, 0.0, 0.0],
            limit=2
        )
        self.assertGreater(len(results), 0)

    def test_hybrid_search_l2_metric(self):
        db = self._open('l2')
        db.insert_node(Node(type="Doc", properties={"text": "near"}, vector=[0.1, 0.0, 0.0]))
        db.insert_node(Node(type="Doc", properties={"text": "far"}, vector=[9.0, 9.0, 9.0]))
        results = db.hybrid_search(
            query_text="near",
            query_vector=[0.0, 0.0, 0.0],
            limit=2
        )
        self.assertGreater(len(results), 0)
        self.assertEqual(results[0].node.properties["text"], "near")
2848
+
2849
+
2850
class TestCompactAfterDelete(unittest.TestCase):
    """Deleting nodes (which compacts the index internally) must leave the table empty."""

    def setUp(self):
        self.db = GraphMemory(database=':memory:', vector_length=3)

    def tearDown(self):
        self.db.close()

    def test_delete_node_compacts(self):
        only_node = Node(type="Test", properties={"name": "A"}, vector=[1.0, 0.0, 0.0])
        self.db.insert_node(only_node)
        # Should not raise — compact_index called internally
        self.db.delete_node(only_node.id)
        remaining = self.db.nodes_to_json()
        self.assertEqual(len(remaining), 0)

    def test_bulk_delete_nodes_compacts(self):
        first = Node(type="Test", properties={"name": "A"}, vector=[1.0, 0.0, 0.0])
        second = Node(type="Test", properties={"name": "B"}, vector=[0.0, 1.0, 0.0])
        self.db.insert_node(first)
        self.db.insert_node(second)
        self.db.bulk_delete_nodes([first.id, second.id])
        remaining = self.db.nodes_to_json()
        self.assertEqual(len(remaining), 0)
2871
+ self.assertEqual(len(self.db.nodes_to_json()), 0)
2872
+
2873
+
2732
2874
  if __name__ == '__main__':
2733
2875
  unittest.main()
@@ -1,147 +0,0 @@
1
- """End-to-end test: ingest aimav4.txt using real LLM extraction via DSPy."""
2
-
3
- import sys
4
- import os
5
- import re
6
- import logging
7
-
8
- sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
9
-
10
- import dspy
11
- from graphmemory import GraphMemory, MergeStrategy
12
- from graphmemory.extraction import extract_and_merge
13
-
14
- logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
15
- logger = logging.getLogger(__name__)
16
-
17
- # --- Configure DSPy with gpt-5-nano ---
18
- lm = dspy.LM("openai/gpt-5-nano")
19
- dspy.configure(lm=lm)
20
-
21
-
22
- def chunk_text(text: str, max_chars: int = 3000) -> list[str]:
23
- """Split text into paragraph-aware chunks."""
24
- paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
25
- chunks = []
26
- current = []
27
- current_len = 0
28
- for p in paragraphs:
29
- if current_len + len(p) > max_chars and current:
30
- chunks.append("\n\n".join(current))
31
- current = []
32
- current_len = 0
33
- current.append(p)
34
- current_len += len(p)
35
- if current:
36
- chunks.append("\n\n".join(current))
37
- return chunks
38
-
39
-
40
- def main():
41
- input_path = os.path.join(os.path.dirname(__file__), "..", "input", "aimav4.txt")
42
- with open(input_path) as f:
43
- text = f.read(100_000)
44
-
45
- text = re.sub(r"<!--.*?-->", "", text)
46
- chunks = chunk_text(text, max_chars=4000)
47
-
48
- print("=" * 60)
49
- print("GraphMemory — Real LLM Extraction Test")
50
- print("=" * 60)
51
- print(f"Source: aimav4.txt ({len(text)} chars)")
52
- print(f"Chunks: {len(chunks)}")
53
- print(f"LLM: gpt-5-nano via DSPy")
54
-
55
- db = GraphMemory(database=":memory:", vector_length=3)
56
-
57
- print(f"\n--- Extracting entities & relationships ---")
58
- total_nodes = 0
59
- total_edges = 0
60
- total_merged_nodes = 0
61
- total_merged_edges = 0
62
-
63
- for i, chunk in enumerate(chunks):
64
- print(f"\n Chunk {i + 1}/{len(chunks)} ({len(chunk)} chars)...")
65
- try:
66
- # Pass each chunk as a single "sentence" to avoid per-sentence LLM calls
67
- node_results, edge_results = extract_and_merge(
68
- db,
69
- chunk,
70
- match_keys=["name"],
71
- match_type=True,
72
- similarity_threshold=0.88,
73
- sentences=[chunk], # single LLM call per chunk
74
- )
75
- created_n = sum(1 for r in node_results if r.created)
76
- merged_n = sum(1 for r in node_results if not r.created)
77
- created_e = sum(1 for r in edge_results if r.created)
78
- merged_e = sum(1 for r in edge_results if not r.created)
79
-
80
- total_nodes += created_n
81
- total_merged_nodes += merged_n
82
- total_edges += created_e
83
- total_merged_edges += merged_e
84
-
85
- print(f" Nodes: {created_n} new, {merged_n} merged")
86
- print(f" Edges: {created_e} new, {merged_e} merged")
87
- except Exception as e:
88
- logger.warning(f" Chunk {i + 1} failed: {e}")
89
-
90
- # --- Post-extraction dedupe ---
91
- print(f"\n--- Post-extraction duplicate resolution ---")
92
- clusters = db.resolve_duplicates(
93
- match_keys=["name"],
94
- match_type=True,
95
- similarity_threshold=0.90,
96
- )
97
- if clusters:
98
- for c in clusters:
99
- merged_names = [m.properties.get("name", "?") for m in c.merged]
100
- print(f" Merged: '{c.survivor.properties.get('name')}' <- {merged_names}")
101
- else:
102
- print(" No additional duplicates found.")
103
-
104
- # --- Results ---
105
- all_nodes = db.nodes_to_json()
106
- all_edges = db.edges_to_json()
107
-
108
- print(f"\n--- Final Graph ---")
109
- print(f" Nodes: {len(all_nodes)}")
110
- print(f" Edges: {len(all_edges)}")
111
-
112
- type_counts = {}
113
- for n in all_nodes:
114
- t = n.get("type", "Unknown")
115
- type_counts[t] = type_counts.get(t, 0) + 1
116
- print(f" Types: {type_counts}")
117
-
118
- print(f"\n--- Extracted Entities ---")
119
- for n in sorted(all_nodes, key=lambda x: (x.get("type", ""), x.get("properties", {}).get("name", ""))):
120
- props = n.get("properties", {})
121
- print(f" [{n.get('type', '?'):15}] {props.get('name', props)}")
122
-
123
- print(f"\n--- Extracted Relationships ---")
124
- node_id_map = {n["id"]: n for n in all_nodes}
125
- for e in all_edges:
126
- src = node_id_map.get(e["source_id"], {}).get("properties", {}).get("name", e["source_id"])
127
- tgt = node_id_map.get(e["target_id"], {}).get("properties", {}).get("name", e["target_id"])
128
- print(f" {src} --[{e['relation']}]--> {tgt}")
129
-
130
- print(f"\n--- Full-text search: 'deep learning' ---")
131
- results = db.search_nodes("deep learning", limit=5)
132
- for sr in results:
133
- print(f" [{sr.node.type}] {sr.node.properties.get('name', '?')} (score={sr.score:.3f})")
134
-
135
- print(f"\n--- Summary ---")
136
- print(f" Extracted: {total_nodes} nodes, {total_edges} edges")
137
- print(f" Fuzzy-merged during ingest: {total_merged_nodes} nodes, {total_merged_edges} edges")
138
- print(f" Post-dedupe clusters: {len(clusters)}")
139
- print(f" Final graph: {len(all_nodes)} nodes, {len(all_edges)} edges")
140
-
141
- print("\n" + "=" * 60)
142
- print("Done!")
143
- print("=" * 60)
144
-
145
-
146
- if __name__ == "__main__":
147
- main()
File without changes
File without changes