graphmemory 1.2.0__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {graphmemory-1.2.0 → graphmemory-1.3.0}/PKG-INFO +5 -4
- {graphmemory-1.2.0 → graphmemory-1.3.0}/README.md +4 -3
- graphmemory-1.3.0/examples/test_ingest.py +152 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/graphmemory/database.py +59 -6
- {graphmemory-1.2.0 → graphmemory-1.3.0}/graphmemory/extraction.py +123 -1
- {graphmemory-1.2.0 → graphmemory-1.3.0}/pyproject.toml +1 -1
- {graphmemory-1.2.0 → graphmemory-1.3.0}/tests/tests.py +142 -0
- graphmemory-1.2.0/examples/test_ingest.py +0 -147
- {graphmemory-1.2.0 → graphmemory-1.3.0}/.gitignore +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/LICENSE +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/examples/dspy_example_typed_pred.py +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/examples/lexical_graph.py +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/examples/openai_example.py +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/graphmemory/__init__.py +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/graphmemory/algorithms.py +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/graphmemory/models.py +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/input/Genetic Programming1.txt +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/input/Genetic Programming2.txt +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/input/Genetic Programming3.txt +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/input/Genetic Programming4.txt +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/input/aimav4.txt +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/input/reading_in_plannings.txt +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/requirements.txt +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/package-lock.json +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/package.json +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/public/banner.png +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/GraphMemoryShowcase.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/Root.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/components/Background.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/components/CodeBlock.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/components/FeaturePill.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/components/GraphViz.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/components/SectionTitle.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/index.ts +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/AlgorithmsScene.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/ExportScene.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/ExtractionScene.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/IntroScene.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/MergeScene.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/NodeEdgeScene.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/OutroScene.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/QueryBuilderScene.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/RetrievalScene.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/scenes/VectorSearchScene.tsx +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/src/theme.ts +0 -0
- {graphmemory-1.2.0 → graphmemory-1.3.0}/video/tsconfig.json +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: graphmemory
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.3.0
|
|
4
4
|
Summary: Graph-based memory system using DuckDB
|
|
5
5
|
Project-URL: Homepage, https://github.com/bradAGI/GraphMemory
|
|
6
6
|
Project-URL: Repository, https://github.com/bradAGI/GraphMemory
|
|
@@ -224,7 +224,7 @@ All IDs are auto-generated UUIDs. All models are [Pydantic](https://docs.pydanti
|
|
|
224
224
|
|
|
225
225
|
| Method | Description |
|
|
226
226
|
|--------|-------------|
|
|
227
|
-
| `GraphMemory(database=None, vector_length=3, distance_metric='l2')` | Initialize. `None` = in-memory. |
|
|
227
|
+
| `GraphMemory(database=None, vector_length=3, distance_metric='l2', hnsw_ef_construction=128, hnsw_ef_search=64, hnsw_m=16, auto_index=True)` | Initialize. `None` = in-memory. HNSW index auto-created. |
|
|
228
228
|
| `close()` | Close connection (thread-safe, idempotent). |
|
|
229
229
|
| `transaction()` | Context manager for atomic operations. |
|
|
230
230
|
|
|
@@ -262,7 +262,8 @@ All IDs are auto-generated UUIDs. All models are [Pydantic](https://docs.pydanti
|
|
|
262
262
|
| `nearest_nodes(vector, limit) -> list[NearestNode]` | Vector similarity search. |
|
|
263
263
|
| `search_nodes(query_text, limit=10) -> list[SearchResult]` | Full-text BM25 search. |
|
|
264
264
|
| `hybrid_search(query_text, query_vector, ...) -> list[SearchResult]` | Combined text + vector search. |
|
|
265
|
-
| `create_index()` | Create HNSW index … |
|
|
265
|
+
| `create_index(ef_construction=None, ef_search=None, m=None)` | Create/recreate HNSW index with tunable params. Auto-called on init. |
|
|
266
|
+
| `compact_index()` | Compact HNSW index to reclaim space after deletions. |
|
|
266
267
|
|
|
267
268
|
### Retrieval
|
|
268
269
|
|
|
@@ -295,7 +296,7 @@ See `examples/` for complete usage:
|
|
|
295
296
|
|
|
296
297
|
## Testing
|
|
297
298
|
|
|
298
|
-
|
|
299
|
+
291 tests covering all functionality.
|
|
299
300
|
|
|
300
301
|
```sh
|
|
301
302
|
python3 -m pytest tests/tests.py -v
|
|
@@ -197,7 +197,7 @@ All IDs are auto-generated UUIDs. All models are [Pydantic](https://docs.pydanti
|
|
|
197
197
|
|
|
198
198
|
| Method | Description |
|
|
199
199
|
|--------|-------------|
|
|
200
|
-
| `GraphMemory(database=None, vector_length=3, distance_metric='l2')` | Initialize. `None` = in-memory. |
|
|
200
|
+
| `GraphMemory(database=None, vector_length=3, distance_metric='l2', hnsw_ef_construction=128, hnsw_ef_search=64, hnsw_m=16, auto_index=True)` | Initialize. `None` = in-memory. HNSW index auto-created. |
|
|
201
201
|
| `close()` | Close connection (thread-safe, idempotent). |
|
|
202
202
|
| `transaction()` | Context manager for atomic operations. |
|
|
203
203
|
|
|
@@ -235,7 +235,8 @@ All IDs are auto-generated UUIDs. All models are [Pydantic](https://docs.pydanti
|
|
|
235
235
|
| `nearest_nodes(vector, limit) -> list[NearestNode]` | Vector similarity search. |
|
|
236
236
|
| `search_nodes(query_text, limit=10) -> list[SearchResult]` | Full-text BM25 search. |
|
|
237
237
|
| `hybrid_search(query_text, query_vector, ...) -> list[SearchResult]` | Combined text + vector search. |
|
|
238
|
-
| `create_index()` | Create HNSW index … |
|
|
238
|
+
| `create_index(ef_construction=None, ef_search=None, m=None)` | Create/recreate HNSW index with tunable params. Auto-called on init. |
|
|
239
|
+
| `compact_index()` | Compact HNSW index to reclaim space after deletions. |
|
|
239
240
|
|
|
240
241
|
### Retrieval
|
|
241
242
|
|
|
@@ -268,7 +269,7 @@ See `examples/` for complete usage:
|
|
|
268
269
|
|
|
269
270
|
## Testing
|
|
270
271
|
|
|
271
|
-
|
|
272
|
+
291 tests covering all functionality.
|
|
272
273
|
|
|
273
274
|
```sh
|
|
274
275
|
python3 -m pytest tests/tests.py -v
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""End-to-end test: ingest aimav4.txt using parallel LLM extraction via DSPy."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import time
|
|
7
|
+
import logging
|
|
8
|
+
|
|
9
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
|
10
|
+
|
|
11
|
+
import dspy
|
|
12
|
+
from graphmemory import GraphMemory, MergeStrategy
|
|
13
|
+
from graphmemory.extraction import extract_and_merge_parallel
|
|
14
|
+
|
|
15
|
+
logging.basicConfig(level=logging.WARNING, format="%(levelname)s %(name)s: %(message)s")
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
# --- Configure DSPy with gpt-5-nano (10k RPM, 10M TPM) ---
|
|
19
|
+
lm = dspy.LM("openai/gpt-5-nano")
|
|
20
|
+
dspy.configure(lm=lm)
|
|
21
|
+
|
|
22
|
+
# With 10k RPM we can safely run 50+ concurrent requests
|
|
23
|
+
MAX_WORKERS = 50
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def chunk_text(text: str, max_chars: int = 4000) -> list[str]:
|
|
27
|
+
"""Split text into paragraph-aware chunks."""
|
|
28
|
+
paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
|
|
29
|
+
chunks = []
|
|
30
|
+
current = []
|
|
31
|
+
current_len = 0
|
|
32
|
+
for p in paragraphs:
|
|
33
|
+
if current_len + len(p) > max_chars and current:
|
|
34
|
+
chunks.append("\n\n".join(current))
|
|
35
|
+
current = []
|
|
36
|
+
current_len = 0
|
|
37
|
+
current.append(p)
|
|
38
|
+
current_len += len(p)
|
|
39
|
+
if current:
|
|
40
|
+
chunks.append("\n\n".join(current))
|
|
41
|
+
return chunks
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def on_progress(phase, done, total):
|
|
45
|
+
bar_len = 30
|
|
46
|
+
filled = int(bar_len * done / total)
|
|
47
|
+
bar = "█" * filled + "░" * (bar_len - filled)
|
|
48
|
+
print(f"\r {phase:5s} [{bar}] {done}/{total}", end="", flush=True)
|
|
49
|
+
if done == total:
|
|
50
|
+
print()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def main():
|
|
54
|
+
input_path = os.path.join(os.path.dirname(__file__), "..", "input", "aimav4.txt")
|
|
55
|
+
with open(input_path) as f:
|
|
56
|
+
text = f.read(200_000)
|
|
57
|
+
|
|
58
|
+
text = re.sub(r"<!--.*?-->", "", text)
|
|
59
|
+
chunks = chunk_text(text, max_chars=4000)
|
|
60
|
+
|
|
61
|
+
print("=" * 60)
|
|
62
|
+
print("GraphMemory — Parallel LLM Extraction")
|
|
63
|
+
print("=" * 60)
|
|
64
|
+
print(f"Source: aimav4.txt ({len(text):,} chars)")
|
|
65
|
+
print(f"Chunks: {len(chunks)} x ~4k chars")
|
|
66
|
+
print(f"Workers: {MAX_WORKERS} concurrent LLM calls")
|
|
67
|
+
print(f"LLM: gpt-5-nano via DSPy")
|
|
68
|
+
|
|
69
|
+
db = GraphMemory(database=":memory:", vector_length=3)
|
|
70
|
+
|
|
71
|
+
print(f"\n--- Phase 1: Node extraction (parallel) ---")
|
|
72
|
+
print(f"--- Phase 2: Edge extraction (parallel) ---")
|
|
73
|
+
t0 = time.time()
|
|
74
|
+
|
|
75
|
+
node_results, edge_results = extract_and_merge_parallel(
|
|
76
|
+
db,
|
|
77
|
+
chunks,
|
|
78
|
+
match_keys=["name"],
|
|
79
|
+
match_type=True,
|
|
80
|
+
similarity_threshold=0.88,
|
|
81
|
+
max_workers=MAX_WORKERS,
|
|
82
|
+
on_progress=on_progress,
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
elapsed = time.time() - t0
|
|
86
|
+
created_n = sum(1 for r in node_results if r.created)
|
|
87
|
+
merged_n = sum(1 for r in node_results if not r.created)
|
|
88
|
+
created_e = sum(1 for r in edge_results if r.created)
|
|
89
|
+
merged_e = sum(1 for r in edge_results if not r.created)
|
|
90
|
+
|
|
91
|
+
print(f"\n Done in {elapsed:.1f}s ({len(chunks) * 2} LLM calls)")
|
|
92
|
+
print(f" Nodes: {created_n} new, {merged_n} fuzzy-merged")
|
|
93
|
+
print(f" Edges: {created_e} new, {merged_e} deduped")
|
|
94
|
+
|
|
95
|
+
# --- Post-extraction dedupe ---
|
|
96
|
+
print(f"\n--- Post-extraction duplicate resolution ---")
|
|
97
|
+
t1 = time.time()
|
|
98
|
+
clusters = db.resolve_duplicates(
|
|
99
|
+
match_keys=["name"],
|
|
100
|
+
match_type=True,
|
|
101
|
+
similarity_threshold=0.90,
|
|
102
|
+
)
|
|
103
|
+
print(f" {len(clusters)} clusters resolved in {time.time() - t1:.1f}s")
|
|
104
|
+
for c in clusters[:10]:
|
|
105
|
+
merged_names = [m.properties.get("name", "?") for m in c.merged]
|
|
106
|
+
print(f" '{c.survivor.properties.get('name')}' <- {merged_names}")
|
|
107
|
+
if len(clusters) > 10:
|
|
108
|
+
print(f" ... and {len(clusters) - 10} more")
|
|
109
|
+
|
|
110
|
+
# --- Results ---
|
|
111
|
+
all_nodes = db.nodes_to_json()
|
|
112
|
+
all_edges = db.edges_to_json()
|
|
113
|
+
|
|
114
|
+
type_counts = {}
|
|
115
|
+
for n in all_nodes:
|
|
116
|
+
t = n.get("type", "Unknown")
|
|
117
|
+
type_counts[t] = type_counts.get(t, 0) + 1
|
|
118
|
+
|
|
119
|
+
print(f"\n--- Final Graph ---")
|
|
120
|
+
print(f" Nodes: {len(all_nodes)}")
|
|
121
|
+
print(f" Edges: {len(all_edges)}")
|
|
122
|
+
print(f" Types: {dict(sorted(type_counts.items(), key=lambda x: -x[1]))}")
|
|
123
|
+
|
|
124
|
+
print(f"\n--- Sample Entities (first 30) ---")
|
|
125
|
+
sorted_nodes = sorted(all_nodes, key=lambda x: (x.get("type") or "", x.get("properties", {}).get("name") or ""))
|
|
126
|
+
for n in sorted_nodes[:30]:
|
|
127
|
+
props = n.get("properties", {})
|
|
128
|
+
print(f" [{n.get('type', '?'):15}] {props.get('name', props)}")
|
|
129
|
+
if len(sorted_nodes) > 30:
|
|
130
|
+
print(f" ... and {len(sorted_nodes) - 30} more")
|
|
131
|
+
|
|
132
|
+
print(f"\n--- Sample Relationships (first 20) ---")
|
|
133
|
+
node_id_map = {n["id"]: n for n in all_nodes}
|
|
134
|
+
for e in all_edges[:20]:
|
|
135
|
+
src = node_id_map.get(e["source_id"], {}).get("properties", {}).get("name", "?")
|
|
136
|
+
tgt = node_id_map.get(e["target_id"], {}).get("properties", {}).get("name", "?")
|
|
137
|
+
print(f" {src} --[{e['relation']}]--> {tgt}")
|
|
138
|
+
if len(all_edges) > 20:
|
|
139
|
+
print(f" ... and {len(all_edges) - 20} more")
|
|
140
|
+
|
|
141
|
+
print(f"\n--- Search: 'artificial intelligence' ---")
|
|
142
|
+
results = db.search_nodes("artificial intelligence", limit=5)
|
|
143
|
+
for sr in results:
|
|
144
|
+
print(f" [{sr.node.type}] {sr.node.properties.get('name', '?')} (score={sr.score:.3f})")
|
|
145
|
+
|
|
146
|
+
print(f"\n{'=' * 60}")
|
|
147
|
+
print(f"{len(all_nodes)} nodes, {len(all_edges)} edges from {len(text):,} chars in {elapsed:.1f}s")
|
|
148
|
+
print(f"{'=' * 60}")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
if __name__ == "__main__":
|
|
152
|
+
main()
|
|
@@ -86,7 +86,8 @@ class GraphMemory:
|
|
|
86
86
|
'inner_product': {'function': 'array_negative_inner_product', 'hnsw_metric': 'ip'},
|
|
87
87
|
}
|
|
88
88
|
|
|
89
|
-
def __init__(self, database=None, vector_length=3, distance_metric='l2', max_retries=3, retry_base_delay=0.1):
|
|
89
|
+
def __init__(self, database=None, vector_length=3, distance_metric='l2', max_retries=3, retry_base_delay=0.1,
|
|
90
|
+
hnsw_ef_construction=128, hnsw_ef_search=64, hnsw_m=16, auto_index=True):
|
|
90
91
|
if distance_metric not in self.DISTANCE_METRICS:
|
|
91
92
|
raise ValueError(
|
|
92
93
|
f"Invalid distance_metric '{distance_metric}'. "
|
|
@@ -97,9 +98,13 @@ class GraphMemory:
|
|
|
97
98
|
self.distance_metric = distance_metric
|
|
98
99
|
self.max_retries = max_retries
|
|
99
100
|
self.retry_base_delay = retry_base_delay
|
|
101
|
+
self.hnsw_ef_construction = hnsw_ef_construction
|
|
102
|
+
self.hnsw_ef_search = hnsw_ef_search
|
|
103
|
+
self.hnsw_m = hnsw_m
|
|
100
104
|
self._lock = threading.RLock()
|
|
101
105
|
self._fts_initialized = False
|
|
102
106
|
self._fts_dirty = True
|
|
107
|
+
self._hnsw_indexed = False
|
|
103
108
|
self._closed = False
|
|
104
109
|
self.conn = duckdb.connect(database=self.database)
|
|
105
110
|
self._load_vss_extension()
|
|
@@ -116,6 +121,9 @@ class GraphMemory:
|
|
|
116
121
|
self._create_tables()
|
|
117
122
|
logger.info("Tables created or verified successfully.")
|
|
118
123
|
|
|
124
|
+
if auto_index:
|
|
125
|
+
self._ensure_hnsw_index()
|
|
126
|
+
|
|
119
127
|
def cursor(self):
|
|
120
128
|
"""Return a new DuckDB cursor for individual operations.
|
|
121
129
|
|
|
@@ -150,6 +158,8 @@ class GraphMemory:
|
|
|
150
158
|
self._configure_database()
|
|
151
159
|
self._fts_initialized = False
|
|
152
160
|
self._fts_dirty = True
|
|
161
|
+
self._hnsw_indexed = False
|
|
162
|
+
self._ensure_hnsw_index()
|
|
153
163
|
logger.info("Reconnection successful.")
|
|
154
164
|
|
|
155
165
|
def close(self):
|
|
@@ -191,6 +201,8 @@ class GraphMemory:
|
|
|
191
201
|
|
|
192
202
|
def set_vector_length(self, vector_length):
|
|
193
203
|
self.vector_length = vector_length
|
|
204
|
+
self._hnsw_indexed = False
|
|
205
|
+
self._ensure_hnsw_index()
|
|
194
206
|
logger.info(f"Vector length set to: {self.vector_length}")
|
|
195
207
|
|
|
196
208
|
def _create_tables(self):
|
|
@@ -303,6 +315,7 @@ class GraphMemory:
|
|
|
303
315
|
cur.execute(
|
|
304
316
|
"DELETE FROM nodes WHERE id = ?;", (str(node_id),))
|
|
305
317
|
self._fts_dirty = True
|
|
318
|
+
self.compact_index()
|
|
306
319
|
except duckdb.Error as e:
|
|
307
320
|
logger.error(f"Error deleting node: {e}")
|
|
308
321
|
|
|
@@ -321,6 +334,7 @@ class GraphMemory:
|
|
|
321
334
|
cur.execute(
|
|
322
335
|
f"DELETE FROM nodes WHERE id IN ({placeholders});", id_strs)
|
|
323
336
|
self._fts_dirty = True
|
|
337
|
+
self.compact_index()
|
|
324
338
|
except duckdb.Error as e:
|
|
325
339
|
logger.error(f"Error during bulk delete nodes: {e}")
|
|
326
340
|
|
|
@@ -920,15 +934,53 @@ class GraphMemory:
|
|
|
920
934
|
logger.error(f"Error updating edge: {e}")
|
|
921
935
|
return False
|
|
922
936
|
|
|
937
|
+
def _ensure_hnsw_index(self):
|
|
938
|
+
"""Create HNSW index if not already present. Called automatically on init."""
|
|
939
|
+
if self._hnsw_indexed:
|
|
940
|
+
return
|
|
941
|
+
try:
|
|
942
|
+
nodes_exist = self.conn.execute(
|
|
943
|
+
"SELECT 1 FROM information_schema.tables WHERE table_name = 'nodes';"
|
|
944
|
+
).fetchone()
|
|
945
|
+
if nodes_exist:
|
|
946
|
+
self.create_index()
|
|
947
|
+
except duckdb.Error:
|
|
948
|
+
pass
|
|
949
|
+
|
|
923
950
|
@with_retry()
|
|
924
|
-
def create_index(self):
|
|
951
|
+
def create_index(self, ef_construction: int | None = None, ef_search: int | None = None, m: int | None = None):
|
|
952
|
+
"""Create or recreate the HNSW vector index.
|
|
953
|
+
|
|
954
|
+
Args:
|
|
955
|
+
ef_construction: Candidate vertices during build (default from init).
|
|
956
|
+
ef_search: Candidate vertices during search (default from init).
|
|
957
|
+
m: Max neighbors per vertex (default from init).
|
|
958
|
+
"""
|
|
959
|
+
ef_c = ef_construction or self.hnsw_ef_construction
|
|
960
|
+
ef_s = ef_search or self.hnsw_ef_search
|
|
961
|
+
m_val = m or self.hnsw_m
|
|
962
|
+
hnsw_metric = self.DISTANCE_METRICS[self.distance_metric]['hnsw_metric']
|
|
925
963
|
with self._lock:
|
|
926
964
|
try:
|
|
927
|
-
|
|
965
|
+
# Drop existing index first to allow metric/param changes
|
|
966
|
+
self.conn.execute("DROP INDEX IF EXISTS vss_idx;")
|
|
928
967
|
self.conn.execute(
|
|
929
|
-
f"CREATE INDEX
|
|
968
|
+
f"CREATE INDEX vss_idx ON nodes USING HNSW(vector) "
|
|
969
|
+
f"WITH (metric = '{hnsw_metric}', ef_construction = {ef_c}, ef_search = {ef_s}, M = {m_val});"
|
|
970
|
+
)
|
|
971
|
+
self._hnsw_indexed = True
|
|
972
|
+
logger.info(f"HNSW index created (metric={hnsw_metric}, ef_construction={ef_c}, ef_search={ef_s}, M={m_val}).")
|
|
973
|
+
except duckdb.Error as e:
|
|
974
|
+
logger.error(f"Error creating HNSW index: {e}")
|
|
975
|
+
|
|
976
|
+
def compact_index(self):
|
|
977
|
+
"""Compact the HNSW index to reclaim space after deletions."""
|
|
978
|
+
with self._lock:
|
|
979
|
+
try:
|
|
980
|
+
self.conn.execute("PRAGMA hnsw_compact_index('vss_idx');")
|
|
981
|
+
logger.info("HNSW index compacted.")
|
|
930
982
|
except duckdb.Error as e:
|
|
931
|
-
logger.error(f"Error
|
|
983
|
+
logger.error(f"Error compacting HNSW index: {e}")
|
|
932
984
|
|
|
933
985
|
@with_retry()
|
|
934
986
|
def nearest_nodes(self, vector: list[float], limit: int) -> list[NearestNode]:
|
|
@@ -1334,9 +1386,10 @@ class GraphMemory:
|
|
|
1334
1386
|
|
|
1335
1387
|
# Collect vector similarity results
|
|
1336
1388
|
vss_results = {}
|
|
1389
|
+
dist_func = self.DISTANCE_METRICS[self.distance_metric]['function']
|
|
1337
1390
|
vss_query = f"""
|
|
1338
1391
|
SELECT id, type, properties, vector,
|
|
1339
|
-
|
|
1392
|
+
{dist_func}(vector, CAST(? AS FLOAT[{self.vector_length}])) AS distance
|
|
1340
1393
|
FROM nodes
|
|
1341
1394
|
WHERE vector IS NOT NULL
|
|
1342
1395
|
ORDER BY distance;
|
|
@@ -10,7 +10,8 @@ Requires the ``dspy`` optional dependency:
|
|
|
10
10
|
from __future__ import annotations
|
|
11
11
|
|
|
12
12
|
import logging
|
|
13
|
-
from
|
|
13
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
14
|
+
from typing import TYPE_CHECKING, Any, Callable
|
|
14
15
|
|
|
15
16
|
from pydantic import BaseModel, Field
|
|
16
17
|
|
|
@@ -296,3 +297,124 @@ def extract_and_merge(
|
|
|
296
297
|
len(edge_results),
|
|
297
298
|
)
|
|
298
299
|
return node_results, edge_results
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
# ---------------------------------------------------------------------------
|
|
303
|
+
# Parallel extraction
|
|
304
|
+
# ---------------------------------------------------------------------------
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _extract_nodes_chunk(chunk: str) -> list[Node]:
|
|
308
|
+
"""Extract nodes from a single chunk (thread-safe, no DB access)."""
|
|
309
|
+
return extract_nodes(chunk, sentences=[chunk])
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _extract_edges_chunk(chunk: str, nodes: list[Node]) -> list[Edge]:
|
|
313
|
+
"""Extract edges from a single chunk given known nodes (thread-safe)."""
|
|
314
|
+
return extract_edges(chunk, nodes, sentences=[chunk])
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def extract_and_merge_parallel(
|
|
318
|
+
graph: GraphMemory,
|
|
319
|
+
chunks: list[str],
|
|
320
|
+
match_keys: list[str] | None = None,
|
|
321
|
+
match_type: bool = True,
|
|
322
|
+
strategy: MergeStrategy = MergeStrategy.UPDATE,
|
|
323
|
+
similarity_threshold: float = 1.0,
|
|
324
|
+
vector_threshold: float | None = None,
|
|
325
|
+
max_workers: int = 8,
|
|
326
|
+
on_progress: Callable[[str, int, int], None] | None = None,
|
|
327
|
+
) -> tuple[list[MergeResult], list[EdgeMergeResult]]:
|
|
328
|
+
"""Extract from multiple text chunks in parallel, then merge sequentially.
|
|
329
|
+
|
|
330
|
+
Runs in two parallel phases to maximize LLM throughput:
|
|
331
|
+
1. Node extraction — all chunks concurrently (saturate RPM)
|
|
332
|
+
2. Edge extraction — all chunks concurrently (with all extracted nodes as context)
|
|
333
|
+
Then merges into DB sequentially.
|
|
334
|
+
|
|
335
|
+
Args:
|
|
336
|
+
graph: A :class:`~graphmemory.database.GraphMemory` instance.
|
|
337
|
+
chunks: List of text chunks to process.
|
|
338
|
+
match_keys: Property names to match nodes on (default ``["name"]``).
|
|
339
|
+
match_type: Also require ``node.type`` to match.
|
|
340
|
+
strategy: How to merge properties on match.
|
|
341
|
+
similarity_threshold: Jaro-Winkler threshold for fuzzy matching.
|
|
342
|
+
vector_threshold: Max cosine distance for vector similarity.
|
|
343
|
+
max_workers: Max concurrent LLM calls (match your RPM headroom).
|
|
344
|
+
on_progress: Optional callback ``(phase, completed, total)``.
|
|
345
|
+
|
|
346
|
+
Returns:
|
|
347
|
+
Aggregated ``(node_results, edge_results)`` across all chunks.
|
|
348
|
+
"""
|
|
349
|
+
if match_keys is None:
|
|
350
|
+
match_keys = ["name"]
|
|
351
|
+
|
|
352
|
+
total = len(chunks)
|
|
353
|
+
|
|
354
|
+
# Phase 1: Extract nodes from ALL chunks in parallel
|
|
355
|
+
chunk_nodes: dict[int, list[Node]] = {}
|
|
356
|
+
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
|
357
|
+
future_to_idx = {
|
|
358
|
+
pool.submit(_extract_nodes_chunk, chunk): i
|
|
359
|
+
for i, chunk in enumerate(chunks)
|
|
360
|
+
}
|
|
361
|
+
done = 0
|
|
362
|
+
for future in as_completed(future_to_idx):
|
|
363
|
+
idx = future_to_idx[future]
|
|
364
|
+
try:
|
|
365
|
+
chunk_nodes[idx] = future.result()
|
|
366
|
+
except Exception as e:
|
|
367
|
+
logger.warning("Node extraction failed for chunk %d: %s", idx + 1, e)
|
|
368
|
+
chunk_nodes[idx] = []
|
|
369
|
+
done += 1
|
|
370
|
+
if on_progress:
|
|
371
|
+
on_progress("nodes", done, total)
|
|
372
|
+
|
|
373
|
+
# Merge all nodes into DB sequentially to build the full node set
|
|
374
|
+
all_node_results: list[MergeResult] = []
|
|
375
|
+
for idx in range(total):
|
|
376
|
+
nodes = chunk_nodes.get(idx, [])
|
|
377
|
+
if nodes:
|
|
378
|
+
results = graph.bulk_merge_nodes(
|
|
379
|
+
nodes, match_keys=match_keys, match_type=match_type,
|
|
380
|
+
strategy=strategy, similarity_threshold=similarity_threshold,
|
|
381
|
+
vector_threshold=vector_threshold,
|
|
382
|
+
)
|
|
383
|
+
all_node_results.extend(results)
|
|
384
|
+
|
|
385
|
+
# Build complete node list for edge extraction context
|
|
386
|
+
all_nodes = [r.node for r in all_node_results]
|
|
387
|
+
logger.info("Phase 1 complete: %d nodes extracted and merged.", len(all_nodes))
|
|
388
|
+
|
|
389
|
+
# Phase 2: Extract edges from ALL chunks in parallel (with full node context)
|
|
390
|
+
chunk_edges: dict[int, list[Edge]] = {}
|
|
391
|
+
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
|
392
|
+
future_to_idx = {
|
|
393
|
+
pool.submit(_extract_edges_chunk, chunk, all_nodes): i
|
|
394
|
+
for i, chunk in enumerate(chunks)
|
|
395
|
+
}
|
|
396
|
+
done = 0
|
|
397
|
+
for future in as_completed(future_to_idx):
|
|
398
|
+
idx = future_to_idx[future]
|
|
399
|
+
try:
|
|
400
|
+
chunk_edges[idx] = future.result()
|
|
401
|
+
except Exception as e:
|
|
402
|
+
logger.warning("Edge extraction failed for chunk %d: %s", idx + 1, e)
|
|
403
|
+
chunk_edges[idx] = []
|
|
404
|
+
done += 1
|
|
405
|
+
if on_progress:
|
|
406
|
+
on_progress("edges", done, total)
|
|
407
|
+
|
|
408
|
+
# Merge all edges into DB sequentially
|
|
409
|
+
all_edge_results: list[EdgeMergeResult] = []
|
|
410
|
+
for idx in range(total):
|
|
411
|
+
edges = chunk_edges.get(idx, [])
|
|
412
|
+
if edges:
|
|
413
|
+
results = graph.bulk_merge_edges(edges)
|
|
414
|
+
all_edge_results.extend(results)
|
|
415
|
+
|
|
416
|
+
logger.info(
|
|
417
|
+
"Parallel extraction complete: %d chunks, %d nodes, %d edges.",
|
|
418
|
+
total, len(all_node_results), len(all_edge_results),
|
|
419
|
+
)
|
|
420
|
+
return all_node_results, all_edge_results
|
|
@@ -2729,5 +2729,147 @@ class TestFuzzyMatching(unittest.TestCase):
|
|
|
2729
2729
|
self.assertEqual(remaining, 2)
|
|
2730
2730
|
|
|
2731
2731
|
|
|
2732
|
+
class TestHNSWIndex(unittest.TestCase):
|
|
2733
|
+
|
|
2734
|
+
def test_auto_index_on_init(self):
|
|
2735
|
+
db = GraphMemory(database=':memory:', vector_length=3)
|
|
2736
|
+
self.assertTrue(db._hnsw_indexed)
|
|
2737
|
+
db.close()
|
|
2738
|
+
|
|
2739
|
+
def test_auto_index_disabled(self):
|
|
2740
|
+
db = GraphMemory(database=':memory:', vector_length=3, auto_index=False)
|
|
2741
|
+
self.assertFalse(db._hnsw_indexed)
|
|
2742
|
+
db.close()
|
|
2743
|
+
|
|
2744
|
+
def test_create_index_with_custom_params(self):
|
|
2745
|
+
db = GraphMemory(database=':memory:', vector_length=3, auto_index=False)
|
|
2746
|
+
db.create_index(ef_construction=64, ef_search=32, m=8)
|
|
2747
|
+
self.assertTrue(db._hnsw_indexed)
|
|
2748
|
+
db.close()
|
|
2749
|
+
|
|
2750
|
+
def test_create_index_uses_configured_metric(self):
|
|
2751
|
+
for metric in ['l2', 'cosine', 'inner_product']:
|
|
2752
|
+
db = GraphMemory(database=':memory:', vector_length=3, distance_metric=metric)
|
|
2753
|
+
self.assertTrue(db._hnsw_indexed)
|
|
2754
|
+
db.close()
|
|
2755
|
+
|
|
2756
|
+
def test_create_index_idempotent_recreate(self):
|
|
2757
|
+
db = GraphMemory(database=':memory:', vector_length=3)
|
|
2758
|
+
db.create_index()
|
|
2759
|
+
db.create_index()
|
|
2760
|
+
self.assertTrue(db._hnsw_indexed)
|
|
2761
|
+
db.close()
|
|
2762
|
+
|
|
2763
|
+
def test_set_vector_length_rebuilds_index(self):
|
|
2764
|
+
db = GraphMemory(database=':memory:', vector_length=3)
|
|
2765
|
+
self.assertTrue(db._hnsw_indexed)
|
|
2766
|
+
db._hnsw_indexed = False
|
|
2767
|
+
db.set_vector_length(5)
|
|
2768
|
+
self.assertTrue(db._hnsw_indexed)
|
|
2769
|
+
self.assertEqual(db.vector_length, 5)
|
|
2770
|
+
db.close()
|
|
2771
|
+
|
|
2772
|
+
def test_compact_index_no_error(self):
|
|
2773
|
+
db = GraphMemory(database=':memory:', vector_length=3)
|
|
2774
|
+
node = Node(type="Test", properties={"name": "A"}, vector=[1.0, 0.0, 0.0])
|
|
2775
|
+
db.insert_node(node)
|
|
2776
|
+
db.delete_node(node.id)
|
|
2777
|
+
db.compact_index()
|
|
2778
|
+
db.close()
|
|
2779
|
+
|
|
2780
|
+
def test_reconnect_rebuilds_index(self):
|
|
2781
|
+
import tempfile
|
|
2782
|
+
path = tempfile.mktemp(suffix='.db')
|
|
2783
|
+
try:
|
|
2784
|
+
db = GraphMemory(database=path, vector_length=3)
|
|
2785
|
+
db._hnsw_indexed = False
|
|
2786
|
+
db._reconnect()
|
|
2787
|
+
self.assertTrue(db._hnsw_indexed)
|
|
2788
|
+
db.close()
|
|
2789
|
+
finally:
|
|
2790
|
+
if os.path.exists(path):
|
|
2791
|
+
os.unlink(path)
|
|
2792
|
+
|
|
2793
|
+
def test_hnsw_params_stored(self):
|
|
2794
|
+
db = GraphMemory(database=':memory:', vector_length=3,
|
|
2795
|
+
hnsw_ef_construction=256, hnsw_ef_search=128, hnsw_m=32)
|
|
2796
|
+
self.assertEqual(db.hnsw_ef_construction, 256)
|
|
2797
|
+
self.assertEqual(db.hnsw_ef_search, 128)
|
|
2798
|
+
self.assertEqual(db.hnsw_m, 32)
|
|
2799
|
+
db.close()
|
|
2800
|
+
|
|
2801
|
+
|
|
2802
|
+
class TestHybridSearchMetric(unittest.TestCase):
|
|
2803
|
+
|
|
2804
|
+
def setUp(self):
|
|
2805
|
+
self.db = GraphMemory(database=':memory:', vector_length=3, distance_metric='cosine')
|
|
2806
|
+
self.db.insert_node(Node(type="Doc", properties={"text": "machine learning"}, vector=[1.0, 0.0, 0.0]))
|
|
2807
|
+
self.db.insert_node(Node(type="Doc", properties={"text": "deep learning"}, vector=[0.9, 0.1, 0.0]))
|
|
2808
|
+
self.db.insert_node(Node(type="Doc", properties={"text": "cooking recipes"}, vector=[0.0, 0.0, 1.0]))
|
|
2809
|
+
|
|
2810
|
+
def tearDown(self):
|
|
2811
|
+
self.db.close()
|
|
2812
|
+
|
|
2813
|
+
def test_hybrid_search_uses_cosine_metric(self):
|
|
2814
|
+
results = self.db.hybrid_search(
|
|
2815
|
+
query_text="learning",
|
|
2816
|
+
query_vector=[1.0, 0.0, 0.0],
|
|
2817
|
+
limit=3
|
|
2818
|
+
)
|
|
2819
|
+
self.assertGreater(len(results), 0)
|
|
2820
|
+
# The learning docs should score higher than cooking
|
|
2821
|
+
names = [r.node.properties.get("text") for r in results]
|
|
2822
|
+
self.assertIn("machine learning", names[:2])
|
|
2823
|
+
|
|
2824
|
+
def test_hybrid_search_inner_product(self):
|
|
2825
|
+
db = GraphMemory(database=':memory:', vector_length=3, distance_metric='inner_product')
|
|
2826
|
+
db.insert_node(Node(type="Doc", properties={"text": "similar"}, vector=[1.0, 0.0, 0.0]))
|
|
2827
|
+
db.insert_node(Node(type="Doc", properties={"text": "different"}, vector=[0.0, 0.0, 1.0]))
|
|
2828
|
+
results = db.hybrid_search(
|
|
2829
|
+
query_text="similar",
|
|
2830
|
+
query_vector=[1.0, 0.0, 0.0],
|
|
2831
|
+
limit=2
|
|
2832
|
+
)
|
|
2833
|
+
self.assertGreater(len(results), 0)
|
|
2834
|
+
db.close()
|
|
2835
|
+
|
|
2836
|
+
def test_hybrid_search_l2_metric(self):
|
|
2837
|
+
db = GraphMemory(database=':memory:', vector_length=3, distance_metric='l2')
|
|
2838
|
+
db.insert_node(Node(type="Doc", properties={"text": "near"}, vector=[0.1, 0.0, 0.0]))
|
|
2839
|
+
db.insert_node(Node(type="Doc", properties={"text": "far"}, vector=[9.0, 9.0, 9.0]))
|
|
2840
|
+
results = db.hybrid_search(
|
|
2841
|
+
query_text="near",
|
|
2842
|
+
query_vector=[0.0, 0.0, 0.0],
|
|
2843
|
+
limit=2
|
|
2844
|
+
)
|
|
2845
|
+
self.assertGreater(len(results), 0)
|
|
2846
|
+
self.assertEqual(results[0].node.properties["text"], "near")
|
|
2847
|
+
db.close()
|
|
2848
|
+
|
|
2849
|
+
|
|
2850
|
+
class TestCompactAfterDelete(unittest.TestCase):
    """Single and bulk node deletion must succeed and leave no nodes behind."""

    def setUp(self):
        # Fresh in-memory database with 3-dimensional vectors for every test.
        self.db = GraphMemory(database=':memory:', vector_length=3)

    def tearDown(self):
        self.db.close()

    def test_delete_node_compacts(self):
        victim = Node(type="Test", properties={"name": "A"}, vector=[1.0, 0.0, 0.0])
        self.db.insert_node(victim)

        # Must not raise. NOTE(review): the original note says compact_index
        # is called internally by delete_node; that code is not in this view.
        self.db.delete_node(victim.id)

        remaining = self.db.nodes_to_json()
        self.assertEqual(len(remaining), 0)

    def test_bulk_delete_nodes_compacts(self):
        victims = [
            Node(type="Test", properties={"name": "A"}, vector=[1.0, 0.0, 0.0]),
            Node(type="Test", properties={"name": "B"}, vector=[0.0, 1.0, 0.0]),
        ]
        for victim in victims:
            self.db.insert_node(victim)

        # Ids are read only after insertion, as in a sequential insert/delete.
        self.db.bulk_delete_nodes([victim.id for victim in victims])

        remaining = self.db.nodes_to_json()
        self.assertEqual(len(remaining), 0)
|
|
2872
|
+
|
|
2873
|
+
|
|
2732
2874
|
# Run the whole suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
|
@@ -1,147 +0,0 @@
|
|
|
1
|
-
"""End-to-end test: ingest aimav4.txt using real LLM extraction via DSPy."""
|
|
2
|
-
|
|
3
|
-
import sys
|
|
4
|
-
import os
|
|
5
|
-
import re
|
|
6
|
-
import logging
|
|
7
|
-
|
|
8
|
-
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
|
9
|
-
|
|
10
|
-
import dspy
|
|
11
|
-
from graphmemory import GraphMemory, MergeStrategy
|
|
12
|
-
from graphmemory.extraction import extract_and_merge
|
|
13
|
-
|
|
14
|
-
# Emit INFO-and-above records formatted as "<LEVEL> <logger name>: <message>".
logging.basicConfig(
    format="%(levelname)s %(name)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- Configure DSPy with gpt-5-nano ---
# Build the language-model handle once at import time and hand it to DSPy.
lm = dspy.LM("openai/gpt-5-nano")
dspy.configure(lm=lm)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def chunk_text(text: str, max_chars: int = 3000) -> list[str]:
    """Split text into paragraph-aware chunks.

    Paragraphs are the pieces of *text* separated by a blank line. Each one
    is stripped of surrounding whitespace and empty ones are dropped.
    Consecutive paragraphs are packed greedily into a chunk, re-joined with
    a blank line, for as long as the joined chunk stays within *max_chars*.

    Args:
        text: The raw text to split.
        max_chars: Upper bound on the length of a joined chunk, counting the
            blank-line separators between its paragraphs.

    Returns:
        The chunks in document order. A paragraph is never cut, so a single
        paragraph longer than *max_chars* becomes a chunk of its own that
        exceeds the limit. Empty or whitespace-only input yields ``[]``.
    """
    separator = "\n\n"
    paragraphs = [p.strip() for p in text.split(separator) if p.strip()]
    chunks = []
    current = []
    current_len = 0
    for p in paragraphs:
        # Length the chunk would have after joining, separator included;
        # the first paragraph of a chunk needs no separator in front of it.
        projected = current_len + len(separator) + len(p) if current else len(p)
        if current and projected > max_chars:
            chunks.append(separator.join(current))
            current = []
            projected = len(p)
        current.append(p)
        current_len = projected
    if current:
        chunks.append(separator.join(current))
    return chunks
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def main():
    """Ingest the start of aimav4.txt into an in-memory graph and print a report.

    Steps: read up to 100,000 characters of the input file, strip HTML
    comments, split the text into chunks, run ``extract_and_merge`` once per
    chunk, run ``resolve_duplicates`` over the resulting graph, then print
    the nodes, edges, a sample full-text search and summary counters.

    A failure while processing one chunk is logged as a warning and the
    remaining chunks are still processed.
    """
    input_path = os.path.join(os.path.dirname(__file__), "..", "input", "aimav4.txt")
    # Explicit encoding so decoding does not depend on the platform locale.
    # NOTE(review): assumes the corpus file is UTF-8 — confirm.
    with open(input_path, encoding="utf-8") as f:
        text = f.read(100_000)

    # DOTALL lets "." cross newlines so multi-line HTML comments are removed too.
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    chunks = chunk_text(text, max_chars=4000)

    print("=" * 60)
    print("GraphMemory — Real LLM Extraction Test")
    print("=" * 60)
    print(f"Source: aimav4.txt ({len(text)} chars)")
    print(f"Chunks: {len(chunks)}")
    print("LLM: gpt-5-nano via DSPy")

    db = GraphMemory(database=":memory:", vector_length=3)

    print("\n--- Extracting entities & relationships ---")
    total_nodes = 0
    total_edges = 0
    total_merged_nodes = 0
    total_merged_edges = 0

    for chunk_no, chunk in enumerate(chunks, start=1):
        print(f"\n Chunk {chunk_no}/{len(chunks)} ({len(chunk)} chars)...")
        try:
            # Pass each chunk as a single "sentence" to avoid per-sentence LLM calls
            node_results, edge_results = extract_and_merge(
                db,
                chunk,
                match_keys=["name"],
                match_type=True,
                similarity_threshold=0.88,
                sentences=[chunk],  # single LLM call per chunk
            )
            created_n = sum(1 for r in node_results if r.created)
            merged_n = sum(1 for r in node_results if not r.created)
            created_e = sum(1 for r in edge_results if r.created)
            merged_e = sum(1 for r in edge_results if not r.created)

            total_nodes += created_n
            total_merged_nodes += merged_n
            total_edges += created_e
            total_merged_edges += merged_e

            print(f" Nodes: {created_n} new, {merged_n} merged")
            print(f" Edges: {created_e} new, {merged_e} merged")
        except Exception as e:
            # Deliberate best-effort: one failing chunk must not abort the run.
            logger.warning(" Chunk %d failed: %s", chunk_no, e)

    # --- Post-extraction dedupe ---
    print("\n--- Post-extraction duplicate resolution ---")
    clusters = db.resolve_duplicates(
        match_keys=["name"],
        match_type=True,
        similarity_threshold=0.90,
    )
    if clusters:
        for c in clusters:
            merged_names = [m.properties.get("name", "?") for m in c.merged]
            print(f" Merged: '{c.survivor.properties.get('name')}' <- {merged_names}")
    else:
        print(" No additional duplicates found.")

    # --- Results ---
    all_nodes = db.nodes_to_json()
    all_edges = db.edges_to_json()

    print("\n--- Final Graph ---")
    print(f" Nodes: {len(all_nodes)}")
    print(f" Edges: {len(all_edges)}")

    # Count nodes per type, keeping first-seen order for the printed dict.
    type_counts = {}
    for n in all_nodes:
        t = n.get("type", "Unknown")
        type_counts[t] = type_counts.get(t, 0) + 1
    print(f" Types: {type_counts}")

    print("\n--- Extracted Entities ---")
    for n in sorted(all_nodes, key=lambda x: (x.get("type", ""), x.get("properties", {}).get("name", ""))):
        props = n.get("properties", {})
        # Fall back to the whole property dict when a node has no name.
        print(f" [{n.get('type', '?'):15}] {props.get('name', props)}")

    print("\n--- Extracted Relationships ---")
    node_id_map = {n["id"]: n for n in all_nodes}
    for e in all_edges:
        # Show endpoint names where known, otherwise the raw node ids.
        src = node_id_map.get(e["source_id"], {}).get("properties", {}).get("name", e["source_id"])
        tgt = node_id_map.get(e["target_id"], {}).get("properties", {}).get("name", e["target_id"])
        print(f" {src} --[{e['relation']}]--> {tgt}")

    print("\n--- Full-text search: 'deep learning' ---")
    results = db.search_nodes("deep learning", limit=5)
    for sr in results:
        print(f" [{sr.node.type}] {sr.node.properties.get('name', '?')} (score={sr.score:.3f})")

    print("\n--- Summary ---")
    print(f" Extracted: {total_nodes} nodes, {total_edges} edges")
    print(f" Fuzzy-merged during ingest: {total_merged_nodes} nodes, {total_merged_edges} edges")
    print(f" Post-dedupe clusters: {len(clusters)}")
    print(f" Final graph: {len(all_nodes)} nodes, {len(all_edges)} edges")

    print("\n" + "=" * 60)
    print("Done!")
    print("=" * 60)
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
# Run the ingest demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|