PyPI - graphmemory - Versions diffs - 1.2.0__tar.gz → 1.3.0__tar.gz - Mend

graphmemory 1.2.0.tar.gz → 1.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

{graphmemory-1.2.0 → graphmemory-1.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: graphmemory
-Version: 1.2.0
+Version: 1.3.0
 Summary: Graph-based memory system using DuckDB
 Project-URL: Homepage, https://github.com/bradAGI/GraphMemory
 Project-URL: Repository, https://github.com/bradAGI/GraphMemory
@@ -224,7 +224,7 @@ All IDs are auto-generated UUIDs. All models are [Pydantic](https://docs.pydanti
 | Method | Description |
 |--------|-------------|
-| `GraphMemory(database=None, vector_length=3, distance_metric='l2')` | Initialize. `None` = in-memory. |
+| `GraphMemory(database=None, vector_length=3, distance_metric='l2', hnsw_ef_construction=128, hnsw_ef_search=64, hnsw_m=16, auto_index=True)` | Initialize. `None` = in-memory. HNSW index auto-created. |
 | `close()` | Close connection (thread-safe, idempotent). |
 | `transaction()` | Context manager for atomic operations. |
@@ -262,7 +262,8 @@ All IDs are auto-generated UUIDs. All models are [Pydantic](https://docs.pydanti
 | `nearest_nodes(vector, limit) -> list[NearestNode]` | Vector similarity search. |
 | `search_nodes(query_text, limit=10) -> list[SearchResult]` | Full-text BM25 search. |
 | `hybrid_search(query_text, query_vector, ...) -> list[SearchResult]` | Combined text + vector search. |
-| `create_index()` | Create HNSW index for faster vector search. |
+| `create_index(ef_construction=None, ef_search=None, m=None)` | Create/recreate HNSW index with tunable params. Auto-called on init. |
+| `compact_index()` | Compact HNSW index to reclaim space after deletions. |
 ### Retrieval
@@ -295,7 +296,7 @@ See `examples/` for complete usage:
 ## Testing
-265 tests covering all functionality.
+291 tests covering all functionality.
 ```sh
 python3 -m pytest tests/tests.py -v

{graphmemory-1.2.0 → graphmemory-1.3.0}/README.md RENAMED Viewed

@@ -197,7 +197,7 @@ All IDs are auto-generated UUIDs. All models are [Pydantic](https://docs.pydanti
 | Method | Description |
 |--------|-------------|
-| `GraphMemory(database=None, vector_length=3, distance_metric='l2')` | Initialize. `None` = in-memory. |
+| `GraphMemory(database=None, vector_length=3, distance_metric='l2', hnsw_ef_construction=128, hnsw_ef_search=64, hnsw_m=16, auto_index=True)` | Initialize. `None` = in-memory. HNSW index auto-created. |
 | `close()` | Close connection (thread-safe, idempotent). |
 | `transaction()` | Context manager for atomic operations. |
@@ -235,7 +235,8 @@ All IDs are auto-generated UUIDs. All models are [Pydantic](https://docs.pydanti
 | `nearest_nodes(vector, limit) -> list[NearestNode]` | Vector similarity search. |
 | `search_nodes(query_text, limit=10) -> list[SearchResult]` | Full-text BM25 search. |
 | `hybrid_search(query_text, query_vector, ...) -> list[SearchResult]` | Combined text + vector search. |
-| `create_index()` | Create HNSW index for faster vector search. |
+| `create_index(ef_construction=None, ef_search=None, m=None)` | Create/recreate HNSW index with tunable params. Auto-called on init. |
+| `compact_index()` | Compact HNSW index to reclaim space after deletions. |
 ### Retrieval
@@ -268,7 +269,7 @@ See `examples/` for complete usage:
 ## Testing
-265 tests covering all functionality.
+291 tests covering all functionality.
 ```sh
 python3 -m pytest tests/tests.py -v

graphmemory-1.3.0/examples/test_ingest.py ADDED Viewed

@@ -0,0 +1,152 @@
+"""End-to-end test: ingest aimav4.txt using parallel LLM extraction via DSPy."""
+import sys
+import os
+import re
+import time
+import logging
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+import dspy
+from graphmemory import GraphMemory, MergeStrategy
+from graphmemory.extraction import extract_and_merge_parallel
+logging.basicConfig(level=logging.WARNING, format="%(levelname)s %(name)s: %(message)s")
+logger = logging.getLogger(__name__)
+# --- Configure DSPy with gpt-5-nano (10k RPM, 10M TPM) ---
+lm = dspy.LM("openai/gpt-5-nano")
+dspy.configure(lm=lm)
+# With 10k RPM we can safely run 50+ concurrent requests
+MAX_WORKERS = 50
+def chunk_text(text: str, max_chars: int = 4000) -> list[str]:
+    """Split text into paragraph-aware chunks of at most ``max_chars`` characters.
+    Paragraphs are separated by blank lines, stripped, and never split: a single
+    paragraph longer than ``max_chars`` becomes its own oversized chunk.
+    Args:
+        text: Raw input text.
+        max_chars: Upper bound for the joined length of a multi-paragraph chunk.
+    Returns:
+        Chunks in document order; empty list when ``text`` has no non-blank paragraph.
+    """
+    separator = "\n\n"
+    chunks: list[str] = []
+    current: list[str] = []
+    current_len = 0
+    for paragraph in (p.strip() for p in text.split(separator)):
+        if not paragraph:
+            continue
+        # Length of the chunk once joined; the separator only sits between paragraphs.
+        projected = current_len + len(paragraph) + (len(separator) if current else 0)
+        if current and projected > max_chars:
+            chunks.append(separator.join(current))
+            current = []
+            projected = len(paragraph)
+        current.append(paragraph)
+        current_len = projected
+    if current:
+        chunks.append(separator.join(current))
+    return chunks
+def on_progress(phase, done, total):
+    """Redraw a single-line console progress bar for one extraction phase.
+    Args:
+        phase: Short phase label, padded to five characters.
+        done: Number of completed items.
+        total: Number of items in the phase; ``0`` or less renders a full bar.
+    """
+    bar_len = 30
+    # total == 0 used to raise ZeroDivisionError; treat "nothing to do" as complete.
+    filled = int(bar_len * done / total) if total > 0 else bar_len
+    # Clamp so an over- or under-count cannot produce a bar of the wrong width.
+    filled = max(0, min(bar_len, filled))
+    bar = "█" * filled + "░" * (bar_len - filled)
+    print(f"\r  {phase:5s} [{bar}] {done}/{total}", end="", flush=True)
+    if done == total:
+        print()
+def main():
+    """Ingest the first 200,000 characters of ``input/aimav4.txt`` and print a report.
+    Strips HTML comments, chunks the text, runs parallel node/edge extraction into an
+    in-memory graph, resolves duplicates, then prints statistics, samples and a search demo.
+    """
+    input_path = os.path.join(os.path.dirname(__file__), "..", "input", "aimav4.txt")
+    # Explicit encoding: the platform default is not UTF-8 everywhere (assumes a UTF-8 input file).
+    with open(input_path, encoding="utf-8") as f:
+        text = f.read(200_000)
+    # DOTALL so HTML comments spanning several lines are stripped as well.
+    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
+    chunks = chunk_text(text, max_chars=4000)
+    print("=" * 60)
+    print("GraphMemory — Parallel LLM Extraction")
+    print("=" * 60)
+    print(f"Source: aimav4.txt ({len(text):,} chars)")
+    print(f"Chunks: {len(chunks)} x ~4k chars")
+    print(f"Workers: {MAX_WORKERS} concurrent LLM calls")
+    print(f"LLM: gpt-5-nano via DSPy")
+    db = GraphMemory(database=":memory:", vector_length=3)
+    print(f"\n--- Phase 1: Node extraction (parallel) ---")
+    print(f"--- Phase 2: Edge extraction (parallel) ---")
+    t0 = time.time()
+    node_results, edge_results = extract_and_merge_parallel(
+        db,
+        chunks,
+        match_keys=["name"],
+        match_type=True,
+        similarity_threshold=0.88,
+        max_workers=MAX_WORKERS,
+        on_progress=on_progress,
+    )
+    elapsed = time.time() - t0
+    created_n = sum(1 for r in node_results if r.created)
+    merged_n = sum(1 for r in node_results if not r.created)
+    created_e = sum(1 for r in edge_results if r.created)
+    merged_e = sum(1 for r in edge_results if not r.created)
+    # Two LLM calls per chunk: one for nodes, one for edges.
+    print(f"\n  Done in {elapsed:.1f}s ({len(chunks) * 2} LLM calls)")
+    print(f"  Nodes: {created_n} new, {merged_n} fuzzy-merged")
+    print(f"  Edges: {created_e} new, {merged_e} deduped")
+    # --- Post-extraction dedupe ---
+    print(f"\n--- Post-extraction duplicate resolution ---")
+    t1 = time.time()
+    clusters = db.resolve_duplicates(
+        match_keys=["name"],
+        match_type=True,
+        similarity_threshold=0.90,
+    )
+    print(f"  {len(clusters)} clusters resolved in {time.time() - t1:.1f}s")
+    for c in clusters[:10]:
+        merged_names = [m.properties.get("name", "?") for m in c.merged]
+        print(f"    '{c.survivor.properties.get('name')}' <- {merged_names}")
+    if len(clusters) > 10:
+        print(f"    ... and {len(clusters) - 10} more")
+    # --- Results ---
+    all_nodes = db.nodes_to_json()
+    all_edges = db.edges_to_json()
+    type_counts = {}
+    for n in all_nodes:
+        t = n.get("type", "Unknown")
+        type_counts[t] = type_counts.get(t, 0) + 1
+    print(f"\n--- Final Graph ---")
+    print(f"  Nodes: {len(all_nodes)}")
+    print(f"  Edges: {len(all_edges)}")
+    # Most frequent node types first.
+    print(f"  Types: {dict(sorted(type_counts.items(), key=lambda x: -x[1]))}")
+    print(f"\n--- Sample Entities (first 30) ---")
+    # "or ''" keeps the sort key comparable when type or name is None.
+    sorted_nodes = sorted(all_nodes, key=lambda x: (x.get("type") or "", x.get("properties", {}).get("name") or ""))
+    for n in sorted_nodes[:30]:
+        props = n.get("properties", {})
+        print(f"  [{n.get('type', '?'):15}] {props.get('name', props)}")
+    if len(sorted_nodes) > 30:
+        print(f"  ... and {len(sorted_nodes) - 30} more")
+    print(f"\n--- Sample Relationships (first 20) ---")
+    node_id_map = {n["id"]: n for n in all_nodes}
+    for e in all_edges[:20]:
+        src = node_id_map.get(e["source_id"], {}).get("properties", {}).get("name", "?")
+        tgt = node_id_map.get(e["target_id"], {}).get("properties", {}).get("name", "?")
+        print(f"  {src} --[{e['relation']}]--> {tgt}")
+    if len(all_edges) > 20:
+        print(f"  ... and {len(all_edges) - 20} more")
+    print(f"\n--- Search: 'artificial intelligence' ---")
+    results = db.search_nodes("artificial intelligence", limit=5)
+    for sr in results:
+        print(f"  [{sr.node.type}] {sr.node.properties.get('name', '?')} (score={sr.score:.3f})")
+    print(f"\n{'=' * 60}")
+    print(f"{len(all_nodes)} nodes, {len(all_edges)} edges from {len(text):,} chars in {elapsed:.1f}s")
+    print(f"{'=' * 60}")
+if __name__ == "__main__":
+    main()

{graphmemory-1.2.0 → graphmemory-1.3.0}/graphmemory/database.py RENAMED Viewed

@@ -86,7 +86,8 @@ class GraphMemory:
         'inner_product': {'function': 'array_negative_inner_product', 'hnsw_metric': 'ip'},
     }
-    def __init__(self, database=None, vector_length=3, distance_metric='l2', max_retries=3, retry_base_delay=0.1):
+    def __init__(self, database=None, vector_length=3, distance_metric='l2', max_retries=3, retry_base_delay=0.1,
+                 hnsw_ef_construction=128, hnsw_ef_search=64, hnsw_m=16, auto_index=True):
         if distance_metric not in self.DISTANCE_METRICS:
             raise ValueError(
                 f"Invalid distance_metric '{distance_metric}'. "
@@ -97,9 +98,13 @@ class GraphMemory:
         self.distance_metric = distance_metric
         self.max_retries = max_retries
         self.retry_base_delay = retry_base_delay
+        self.hnsw_ef_construction = hnsw_ef_construction
+        self.hnsw_ef_search = hnsw_ef_search
+        self.hnsw_m = hnsw_m
         self._lock = threading.RLock()
         self._fts_initialized = False
         self._fts_dirty = True
+        self._hnsw_indexed = False
         self._closed = False
         self.conn = duckdb.connect(database=self.database)
         self._load_vss_extension()
@@ -116,6 +121,9 @@ class GraphMemory:
             self._create_tables()
             logger.info("Tables created or verified successfully.")
+        if auto_index:
+            self._ensure_hnsw_index()
     def cursor(self):
         """Return a new DuckDB cursor for individual operations.
@@ -150,6 +158,8 @@ class GraphMemory:
             self._configure_database()
             self._fts_initialized = False
             self._fts_dirty = True
+            self._hnsw_indexed = False
+            self._ensure_hnsw_index()
             logger.info("Reconnection successful.")
     def close(self):
@@ -191,6 +201,8 @@ class GraphMemory:
     def set_vector_length(self, vector_length):
         self.vector_length = vector_length
+        self._hnsw_indexed = False
+        self._ensure_hnsw_index()
         logger.info(f"Vector length set to: {self.vector_length}")
     def _create_tables(self):
@@ -303,6 +315,7 @@ class GraphMemory:
                 cur.execute(
                     "DELETE FROM nodes WHERE id = ?;", (str(node_id),))
                 self._fts_dirty = True
+            self.compact_index()
         except duckdb.Error as e:
             logger.error(f"Error deleting node: {e}")
@@ -321,6 +334,7 @@ class GraphMemory:
                 cur.execute(
                     f"DELETE FROM nodes WHERE id IN ({placeholders});", id_strs)
                 self._fts_dirty = True
+            self.compact_index()
         except duckdb.Error as e:
             logger.error(f"Error during bulk delete nodes: {e}")
@@ -920,15 +934,53 @@ class GraphMemory:
             logger.error(f"Error updating edge: {e}")
             return False
+    def _ensure_hnsw_index(self):
+        """Create the HNSW index unless ``_hnsw_indexed`` is already set.
+        Does nothing when the ``nodes`` table does not exist yet. Best-effort: a DuckDB
+        error is logged and swallowed so init, reconnect and ``set_vector_length`` are
+        not aborted by an index problem.
+        """
+        if self._hnsw_indexed:
+            return
+        try:
+            nodes_exist = self.conn.execute(
+                "SELECT 1 FROM information_schema.tables WHERE table_name = 'nodes';"
+            ).fetchone()
+            if nodes_exist:
+                self.create_index()
+        except duckdb.Error as e:
+            # Still non-fatal, but no longer silent: a missing index should be diagnosable.
+            logger.warning(f"Could not ensure HNSW index: {e}")
     @with_retry()
-    def create_index(self):
+    def create_index(self, ef_construction: int | None = None, ef_search: int | None = None, m: int | None = None):
+        """Create or recreate the HNSW vector index on ``nodes.vector``.
+        Any existing ``vss_idx`` is dropped first so metric or parameter changes take effect.
+        DuckDB errors are logged, not raised. ``_hnsw_indexed`` is cleared once the old index
+        is dropped and set again only after the new one has been created.
+        Args:
+            ef_construction: Candidate vertices during build (default from init).
+            ef_search: Candidate vertices during search (default from init).
+            m: Max neighbors per vertex (default from init).
+        """
+        # Falsy arguments (None or 0) fall back to the values stored at init.
+        ef_c = ef_construction or self.hnsw_ef_construction
+        ef_s = ef_search or self.hnsw_ef_search
+        m_val = m or self.hnsw_m
+        hnsw_metric = self.DISTANCE_METRICS[self.distance_metric]['hnsw_metric']
         with self._lock:
             try:
-                hnsw_metric = self.DISTANCE_METRICS[self.distance_metric]['hnsw_metric']
+                # Drop existing index first to allow metric/param changes
+                self.conn.execute("DROP INDEX IF EXISTS vss_idx;")
+                # The old index is gone: clear the flag so a failed CREATE below does not
+                # leave it stale and make _ensure_hnsw_index() skip re-creation later.
+                self._hnsw_indexed = False
                 self.conn.execute(
-                    f"CREATE INDEX IF NOT EXISTS vss_idx ON nodes USING HNSW(vector) WITH (metric = '{hnsw_metric}');")
+                    f"CREATE INDEX vss_idx ON nodes USING HNSW(vector) "
+                    f"WITH (metric = '{hnsw_metric}', ef_construction = {ef_c}, ef_search = {ef_s}, M = {m_val});"
+                )
+                self._hnsw_indexed = True
+                logger.info(f"HNSW index created (metric={hnsw_metric}, ef_construction={ef_c}, ef_search={ef_s}, M={m_val}).")
+            except duckdb.Error as e:
+                logger.error(f"Error creating HNSW index: {e}")
+    def compact_index(self):
+        """Compact the HNSW index to reclaim space after deletions.
+        Runs the ``hnsw_compact_index`` pragma on ``vss_idx`` while holding ``self._lock``.
+        A ``duckdb.Error`` is logged and swallowed, so this method does not raise it.
+        """
+        # NOTE(review): delete_node / bulk_delete_nodes call this after every delete; when no
+        # vss_idx exists (e.g. auto_index=False) the pragma presumably fails and logs an error - confirm.
+        with self._lock:
+            try:
+                self.conn.execute("PRAGMA hnsw_compact_index('vss_idx');")
+                logger.info("HNSW index compacted.")
             except duckdb.Error as e:
-                logger.error(f"Error creating index: {e}")
+                logger.error(f"Error compacting HNSW index: {e}")
     @with_retry()
     def nearest_nodes(self, vector: list[float], limit: int) -> list[NearestNode]:
@@ -1334,9 +1386,10 @@ class GraphMemory:
             # Collect vector similarity results
             vss_results = {}
+            dist_func = self.DISTANCE_METRICS[self.distance_metric]['function']
             vss_query = f"""
             SELECT id, type, properties, vector,
-                   array_distance(vector, CAST(? AS FLOAT[{self.vector_length}])) AS distance
+                   {dist_func}(vector, CAST(? AS FLOAT[{self.vector_length}])) AS distance
             FROM nodes
             WHERE vector IS NOT NULL
             ORDER BY distance;

{graphmemory-1.2.0 → graphmemory-1.3.0}/graphmemory/extraction.py RENAMED Viewed

@@ -10,7 +10,8 @@ Requires the ``dspy`` optional dependency:
 from __future__ import annotations
 import logging
-from typing import TYPE_CHECKING, Any
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import TYPE_CHECKING, Any, Callable
 from pydantic import BaseModel, Field
@@ -296,3 +297,124 @@ def extract_and_merge(
         len(edge_results),
     )
     return node_results, edge_results
+# ---------------------------------------------------------------------------
+# Parallel extraction
+# ---------------------------------------------------------------------------
+def _extract_nodes_chunk(chunk: str) -> list[Node]:
+    """Run node extraction on one chunk, passing the whole chunk as its only sentence."""
+    single_sentence = [chunk]
+    return extract_nodes(chunk, sentences=single_sentence)
+def _extract_edges_chunk(chunk: str, nodes: list[Node]) -> list[Edge]:
+    """Run edge extraction on one chunk against ``nodes``, passing the chunk as its only sentence."""
+    single_sentence = [chunk]
+    return extract_edges(chunk, nodes, sentences=single_sentence)
+def _run_extraction_phase(worker, chunks, extra_args, phase, label, max_workers, on_progress):
+    """Run ``worker(chunk, *extra_args)`` for every chunk on a thread pool.
+    Returns one result list per chunk, in chunk order. A chunk whose worker raises is
+    logged as a warning and contributes an empty list. ``on_progress(phase, done, total)``
+    is called after each chunk completes, in completion order.
+    """
+    total = len(chunks)
+    per_chunk: list[list] = [[] for _ in range(total)]
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        future_to_idx = {
+            pool.submit(worker, chunk, *extra_args): i
+            for i, chunk in enumerate(chunks)
+        }
+        for done, future in enumerate(as_completed(future_to_idx), start=1):
+            idx = future_to_idx[future]
+            try:
+                per_chunk[idx] = future.result()
+            except Exception as e:
+                # One failed chunk must not abort the whole batch.
+                logger.warning("%s extraction failed for chunk %d: %s", label, idx + 1, e)
+            if on_progress:
+                on_progress(phase, done, total)
+    return per_chunk
+def extract_and_merge_parallel(
+    graph: GraphMemory,
+    chunks: list[str],
+    match_keys: list[str] | None = None,
+    match_type: bool = True,
+    strategy: MergeStrategy = MergeStrategy.UPDATE,
+    similarity_threshold: float = 1.0,
+    vector_threshold: float | None = None,
+    max_workers: int = 8,
+    on_progress: Callable[[str, int, int], None] | None = None,
+) -> tuple[list[MergeResult], list[EdgeMergeResult]]:
+    """Extract from multiple text chunks in parallel, then merge sequentially.
+    Runs in two parallel phases to maximize LLM throughput:
+      1. Node extraction — all chunks concurrently, then merged into the graph in chunk order.
+      2. Edge extraction — all chunks concurrently, each given every merged node as context,
+         then merged into the graph in chunk order.
+    A chunk whose extraction raises is logged and skipped; the remaining chunks still run.
+    Args:
+        graph: A :class:`~graphmemory.database.GraphMemory` instance.
+        chunks: List of text chunks to process.
+        match_keys: Property names to match nodes on (default ``["name"]``).
+        match_type: Also require ``node.type`` to match.
+        strategy: How to merge properties on match.
+        similarity_threshold: Jaro-Winkler threshold for fuzzy matching.
+        vector_threshold: Max cosine distance for vector similarity.
+        max_workers: Max concurrent LLM calls (match your RPM headroom).
+        on_progress: Optional callback ``(phase, completed, total)``; ``phase`` is
+            ``"nodes"`` or ``"edges"``.
+    Returns:
+        Aggregated ``(node_results, edge_results)`` across all chunks.
+    """
+    if match_keys is None:
+        match_keys = ["name"]
+    total = len(chunks)
+    # Phase 1: Extract nodes from ALL chunks in parallel
+    chunk_nodes = _run_extraction_phase(
+        _extract_nodes_chunk, chunks, (), "nodes", "Node", max_workers, on_progress,
+    )
+    # Merge all nodes into DB sequentially to build the full node set
+    all_node_results: list[MergeResult] = []
+    for nodes in chunk_nodes:
+        if nodes:
+            all_node_results.extend(graph.bulk_merge_nodes(
+                nodes, match_keys=match_keys, match_type=match_type,
+                strategy=strategy, similarity_threshold=similarity_threshold,
+                vector_threshold=vector_threshold,
+            ))
+    # Build complete node list for edge extraction context
+    all_nodes = [r.node for r in all_node_results]
+    logger.info("Phase 1 complete: %d nodes extracted and merged.", len(all_nodes))
+    # Phase 2: Extract edges from ALL chunks in parallel (with full node context)
+    chunk_edges = _run_extraction_phase(
+        _extract_edges_chunk, chunks, (all_nodes,), "edges", "Edge", max_workers, on_progress,
+    )
+    # Merge all edges into DB sequentially
+    all_edge_results: list[EdgeMergeResult] = []
+    for edges in chunk_edges:
+        if edges:
+            all_edge_results.extend(graph.bulk_merge_edges(edges))
+    logger.info(
+        "Parallel extraction complete: %d chunks, %d nodes, %d edges.",
+        total, len(all_node_results), len(all_edge_results),
+    )
+    return all_node_results, all_edge_results

{graphmemory-1.2.0 → graphmemory-1.3.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "graphmemory"
-version = "1.2.0"
+version = "1.3.0"
 description = "Graph-based memory system using DuckDB"
 readme = "README.md"
 license = "MIT"

{graphmemory-1.2.0 → graphmemory-1.3.0}/tests/tests.py RENAMED Viewed

@@ -2729,5 +2729,147 @@ class TestFuzzyMatching(unittest.TestCase):
         self.assertEqual(remaining, 2)
+class TestHNSWIndex(unittest.TestCase):
+    """HNSW index lifecycle: auto-creation, custom parameters, rebuilds and compaction."""
+    def _open(self, **kwargs):
+        """Open a GraphMemory (in-memory unless ``database`` is given) that is closed even when the test fails."""
+        kwargs.setdefault('database', ':memory:')
+        kwargs.setdefault('vector_length', 3)
+        db = GraphMemory(**kwargs)
+        self.addCleanup(db.close)
+        return db
+    def test_auto_index_on_init(self):
+        db = self._open()
+        self.assertTrue(db._hnsw_indexed)
+    def test_auto_index_disabled(self):
+        db = self._open(auto_index=False)
+        self.assertFalse(db._hnsw_indexed)
+    def test_create_index_with_custom_params(self):
+        db = self._open(auto_index=False)
+        db.create_index(ef_construction=64, ef_search=32, m=8)
+        self.assertTrue(db._hnsw_indexed)
+    def test_create_index_uses_configured_metric(self):
+        for metric in ['l2', 'cosine', 'inner_product']:
+            db = self._open(distance_metric=metric)
+            self.assertTrue(db._hnsw_indexed)
+    def test_create_index_idempotent_recreate(self):
+        db = self._open()
+        db.create_index()
+        db.create_index()
+        self.assertTrue(db._hnsw_indexed)
+    def test_set_vector_length_rebuilds_index(self):
+        db = self._open()
+        self.assertTrue(db._hnsw_indexed)
+        db._hnsw_indexed = False
+        db.set_vector_length(5)
+        self.assertTrue(db._hnsw_indexed)
+        self.assertEqual(db.vector_length, 5)
+    def test_compact_index_no_error(self):
+        db = self._open()
+        node = Node(type="Test", properties={"name": "A"}, vector=[1.0, 0.0, 0.0])
+        db.insert_node(node)
+        db.delete_node(node.id)
+        db.compact_index()
+    def test_reconnect_rebuilds_index(self):
+        import tempfile
+        # tempfile.mktemp() is deprecated and racy; a private temporary directory still gives
+        # DuckDB a path that does not exist yet, and removes any side files on exit.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = os.path.join(tmpdir, 'graph.db')
+            db = GraphMemory(database=path, vector_length=3)
+            try:
+                db._hnsw_indexed = False
+                db._reconnect()
+                self.assertTrue(db._hnsw_indexed)
+            finally:
+                # Close before the directory is removed (required on Windows).
+                db.close()
+    def test_hnsw_params_stored(self):
+        db = self._open(hnsw_ef_construction=256, hnsw_ef_search=128, hnsw_m=32)
+        self.assertEqual(db.hnsw_ef_construction, 256)
+        self.assertEqual(db.hnsw_ef_search, 128)
+        self.assertEqual(db.hnsw_m, 32)
+class TestHybridSearchMetric(unittest.TestCase):
+    """hybrid_search must work under each configured distance metric."""
+    def setUp(self):
+        self.db = GraphMemory(database=':memory:', vector_length=3, distance_metric='cosine')
+        self.db.insert_node(Node(type="Doc", properties={"text": "machine learning"}, vector=[1.0, 0.0, 0.0]))
+        self.db.insert_node(Node(type="Doc", properties={"text": "deep learning"}, vector=[0.9, 0.1, 0.0]))
+        self.db.insert_node(Node(type="Doc", properties={"text": "cooking recipes"}, vector=[0.0, 0.0, 1.0]))
+    def tearDown(self):
+        self.db.close()
+    def test_hybrid_search_uses_cosine_metric(self):
+        results = self.db.hybrid_search(
+            query_text="learning",
+            query_vector=[1.0, 0.0, 0.0],
+            limit=3
+        )
+        self.assertGreater(len(results), 0)
+        # The learning docs should score higher than cooking
+        names = [r.node.properties.get("text") for r in results]
+        self.assertIn("machine learning", names[:2])
+    def test_hybrid_search_inner_product(self):
+        db = GraphMemory(database=':memory:', vector_length=3, distance_metric='inner_product')
+        # addCleanup closes the connection even when an assertion below fails
+        self.addCleanup(db.close)
+        db.insert_node(Node(type="Doc", properties={"text": "similar"}, vector=[1.0, 0.0, 0.0]))
+        db.insert_node(Node(type="Doc", properties={"text": "different"}, vector=[0.0, 0.0, 1.0]))
+        results = db.hybrid_search(
+            query_text="similar",
+            query_vector=[1.0, 0.0, 0.0],
+            limit=2
+        )
+        self.assertGreater(len(results), 0)
+    def test_hybrid_search_l2_metric(self):
+        db = GraphMemory(database=':memory:', vector_length=3, distance_metric='l2')
+        # addCleanup closes the connection even when an assertion below fails
+        self.addCleanup(db.close)
+        db.insert_node(Node(type="Doc", properties={"text": "near"}, vector=[0.1, 0.0, 0.0]))
+        db.insert_node(Node(type="Doc", properties={"text": "far"}, vector=[9.0, 9.0, 9.0]))
+        results = db.hybrid_search(
+            query_text="near",
+            query_vector=[0.0, 0.0, 0.0],
+            limit=2
+        )
+        self.assertGreater(len(results), 0)
+        self.assertEqual(results[0].node.properties["text"], "near")
+class TestCompactAfterDelete(unittest.TestCase):
+    """Deleting nodes, which compacts the HNSW index internally, must leave an empty graph."""
+    def setUp(self):
+        self.db = GraphMemory(database=':memory:', vector_length=3)
+    def tearDown(self):
+        self.db.close()
+    def test_delete_node_compacts(self):
+        victim = Node(type="Test", properties={"name": "A"}, vector=[1.0, 0.0, 0.0])
+        self.db.insert_node(victim)
+        # delete_node runs compact_index internally; it must not raise
+        self.db.delete_node(victim.id)
+        remaining = self.db.nodes_to_json()
+        self.assertEqual(len(remaining), 0)
+    def test_bulk_delete_nodes_compacts(self):
+        specs = [("A", [1.0, 0.0, 0.0]), ("B", [0.0, 1.0, 0.0])]
+        victims = [Node(type="Test", properties={"name": label}, vector=vec) for label, vec in specs]
+        for victim in victims:
+            self.db.insert_node(victim)
+        self.db.bulk_delete_nodes([victim.id for victim in victims])
+        remaining = self.db.nodes_to_json()
+        self.assertEqual(len(remaining), 0)
 if __name__ == '__main__':
     unittest.main()

graphmemory-1.2.0/examples/test_ingest.py DELETED Viewed

@@ -1,147 +0,0 @@
-"""End-to-end test: ingest aimav4.txt using real LLM extraction via DSPy."""
-import sys
-import os
-import re
-import logging
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
-import dspy
-from graphmemory import GraphMemory, MergeStrategy
-from graphmemory.extraction import extract_and_merge
-logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
-logger = logging.getLogger(__name__)
-# --- Configure DSPy with gpt-5-nano ---
-lm = dspy.LM("openai/gpt-5-nano")
-dspy.configure(lm=lm)
-def chunk_text(text: str, max_chars: int = 3000) -> list[str]:
-    """Split text into paragraph-aware chunks."""
-    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
-    chunks = []
-    current = []
-    current_len = 0
-    for p in paragraphs:
-        if current_len + len(p) > max_chars and current:
-            chunks.append("\n\n".join(current))
-            current = []
-            current_len = 0
-        current.append(p)
-        current_len += len(p)
-    if current:
-        chunks.append("\n\n".join(current))
-    return chunks
-def main():
-    input_path = os.path.join(os.path.dirname(__file__), "..", "input", "aimav4.txt")
-    with open(input_path) as f:
-        text = f.read(100_000)
-    text = re.sub(r"<!--.*?-->", "", text)
-    chunks = chunk_text(text, max_chars=4000)
-    print("=" * 60)
-    print("GraphMemory — Real LLM Extraction Test")
-    print("=" * 60)
-    print(f"Source: aimav4.txt ({len(text)} chars)")
-    print(f"Chunks: {len(chunks)}")
-    print(f"LLM: gpt-5-nano via DSPy")
-    db = GraphMemory(database=":memory:", vector_length=3)
-    print(f"\n--- Extracting entities & relationships ---")
-    total_nodes = 0
-    total_edges = 0
-    total_merged_nodes = 0
-    total_merged_edges = 0
-    for i, chunk in enumerate(chunks):
-        print(f"\n  Chunk {i + 1}/{len(chunks)} ({len(chunk)} chars)...")
-        try:
-            # Pass each chunk as a single "sentence" to avoid per-sentence LLM calls
-            node_results, edge_results = extract_and_merge(
-                db,
-                chunk,
-                match_keys=["name"],
-                match_type=True,
-                similarity_threshold=0.88,
-                sentences=[chunk],  # single LLM call per chunk
-            )
-            created_n = sum(1 for r in node_results if r.created)
-            merged_n = sum(1 for r in node_results if not r.created)
-            created_e = sum(1 for r in edge_results if r.created)
-            merged_e = sum(1 for r in edge_results if not r.created)
-            total_nodes += created_n
-            total_merged_nodes += merged_n
-            total_edges += created_e
-            total_merged_edges += merged_e
-            print(f"    Nodes: {created_n} new, {merged_n} merged")
-            print(f"    Edges: {created_e} new, {merged_e} merged")
-        except Exception as e:
-            logger.warning(f"  Chunk {i + 1} failed: {e}")
-    # --- Post-extraction dedupe ---
-    print(f"\n--- Post-extraction duplicate resolution ---")
-    clusters = db.resolve_duplicates(
-        match_keys=["name"],
-        match_type=True,
-        similarity_threshold=0.90,
-    )
-    if clusters:
-        for c in clusters:
-            merged_names = [m.properties.get("name", "?") for m in c.merged]
-            print(f"  Merged: '{c.survivor.properties.get('name')}' <- {merged_names}")
-    else:
-        print("  No additional duplicates found.")
-    # --- Results ---
-    all_nodes = db.nodes_to_json()
-    all_edges = db.edges_to_json()
-    print(f"\n--- Final Graph ---")
-    print(f"  Nodes: {len(all_nodes)}")
-    print(f"  Edges: {len(all_edges)}")
-    type_counts = {}
-    for n in all_nodes:
-        t = n.get("type", "Unknown")
-        type_counts[t] = type_counts.get(t, 0) + 1
-    print(f"  Types: {type_counts}")
-    print(f"\n--- Extracted Entities ---")
-    for n in sorted(all_nodes, key=lambda x: (x.get("type", ""), x.get("properties", {}).get("name", ""))):
-        props = n.get("properties", {})
-        print(f"  [{n.get('type', '?'):15}] {props.get('name', props)}")
-    print(f"\n--- Extracted Relationships ---")
-    node_id_map = {n["id"]: n for n in all_nodes}
-    for e in all_edges:
-        src = node_id_map.get(e["source_id"], {}).get("properties", {}).get("name", e["source_id"])
-        tgt = node_id_map.get(e["target_id"], {}).get("properties", {}).get("name", e["target_id"])
-        print(f"  {src} --[{e['relation']}]--> {tgt}")
-    print(f"\n--- Full-text search: 'deep learning' ---")
-    results = db.search_nodes("deep learning", limit=5)
-    for sr in results:
-        print(f"  [{sr.node.type}] {sr.node.properties.get('name', '?')} (score={sr.score:.3f})")
-    print(f"\n--- Summary ---")
-    print(f"  Extracted: {total_nodes} nodes, {total_edges} edges")
-    print(f"  Fuzzy-merged during ingest: {total_merged_nodes} nodes, {total_merged_edges} edges")
-    print(f"  Post-dedupe clusters: {len(clusters)}")
-    print(f"  Final graph: {len(all_nodes)} nodes, {len(all_edges)} edges")
-    print("\n" + "=" * 60)
-    print("Done!")
-    print("=" * 60)
-if __name__ == "__main__":
-    main()