PyPI - graphmemory - Versions diffs - 1.2.0.tar.gz → 1.2.1.tar.gz - Mend

graphmemory 1.2.0.tar.gz → 1.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

{graphmemory-1.2.0 → graphmemory-1.2.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: graphmemory
-Version: 1.2.0
+Version: 1.2.1
 Summary: Graph-based memory system using DuckDB
 Project-URL: Homepage, https://github.com/bradAGI/GraphMemory
 Project-URL: Repository, https://github.com/bradAGI/GraphMemory

graphmemory-1.2.1/examples/test_ingest.py ADDED Viewed

@@ -0,0 +1,152 @@
+"""End-to-end test: ingest aimav4.txt using parallel LLM extraction via DSPy."""
+import sys
+import os
+import re
+import time
+import logging
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+import dspy
+from graphmemory import GraphMemory, MergeStrategy
+from graphmemory.extraction import extract_and_merge_parallel
+logging.basicConfig(level=logging.WARNING, format="%(levelname)s %(name)s: %(message)s")
+logger = logging.getLogger(__name__)
+# --- Configure DSPy with gpt-5-nano (10k RPM, 10M TPM) ---
+lm = dspy.LM("openai/gpt-5-nano")
+dspy.configure(lm=lm)
+# With 10k RPM we can safely run 50+ concurrent requests
+MAX_WORKERS = 50
+def chunk_text(text: str, max_chars: int = 4000) -> list[str]:
+    """Split text into paragraph-aware chunks."""
+    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
+    chunks = []
+    current = []
+    current_len = 0
+    for p in paragraphs:
+        if current_len + len(p) > max_chars and current:
+            chunks.append("\n\n".join(current))
+            current = []
+            current_len = 0
+        current.append(p)
+        current_len += len(p)
+    if current:
+        chunks.append("\n\n".join(current))
+    return chunks
+def on_progress(phase, done, total):
+    bar_len = 30
+    filled = int(bar_len * done / total)
+    bar = "█" * filled + "░" * (bar_len - filled)
+    print(f"\r  {phase:5s} [{bar}] {done}/{total}", end="", flush=True)
+    if done == total:
+        print()
+def main():
+    input_path = os.path.join(os.path.dirname(__file__), "..", "input", "aimav4.txt")
+    with open(input_path) as f:
+        text = f.read(200_000)
+    text = re.sub(r"<!--.*?-->", "", text)
+    chunks = chunk_text(text, max_chars=4000)
+    print("=" * 60)
+    print("GraphMemory — Parallel LLM Extraction")
+    print("=" * 60)
+    print(f"Source: aimav4.txt ({len(text):,} chars)")
+    print(f"Chunks: {len(chunks)} x ~4k chars")
+    print(f"Workers: {MAX_WORKERS} concurrent LLM calls")
+    print(f"LLM: gpt-5-nano via DSPy")
+    db = GraphMemory(database=":memory:", vector_length=3)
+    print(f"\n--- Phase 1: Node extraction (parallel) ---")
+    print(f"--- Phase 2: Edge extraction (parallel) ---")
+    t0 = time.time()
+    node_results, edge_results = extract_and_merge_parallel(
+        db,
+        chunks,
+        match_keys=["name"],
+        match_type=True,
+        similarity_threshold=0.88,
+        max_workers=MAX_WORKERS,
+        on_progress=on_progress,
+    )
+    elapsed = time.time() - t0
+    created_n = sum(1 for r in node_results if r.created)
+    merged_n = sum(1 for r in node_results if not r.created)
+    created_e = sum(1 for r in edge_results if r.created)
+    merged_e = sum(1 for r in edge_results if not r.created)
+    print(f"\n  Done in {elapsed:.1f}s ({len(chunks) * 2} LLM calls)")
+    print(f"  Nodes: {created_n} new, {merged_n} fuzzy-merged")
+    print(f"  Edges: {created_e} new, {merged_e} deduped")
+    # --- Post-extraction dedupe ---
+    print(f"\n--- Post-extraction duplicate resolution ---")
+    t1 = time.time()
+    clusters = db.resolve_duplicates(
+        match_keys=["name"],
+        match_type=True,
+        similarity_threshold=0.90,
+    )
+    print(f"  {len(clusters)} clusters resolved in {time.time() - t1:.1f}s")
+    for c in clusters[:10]:
+        merged_names = [m.properties.get("name", "?") for m in c.merged]
+        print(f"    '{c.survivor.properties.get('name')}' <- {merged_names}")
+    if len(clusters) > 10:
+        print(f"    ... and {len(clusters) - 10} more")
+    # --- Results ---
+    all_nodes = db.nodes_to_json()
+    all_edges = db.edges_to_json()
+    type_counts = {}
+    for n in all_nodes:
+        t = n.get("type", "Unknown")
+        type_counts[t] = type_counts.get(t, 0) + 1
+    print(f"\n--- Final Graph ---")
+    print(f"  Nodes: {len(all_nodes)}")
+    print(f"  Edges: {len(all_edges)}")
+    print(f"  Types: {dict(sorted(type_counts.items(), key=lambda x: -x[1]))}")
+    print(f"\n--- Sample Entities (first 30) ---")
+    sorted_nodes = sorted(all_nodes, key=lambda x: (x.get("type") or "", x.get("properties", {}).get("name") or ""))
+    for n in sorted_nodes[:30]:
+        props = n.get("properties", {})
+        print(f"  [{n.get('type', '?'):15}] {props.get('name', props)}")
+    if len(sorted_nodes) > 30:
+        print(f"  ... and {len(sorted_nodes) - 30} more")
+    print(f"\n--- Sample Relationships (first 20) ---")
+    node_id_map = {n["id"]: n for n in all_nodes}
+    for e in all_edges[:20]:
+        src = node_id_map.get(e["source_id"], {}).get("properties", {}).get("name", "?")
+        tgt = node_id_map.get(e["target_id"], {}).get("properties", {}).get("name", "?")
+        print(f"  {src} --[{e['relation']}]--> {tgt}")
+    if len(all_edges) > 20:
+        print(f"  ... and {len(all_edges) - 20} more")
+    print(f"\n--- Search: 'artificial intelligence' ---")
+    results = db.search_nodes("artificial intelligence", limit=5)
+    for sr in results:
+        print(f"  [{sr.node.type}] {sr.node.properties.get('name', '?')} (score={sr.score:.3f})")
+    print(f"\n{'=' * 60}")
+    print(f"{len(all_nodes)} nodes, {len(all_edges)} edges from {len(text):,} chars in {elapsed:.1f}s")
+    print(f"{'=' * 60}")
+if __name__ == "__main__":
+    main()

{graphmemory-1.2.0 → graphmemory-1.2.1}/graphmemory/extraction.py RENAMED Viewed

@@ -10,7 +10,8 @@ Requires the ``dspy`` optional dependency:
 from __future__ import annotations
 import logging
-from typing import TYPE_CHECKING, Any
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import TYPE_CHECKING, Any, Callable
 from pydantic import BaseModel, Field
@@ -296,3 +297,124 @@ def extract_and_merge(
         len(edge_results),
     )
     return node_results, edge_results
+# ---------------------------------------------------------------------------
+# Parallel extraction
+# ---------------------------------------------------------------------------
+def _extract_nodes_chunk(chunk: str) -> list[Node]:
+    """Extract nodes from a single chunk (thread-safe, no DB access)."""
+    return extract_nodes(chunk, sentences=[chunk])
+def _extract_edges_chunk(chunk: str, nodes: list[Node]) -> list[Edge]:
+    """Extract edges from a single chunk given known nodes (thread-safe)."""
+    return extract_edges(chunk, nodes, sentences=[chunk])
+def extract_and_merge_parallel(
+    graph: GraphMemory,
+    chunks: list[str],
+    match_keys: list[str] | None = None,
+    match_type: bool = True,
+    strategy: MergeStrategy = MergeStrategy.UPDATE,
+    similarity_threshold: float = 1.0,
+    vector_threshold: float | None = None,
+    max_workers: int = 8,
+    on_progress: Callable[[str, int, int], None] | None = None,
+) -> tuple[list[MergeResult], list[EdgeMergeResult]]:
+    """Extract from multiple text chunks in parallel, then merge sequentially.
+    Runs in two parallel phases to maximize LLM throughput:
+      1. Node extraction — all chunks concurrently (saturate RPM)
+      2. Edge extraction — all chunks concurrently (with all extracted nodes as context)
+    Then merges into DB sequentially.
+    Args:
+        graph: A :class:`~graphmemory.database.GraphMemory` instance.
+        chunks: List of text chunks to process.
+        match_keys: Property names to match nodes on (default ``["name"]``).
+        match_type: Also require ``node.type`` to match.
+        strategy: How to merge properties on match.
+        similarity_threshold: Jaro-Winkler threshold for fuzzy matching.
+        vector_threshold: Max cosine distance for vector similarity.
+        max_workers: Max concurrent LLM calls (match your RPM headroom).
+        on_progress: Optional callback ``(phase, completed, total)``.
+    Returns:
+        Aggregated ``(node_results, edge_results)`` across all chunks.
+    """
+    if match_keys is None:
+        match_keys = ["name"]
+    total = len(chunks)
+    # Phase 1: Extract nodes from ALL chunks in parallel
+    chunk_nodes: dict[int, list[Node]] = {}
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        future_to_idx = {
+            pool.submit(_extract_nodes_chunk, chunk): i
+            for i, chunk in enumerate(chunks)
+        }
+        done = 0
+        for future in as_completed(future_to_idx):
+            idx = future_to_idx[future]
+            try:
+                chunk_nodes[idx] = future.result()
+            except Exception as e:
+                logger.warning("Node extraction failed for chunk %d: %s", idx + 1, e)
+                chunk_nodes[idx] = []
+            done += 1
+            if on_progress:
+                on_progress("nodes", done, total)
+    # Merge all nodes into DB sequentially to build the full node set
+    all_node_results: list[MergeResult] = []
+    for idx in range(total):
+        nodes = chunk_nodes.get(idx, [])
+        if nodes:
+            results = graph.bulk_merge_nodes(
+                nodes, match_keys=match_keys, match_type=match_type,
+                strategy=strategy, similarity_threshold=similarity_threshold,
+                vector_threshold=vector_threshold,
+            )
+            all_node_results.extend(results)
+    # Build complete node list for edge extraction context
+    all_nodes = [r.node for r in all_node_results]
+    logger.info("Phase 1 complete: %d nodes extracted and merged.", len(all_nodes))
+    # Phase 2: Extract edges from ALL chunks in parallel (with full node context)
+    chunk_edges: dict[int, list[Edge]] = {}
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        future_to_idx = {
+            pool.submit(_extract_edges_chunk, chunk, all_nodes): i
+            for i, chunk in enumerate(chunks)
+        }
+        done = 0
+        for future in as_completed(future_to_idx):
+            idx = future_to_idx[future]
+            try:
+                chunk_edges[idx] = future.result()
+            except Exception as e:
+                logger.warning("Edge extraction failed for chunk %d: %s", idx + 1, e)
+                chunk_edges[idx] = []
+            done += 1
+            if on_progress:
+                on_progress("edges", done, total)
+    # Merge all edges into DB sequentially
+    all_edge_results: list[EdgeMergeResult] = []
+    for idx in range(total):
+        edges = chunk_edges.get(idx, [])
+        if edges:
+            results = graph.bulk_merge_edges(edges)
+            all_edge_results.extend(results)
+    logger.info(
+        "Parallel extraction complete: %d chunks, %d nodes, %d edges.",
+        total, len(all_node_results), len(all_edge_results),
+    )
+    return all_node_results, all_edge_results

{graphmemory-1.2.0 → graphmemory-1.2.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "graphmemory"
-version = "1.2.0"
+version = "1.2.1"
 description = "Graph-based memory system using DuckDB"
 readme = "README.md"
 license = "MIT"

graphmemory-1.2.0/examples/test_ingest.py DELETED Viewed

@@ -1,147 +0,0 @@
-"""End-to-end test: ingest aimav4.txt using real LLM extraction via DSPy."""
-import sys
-import os
-import re
-import logging
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
-import dspy
-from graphmemory import GraphMemory, MergeStrategy
-from graphmemory.extraction import extract_and_merge
-logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
-logger = logging.getLogger(__name__)
-# --- Configure DSPy with gpt-5-nano ---
-lm = dspy.LM("openai/gpt-5-nano")
-dspy.configure(lm=lm)
-def chunk_text(text: str, max_chars: int = 3000) -> list[str]:
-    """Split text into paragraph-aware chunks."""
-    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
-    chunks = []
-    current = []
-    current_len = 0
-    for p in paragraphs:
-        if current_len + len(p) > max_chars and current:
-            chunks.append("\n\n".join(current))
-            current = []
-            current_len = 0
-        current.append(p)
-        current_len += len(p)
-    if current:
-        chunks.append("\n\n".join(current))
-    return chunks
-def main():
-    input_path = os.path.join(os.path.dirname(__file__), "..", "input", "aimav4.txt")
-    with open(input_path) as f:
-        text = f.read(100_000)
-    text = re.sub(r"<!--.*?-->", "", text)
-    chunks = chunk_text(text, max_chars=4000)
-    print("=" * 60)
-    print("GraphMemory — Real LLM Extraction Test")
-    print("=" * 60)
-    print(f"Source: aimav4.txt ({len(text)} chars)")
-    print(f"Chunks: {len(chunks)}")
-    print(f"LLM: gpt-5-nano via DSPy")
-    db = GraphMemory(database=":memory:", vector_length=3)
-    print(f"\n--- Extracting entities & relationships ---")
-    total_nodes = 0
-    total_edges = 0
-    total_merged_nodes = 0
-    total_merged_edges = 0
-    for i, chunk in enumerate(chunks):
-        print(f"\n  Chunk {i + 1}/{len(chunks)} ({len(chunk)} chars)...")
-        try:
-            # Pass each chunk as a single "sentence" to avoid per-sentence LLM calls
-            node_results, edge_results = extract_and_merge(
-                db,
-                chunk,
-                match_keys=["name"],
-                match_type=True,
-                similarity_threshold=0.88,
-                sentences=[chunk],  # single LLM call per chunk
-            )
-            created_n = sum(1 for r in node_results if r.created)
-            merged_n = sum(1 for r in node_results if not r.created)
-            created_e = sum(1 for r in edge_results if r.created)
-            merged_e = sum(1 for r in edge_results if not r.created)
-            total_nodes += created_n
-            total_merged_nodes += merged_n
-            total_edges += created_e
-            total_merged_edges += merged_e
-            print(f"    Nodes: {created_n} new, {merged_n} merged")
-            print(f"    Edges: {created_e} new, {merged_e} merged")
-        except Exception as e:
-            logger.warning(f"  Chunk {i + 1} failed: {e}")
-    # --- Post-extraction dedupe ---
-    print(f"\n--- Post-extraction duplicate resolution ---")
-    clusters = db.resolve_duplicates(
-        match_keys=["name"],
-        match_type=True,
-        similarity_threshold=0.90,
-    )
-    if clusters:
-        for c in clusters:
-            merged_names = [m.properties.get("name", "?") for m in c.merged]
-            print(f"  Merged: '{c.survivor.properties.get('name')}' <- {merged_names}")
-    else:
-        print("  No additional duplicates found.")
-    # --- Results ---
-    all_nodes = db.nodes_to_json()
-    all_edges = db.edges_to_json()
-    print(f"\n--- Final Graph ---")
-    print(f"  Nodes: {len(all_nodes)}")
-    print(f"  Edges: {len(all_edges)}")
-    type_counts = {}
-    for n in all_nodes:
-        t = n.get("type", "Unknown")
-        type_counts[t] = type_counts.get(t, 0) + 1
-    print(f"  Types: {type_counts}")
-    print(f"\n--- Extracted Entities ---")
-    for n in sorted(all_nodes, key=lambda x: (x.get("type", ""), x.get("properties", {}).get("name", ""))):
-        props = n.get("properties", {})
-        print(f"  [{n.get('type', '?'):15}] {props.get('name', props)}")
-    print(f"\n--- Extracted Relationships ---")
-    node_id_map = {n["id"]: n for n in all_nodes}
-    for e in all_edges:
-        src = node_id_map.get(e["source_id"], {}).get("properties", {}).get("name", e["source_id"])
-        tgt = node_id_map.get(e["target_id"], {}).get("properties", {}).get("name", e["target_id"])
-        print(f"  {src} --[{e['relation']}]--> {tgt}")
-    print(f"\n--- Full-text search: 'deep learning' ---")
-    results = db.search_nodes("deep learning", limit=5)
-    for sr in results:
-        print(f"  [{sr.node.type}] {sr.node.properties.get('name', '?')} (score={sr.score:.3f})")
-    print(f"\n--- Summary ---")
-    print(f"  Extracted: {total_nodes} nodes, {total_edges} edges")
-    print(f"  Fuzzy-merged during ingest: {total_merged_nodes} nodes, {total_merged_edges} edges")
-    print(f"  Post-dedupe clusters: {len(clusters)}")
-    print(f"  Final graph: {len(all_nodes)} nodes, {len(all_edges)} edges")
-    print("\n" + "=" * 60)
-    print("Done!")
-    print("=" * 60)
-if __name__ == "__main__":
-    main()