PyPI - graphmemory - Versions diffs - 1.1.2__tar.gz → 1.2.0__tar.gz - Mend

graphmemory 1.1.2tar.gz → 1.2.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

{graphmemory-1.1.2 → graphmemory-1.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: graphmemory
-Version: 1.1.2
+Version: 1.2.0
 Summary: Graph-based memory system using DuckDB
 Project-URL: Homepage, https://github.com/bradAGI/GraphMemory
 Project-URL: Repository, https://github.com/bradAGI/GraphMemory
@@ -27,7 +27,9 @@ Description-Content-Type: text/markdown
 [![](https://dcbadge.limes.pink/api/server/https://discord.gg/DSS3DmStV8)](https://discord.gg/DSS3DmStV8)
-# GraphMemory
+# GraphMemory - GraphRAG Database
+![GraphMemory](https://github.com/bradAGI/GraphMemory/assets/46579244/9897dc2a-46c9-42e0-a8d3-2dcb1d93e6ae)
 An embedded graph database for RAG and knowledge graph applications, powered by [DuckDB](https://duckdb.org/). Vector similarity search, full-text search, hybrid search, merge/upsert, graph traversal, and a full GraphRAG retrieval pipeline — all in a single Python package.

{graphmemory-1.1.2 → graphmemory-1.2.0}/README.md RENAMED Viewed

@@ -1,6 +1,8 @@
 [![](https://dcbadge.limes.pink/api/server/https://discord.gg/DSS3DmStV8)](https://discord.gg/DSS3DmStV8)
-# GraphMemory
+# GraphMemory - GraphRAG Database
+![GraphMemory](https://github.com/bradAGI/GraphMemory/assets/46579244/9897dc2a-46c9-42e0-a8d3-2dcb1d93e6ae)
 An embedded graph database for RAG and knowledge graph applications, powered by [DuckDB](https://duckdb.org/). Vector similarity search, full-text search, hybrid search, merge/upsert, graph traversal, and a full GraphRAG retrieval pipeline — all in a single Python package.

graphmemory-1.2.0/examples/test_ingest.py ADDED Viewed

@@ -0,0 +1,147 @@
+"""End-to-end test: ingest aimav4.txt using real LLM extraction via DSPy."""
+import sys
+import os
+import re
+import logging
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+import dspy
+from graphmemory import GraphMemory, MergeStrategy
+from graphmemory.extraction import extract_and_merge
+# Emit INFO-and-above records for the whole script as "LEVEL logger-name: message".
+logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
+logger = logging.getLogger(__name__)
+# --- Configure DSPy with gpt-5-nano ---
+# NOTE: this runs at import time, so merely importing this module configures DSPy.
+# NOTE(review): the "openai/..." model id presumably needs OpenAI credentials in
+# the environment -- confirm before running.
+lm = dspy.LM("openai/gpt-5-nano")
+dspy.configure(lm=lm)
+def chunk_text(text: str, max_chars: int = 3000) -> list[str]:
+    """Split text into paragraph-aware chunks.
+    Paragraphs are the non-empty, stripped pieces between blank lines
+    (``"\n\n"``). Consecutive paragraphs are packed into one chunk, re-joined
+    with ``"\n\n"``, for as long as the joined chunk stays within
+    ``max_chars``; the separator length is included in that budget.
+    Paragraphs are never split, so a single paragraph longer than
+    ``max_chars`` is returned as its own oversized chunk.
+    Args:
+        text: Text to split.
+        max_chars: Maximum length of a joined chunk, in characters.
+    Returns:
+        The chunks in document order; an empty list when ``text`` contains no
+        non-blank paragraph.
+    """
+    sep = "\n\n"
+    paragraphs = [p.strip() for p in text.split(sep) if p.strip()]
+    chunks = []
+    current = []
+    current_len = 0  # length of sep.join(current)
+    for p in paragraphs:
+        # Cost of appending p: its own length plus a separator when it is not
+        # the first paragraph of the chunk.
+        added = len(p) + (len(sep) if current else 0)
+        if current and current_len + added > max_chars:
+            chunks.append(sep.join(current))
+            current = []
+            current_len = 0
+            added = len(p)
+        current.append(p)
+        current_len += added
+    if current:
+        chunks.append(sep.join(current))
+    return chunks
+def main():
+    """Ingest ``input/aimav4.txt`` into an in-memory graph and print a report.
+    Steps: read up to 100,000 characters of the source file, strip HTML
+    comments, split the text into paragraph-aware chunks, call
+    ``extract_and_merge`` once per chunk, run ``resolve_duplicates`` over the
+    resulting graph, then print the nodes, edges, a full-text search sample
+    and summary counts.
+    A chunk whose extraction raises is logged and skipped, so one bad chunk
+    does not abort the whole run.
+    """
+    input_path = os.path.join(os.path.dirname(__file__), "..", "input", "aimav4.txt")
+    # Explicit encoding so the read does not depend on the platform default.
+    with open(input_path, encoding="utf-8") as f:
+        text = f.read(100_000)
+    # DOTALL lets "." cross newlines so multi-line HTML comments are stripped too.
+    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
+    chunks = chunk_text(text, max_chars=4000)
+    print("=" * 60)
+    print("GraphMemory — Real LLM Extraction Test")
+    print("=" * 60)
+    print(f"Source: aimav4.txt ({len(text)} chars)")
+    print(f"Chunks: {len(chunks)}")
+    print("LLM: gpt-5-nano via DSPy")
+    db = GraphMemory(database=":memory:", vector_length=3)
+    print("\n--- Extracting entities & relationships ---")
+    total_nodes = 0
+    total_edges = 0
+    total_merged_nodes = 0
+    total_merged_edges = 0
+    for i, chunk in enumerate(chunks, start=1):
+        print(f"\n  Chunk {i}/{len(chunks)} ({len(chunk)} chars)...")
+        try:
+            # Pass each chunk as a single "sentence" to avoid per-sentence LLM calls
+            node_results, edge_results = extract_and_merge(
+                db,
+                chunk,
+                match_keys=["name"],
+                match_type=True,
+                similarity_threshold=0.88,
+                sentences=[chunk],  # single LLM call per chunk
+            )
+        except Exception as e:
+            # Deliberate best-effort: report the failing chunk and keep going.
+            logger.warning("  Chunk %d failed: %s", i, e)
+            continue
+        created_n = sum(1 for r in node_results if r.created)
+        merged_n = len(node_results) - created_n
+        created_e = sum(1 for r in edge_results if r.created)
+        merged_e = len(edge_results) - created_e
+        total_nodes += created_n
+        total_merged_nodes += merged_n
+        total_edges += created_e
+        total_merged_edges += merged_e
+        print(f"    Nodes: {created_n} new, {merged_n} merged")
+        print(f"    Edges: {created_e} new, {merged_e} merged")
+    # --- Post-extraction dedupe ---
+    print("\n--- Post-extraction duplicate resolution ---")
+    clusters = db.resolve_duplicates(
+        match_keys=["name"],
+        match_type=True,
+        similarity_threshold=0.90,
+    )
+    if clusters:
+        for c in clusters:
+            merged_names = [m.properties.get("name", "?") for m in c.merged]
+            print(f"  Merged: '{c.survivor.properties.get('name')}' <- {merged_names}")
+    else:
+        print("  No additional duplicates found.")
+    # --- Results ---
+    all_nodes = db.nodes_to_json()
+    all_edges = db.edges_to_json()
+    print("\n--- Final Graph ---")
+    print(f"  Nodes: {len(all_nodes)}")
+    print(f"  Edges: {len(all_edges)}")
+    type_counts = {}
+    for n in all_nodes:
+        # A node may carry type=None; bucket it with nodes that have no type key.
+        t = n.get("type") or "Unknown"
+        type_counts[t] = type_counts.get(t, 0) + 1
+    print(f"  Types: {type_counts}")
+    print("\n--- Extracted Entities ---")
+    def _sort_key(n):
+        # Coerce missing/None values to strings so sorting never compares None with str.
+        return (str(n.get("type") or ""), str((n.get("properties") or {}).get("name") or ""))
+    for n in sorted(all_nodes, key=_sort_key):
+        props = n.get("properties") or {}
+        # Format specs such as ":15" raise TypeError on None, so substitute "?".
+        node_type = n.get("type") or "?"
+        print(f"  [{node_type:15}] {props.get('name', props)}")
+    print("\n--- Extracted Relationships ---")
+    node_id_map = {n["id"]: n for n in all_nodes}
+    def _name_of(node_id):
+        # Fall back to the raw id when the node or its name is unknown.
+        props = node_id_map.get(node_id, {}).get("properties") or {}
+        return props.get("name", node_id)
+    for e in all_edges:
+        print(f"  {_name_of(e['source_id'])} --[{e['relation']}]--> {_name_of(e['target_id'])}")
+    print("\n--- Full-text search: 'deep learning' ---")
+    results = db.search_nodes("deep learning", limit=5)
+    for sr in results:
+        print(f"  [{sr.node.type}] {sr.node.properties.get('name', '?')} (score={sr.score:.3f})")
+    print("\n--- Summary ---")
+    print(f"  Extracted: {total_nodes} nodes, {total_edges} edges")
+    print(f"  Fuzzy-merged during ingest: {total_merged_nodes} nodes, {total_merged_edges} edges")
+    print(f"  Post-dedupe clusters: {len(clusters)}")
+    print(f"  Final graph: {len(all_nodes)} nodes, {len(all_edges)} edges")
+    print("\n" + "=" * 60)
+    print("Done!")
+    print("=" * 60)
+if __name__ == "__main__":
+    main()

graphmemory-1.2.0/graphmemory/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .database import GraphMemory, QueryBuilder
+from .models import DuplicateCluster, Edge, EdgeMergeResult, MergeResult, MergeStrategy, NearestNode, Node, RetrievalContext, RetrievalResult, SearchResult, TraversalResult
+# Names exported by ``from graphmemory import *``. "algorithms" and "extraction"
+# are not bound by the imports above; presumably they name submodules that the
+# star-import should load -- confirm both exist in the package.
+__all__ = ["DuplicateCluster", "Edge", "EdgeMergeResult", "GraphMemory", "MergeResult", "MergeStrategy", "NearestNode", "Node", "QueryBuilder", "RetrievalContext", "RetrievalResult", "SearchResult", "TraversalResult", "algorithms", "extraction"]

{graphmemory-1.1.2 → graphmemory-1.2.0}/graphmemory/database.py RENAMED Viewed

@@ -13,7 +13,7 @@ import xml.etree.ElementTree as ET
 from contextlib import contextmanager
 from typing import Any, Dict, List, Union, List
-from graphmemory.models import Edge, EdgeMergeResult, MergeResult, MergeStrategy, NearestNode, Node, RetrievalContext, RetrievalResult, SearchResult, TraversalResult
+from graphmemory.models import DuplicateCluster, Edge, EdgeMergeResult, MergeResult, MergeStrategy, NearestNode, Node, RetrievalContext, RetrievalResult, SearchResult, TraversalResult
 logger = logging.getLogger(__name__)
@@ -338,25 +338,71 @@ class GraphMemory:
         except duckdb.Error as e:
             logger.error(f"Error during bulk delete edges: {e}")
-    def _find_matching_node(self, cur, node: Node, match_keys: list[str], match_type: bool) -> Node | None:
-        """Find an existing node matching the given property keys and optional type."""
+    def _find_matching_node(
+        self, cur, node: Node, match_keys: list[str], match_type: bool,
+        similarity_threshold: float = 1.0,
+        vector_threshold: float | None = None,
+    ) -> Node | None:
+        """Find an existing node matching the given property keys and optional type.
+        When ``similarity_threshold`` is 1.0 (default), matching is exact.
+        Lower values enable fuzzy matching via DuckDB's ``jaro_winkler_similarity``.
+        When ``vector_threshold`` is set and the node has a vector, candidates must
+        also have a cosine distance within that threshold.
+        """
+        fuzzy = similarity_threshold < 1.0
+        # Separate param lists for SELECT expressions vs WHERE clauses,
+        # since DuckDB binds positional params in statement order.
+        select_extra: list[str] = []
+        select_params: list = []
         where_parts: list[str] = []
-        params: list = []
+        where_params: list = []
         if match_type and node.type is not None:
             where_parts.append("type = ?")
-            params.append(node.type)
+            where_params.append(node.type)
         for key in match_keys:
             if not self._VALID_ATTRIBUTE_RE.match(key):
                 raise ValueError(f"Invalid match key: {key!r}")
             value = (node.properties or {}).get(key)
             if value is None:
                 where_parts.append(f"json_extract(properties, '$.{key}') IS NULL")
+            elif fuzzy and isinstance(value, str):
+                alias = f"sim_{key}"
+                select_extra.append(
+                    f"jaro_winkler_similarity(json_extract_string(properties, '$.{key}'), ?) AS {alias}"
+                )
+                select_params.append(value)
+                where_parts.append(f"{alias} >= ?")
+                where_params.append(similarity_threshold)
             else:
                 where_parts.append(f"json_extract(properties, '$.{key}') = ?")
-                params.append(json.dumps(value))
-        if not where_parts:
+                where_params.append(json.dumps(value))
+        if vector_threshold is not None and node.vector:
+            where_parts.append(f"array_cosine_distance(vector, CAST(? AS FLOAT[{self.vector_length}])) <= ?")
+            where_params.extend([node.vector, vector_threshold])
+        if not where_parts and not select_extra:
             return None
-        query = "SELECT id, type, properties, vector FROM nodes WHERE " + " AND ".join(where_parts) + " LIMIT 1;"
+        select_cols = "id, type, properties, vector"
+        if select_extra:
+            select_cols += ", " + ", ".join(select_extra)
+        where_clause = " AND ".join(where_parts) if where_parts else "TRUE"
+        order_clause = ""
+        if fuzzy:
+            sim_cols = [f"sim_{k}" for k in match_keys
+                        if isinstance((node.properties or {}).get(k), str)]
+            if sim_cols:
+                order_clause = " ORDER BY " + " + ".join(sim_cols) + " DESC"
+        query = f"SELECT {select_cols} FROM nodes WHERE {where_clause}{order_clause} LIMIT 1;"
+        params = select_params + where_params
         row = cur.execute(query, params).fetchone()
         if row:
             return Node(id=row[0], type=row[1], properties=json.loads(row[2]), vector=row[3])
@@ -373,11 +419,53 @@ class GraphMemory:
             return existing or {}
         return incoming or {}
+    def _safe_update_node(self, cur, node_id: str, node_type, properties: dict, vector) -> None:
+        """Write new type/properties/vector onto an existing node row.
+        A plain UPDATE is tried first. If DuckDB rejects it with a
+        ``ConstraintException`` (it implements UPDATE as delete+reinsert, which
+        trips the foreign keys held by edges pointing at the node), every edge
+        touching the node is removed, the UPDATE is repeated, and the same edge
+        rows are inserted back unchanged.
+        Args:
+            cur: Cursor used for every statement.
+            node_id: Id of the node row to update.
+            node_type: New value for the ``type`` column.
+            properties: New properties; stored as a JSON string.
+            vector: New value for the ``vector`` column.
+        """
+        update_sql = "UPDATE nodes SET type = ?, properties = ?, vector = ? WHERE id = ?;"
+        try:
+            cur.execute(update_sql, (node_type, json.dumps(properties), vector, node_id))
+        except duckdb.ConstraintException:
+            # Remember every edge attached to the node so it can be restored.
+            attached = cur.execute(
+                "SELECT id, source_id, target_id, relation, weight FROM edges "
+                "WHERE source_id = ? OR target_id = ?;",
+                (node_id, node_id)
+            ).fetchall()
+            for edge_row in attached:
+                cur.execute("DELETE FROM edges WHERE id = ?;", (edge_row[0],))
+            # With no referencing edges left, the update can go through.
+            cur.execute(update_sql, (node_type, json.dumps(properties), vector, node_id))
+            for edge_id, source, target, relation, weight in attached:
+                cur.execute(
+                    "INSERT INTO edges (id, source_id, target_id, relation, weight) "
+                    "VALUES (?, ?, ?, ?, ?);",
+                    (edge_id, source, target, relation, weight)
+                )
+    @staticmethod
+    def normalize_relation(relation: str) -> str:
+        """Return the canonical form of a relation label.
+        The label is stripped and lowercased, each run of whitespace, hyphens
+        or dots becomes an underscore, repeated underscores collapse to one,
+        and leading/trailing underscores are removed.
+        """
+        underscored = re.sub(r'[\s\-\.]+', '_', relation.strip().lower())
+        return re.sub(r'_+', '_', underscored).strip('_')
     @with_retry()
     def merge_node(self, node: Node, match_keys: list[str],
                    match_type: bool = True,
                    strategy: MergeStrategy = MergeStrategy.UPDATE,
-                   update_vector: bool = True) -> MergeResult:
+                   update_vector: bool = True,
+                   similarity_threshold: float = 1.0,
+                   vector_threshold: float | None = None) -> MergeResult:
         """Insert a node or update it if a match is found by property keys.
         Args:
@@ -401,15 +489,16 @@ class GraphMemory:
         try:
             with self.transaction():
                 cur = self.cursor()
-                existing = self._find_matching_node(cur, node, match_keys, match_type)
+                existing = self._find_matching_node(
+                    cur, node, match_keys, match_type,
+                    similarity_threshold=similarity_threshold,
+                    vector_threshold=vector_threshold,
+                )
                 if existing:
                     merged_props = self._merge_properties(existing.properties, node.properties, strategy)
                     vector = node.vector if update_vector and node.vector else existing.vector
                     node_type = node.type if node.type is not None else existing.type
-                    cur.execute(
-                        "UPDATE nodes SET type = ?, properties = ?, vector = ? WHERE id = ?;",
-                        (node_type, json.dumps(merged_props), vector, str(existing.id))
-                    )
+                    self._safe_update_node(cur, str(existing.id), node_type, merged_props, vector)
                     self._fts_dirty = True
                     result_node = Node(id=existing.id, type=node_type, properties=merged_props, vector=vector)
                     return MergeResult(node=result_node, created=False)
@@ -429,7 +518,9 @@ class GraphMemory:
     def bulk_merge_nodes(self, nodes: list[Node], match_keys: list[str],
                          match_type: bool = True,
                          strategy: MergeStrategy = MergeStrategy.UPDATE,
-                         update_vector: bool = True) -> list[MergeResult]:
+                         update_vector: bool = True,
+                         similarity_threshold: float = 1.0,
+                         vector_threshold: float | None = None) -> list[MergeResult]:
         """Merge multiple nodes, inserting new ones and updating matches.
         Runs in a single transaction for atomicity.
@@ -448,15 +539,16 @@ class GraphMemory:
                     if node.vector and not self._validate_vector(node.vector):
                         logger.error(f"Invalid vector for node, skipping: {node.id}")
                         continue
-                    existing = self._find_matching_node(cur, node, match_keys, match_type)
+                    existing = self._find_matching_node(
+                        cur, node, match_keys, match_type,
+                        similarity_threshold=similarity_threshold,
+                        vector_threshold=vector_threshold,
+                    )
                     if existing:
                         merged_props = self._merge_properties(existing.properties, node.properties, strategy)
                         vector = node.vector if update_vector and node.vector else existing.vector
                         node_type = node.type if node.type is not None else existing.type
-                        cur.execute(
-                            "UPDATE nodes SET type = ?, properties = ?, vector = ? WHERE id = ?;",
-                            (node_type, json.dumps(merged_props), vector, str(existing.id))
-                        )
+                        self._safe_update_node(cur, str(existing.id), node_type, merged_props, vector)
                         result_node = Node(id=existing.id, type=node_type, properties=merged_props, vector=vector)
                         results.append(MergeResult(node=result_node, created=False))
                     else:
@@ -474,10 +566,14 @@ class GraphMemory:
             raise
     def _find_matching_edge(self, cur, edge: Edge) -> Edge | None:
-        """Find an existing edge matching (source_id, target_id, relation)."""
+        """Find an existing edge matching (source_id, target_id, relation).
+        Relations are compared in normalized form (lowercase, underscored).
+        """
+        normalized = self.normalize_relation(edge.relation)
         row = cur.execute(
             "SELECT id, source_id, target_id, relation, weight FROM edges WHERE source_id = ? AND target_id = ? AND relation = ? LIMIT 1;",
-            (str(edge.source_id), str(edge.target_id), edge.relation)
+            (str(edge.source_id), str(edge.target_id), normalized)
         ).fetchone()
         if row:
             return Edge(id=row[0], source_id=row[1], target_id=row[2], relation=row[3], weight=row[4])
@@ -511,11 +607,13 @@ class GraphMemory:
                         result_edge = existing
                     return EdgeMergeResult(edge=result_edge, created=False)
                 else:
+                    normalized = self.normalize_relation(edge.relation)
                     cur.execute(
                         "INSERT INTO edges (id, source_id, target_id, relation, weight) VALUES (?, ?, ?, ?, ?);",
-                        (str(edge.id), str(edge.source_id), str(edge.target_id), edge.relation, edge.weight)
+                        (str(edge.id), str(edge.source_id), str(edge.target_id), normalized, edge.weight)
                     )
-                    return EdgeMergeResult(edge=edge, created=True)
+                    result_edge = Edge(id=edge.id, source_id=edge.source_id, target_id=edge.target_id, relation=normalized, weight=edge.weight)
+                    return EdgeMergeResult(edge=result_edge, created=True)
         except duckdb.Error as e:
             logger.error(f"Error during merge edge: {e}")
             raise
@@ -545,16 +643,211 @@ class GraphMemory:
                             result_edge = existing
                         results.append(EdgeMergeResult(edge=result_edge, created=False))
                     else:
+                        normalized = self.normalize_relation(edge.relation)
                         cur.execute(
                             "INSERT INTO edges (id, source_id, target_id, relation, weight) VALUES (?, ?, ?, ?, ?);",
-                            (str(edge.id), str(edge.source_id), str(edge.target_id), edge.relation, edge.weight)
+                            (str(edge.id), str(edge.source_id), str(edge.target_id), normalized, edge.weight)
                         )
-                        results.append(EdgeMergeResult(edge=edge, created=True))
+                        result_edge = Edge(id=edge.id, source_id=edge.source_id, target_id=edge.target_id, relation=normalized, weight=edge.weight)
+                        results.append(EdgeMergeResult(edge=result_edge, created=True))
                 return results
         except duckdb.Error as e:
             logger.error(f"Error during bulk merge edges: {e}")
             raise
+    @with_retry()
+    def resolve_duplicates(
+        self,
+        match_keys: list[str] | None = None,
+        match_type: bool = True,
+        similarity_threshold: float = 0.9,
+        vector_threshold: float | None = None,
+        strategy: MergeStrategy = MergeStrategy.UPDATE,
+    ) -> list[DuplicateCluster]:
+        """Scan all nodes and merge clusters of likely duplicates.
+        For each unprocessed node, finds fuzzy matches among remaining nodes.
+        The first node encountered becomes the "survivor"; duplicates have their
+        edges reassigned and are then deleted.
+        Nodes are visited in ``ORDER BY id`` order. Only string-valued match
+        keys take part in the comparison: keys whose value is ``None`` or not a
+        string are ignored, and a node with no string-valued match key is
+        skipped entirely. Candidates are compared against the survivor's own
+        values only, so matching is not transitive.
+        Reassigned edges that would become self-loops, and edges whose
+        endpoints no longer exist after the merge, are dropped.
+        NOTE(review): unlike the merge methods, this does not run inside
+        ``self.transaction()``; an error part-way through a cluster may leave
+        edges deleted but not re-inserted -- confirm whether this is
+        intentional (e.g. because of DuckDB foreign-key checking).
+        Args:
+            match_keys: Property names to compare (default ``["name"]``).
+            match_type: Also require ``node.type`` to match (default ``True``).
+            similarity_threshold: Jaro-Winkler threshold for string properties.
+            vector_threshold: Max cosine distance for vector similarity (optional).
+            strategy: How to merge properties from duplicates into the survivor.
+        Returns:
+            List of :class:`~graphmemory.models.DuplicateCluster` results.
+        Raises:
+            ValueError: If a match key fails ``_VALID_ATTRIBUTE_RE``.
+            duckdb.Error: Logged and re-raised when a statement fails.
+        """
+        if match_keys is None:
+            match_keys = ["name"]
+        # Keys are interpolated into SQL JSON paths below, so validate them up front.
+        for key in match_keys:
+            if not self._VALID_ATTRIBUTE_RE.match(key):
+                raise ValueError(f"Invalid match key: {key!r}")
+        clusters: list[DuplicateCluster] = []
+        try:
+            cur = self.cursor()
+            # Snapshot of every node taken before any merging starts.
+            all_rows = cur.execute(
+                "SELECT id, type, properties, vector FROM nodes ORDER BY id;"
+            ).fetchall()
+            all_nodes = [
+                Node(id=r[0], type=r[1], properties=json.loads(r[2]), vector=r[3])
+                for r in all_rows
+            ]
+            # Ids already handled, either as a survivor candidate or as a merged duplicate.
+            seen: set[str] = set()
+            for node in all_nodes:
+                nid = str(node.id)
+                if nid in seen:
+                    continue
+                seen.add(nid)
+                # Build fuzzy query for candidates (separate param lists for ordering)
+                select_extra: list[str] = []
+                select_params: list = []
+                where_parts: list[str] = ["id != ?"]
+                where_params: list = [nid]
+                if match_type and node.type is not None:
+                    where_parts.append("type = ?")
+                    where_params.append(node.type)
+                for key in match_keys:
+                    value = (node.properties or {}).get(key)
+                    if value is None:
+                        continue
+                    # Non-string values fall through without adding any condition.
+                    if isinstance(value, str):
+                        alias = f"sim_{key}"
+                        select_extra.append(
+                            f"jaro_winkler_similarity(json_extract_string(properties, '$.{key}'), ?) AS {alias}"
+                        )
+                        select_params.append(value)
+                        where_parts.append(f"{alias} >= ?")
+                        where_params.append(similarity_threshold)
+                if vector_threshold is not None and node.vector:
+                    where_parts.append(
+                        f"array_cosine_distance(vector, CAST(? AS FLOAT[{self.vector_length}])) <= ?"
+                    )
+                    where_params.extend([node.vector, vector_threshold])
+                # No string-valued match key on this node: nothing to compare on.
+                if not select_extra:
+                    continue
+                # Exclude already-processed nodes
+                if seen - {nid}:
+                    # One "?" per processed id other than this node; the set is
+                    # not modified between the two iterations below.
+                    placeholders = ", ".join("?" for _ in seen if _ != nid)
+                    where_parts.append(f"id NOT IN ({placeholders})")
+                    where_params.extend(s for s in seen if s != nid)
+                select_cols = "id, type, properties, vector"
+                if select_extra:
+                    select_cols += ", " + ", ".join(select_extra)
+                query = f"SELECT {select_cols} FROM nodes WHERE {' AND '.join(where_parts)};"
+                # SELECT placeholders precede WHERE placeholders in the statement text.
+                dup_rows = cur.execute(query, select_params + where_params).fetchall()
+                if not dup_rows:
+                    continue
+                duplicates: list[Node] = []
+                survivor_props = dict(node.properties or {})
+                survivor_vector = node.vector
+                survivor_type = node.type
+                edges_to_rewrite: list[tuple] = []
+                for row in dup_rows:
+                    dup = Node(id=row[0], type=row[1], properties=json.loads(row[2]), vector=row[3])
+                    dup_id = str(dup.id)
+                    seen.add(dup_id)
+                    duplicates.append(dup)
+                    survivor_props = self._merge_properties(survivor_props, dup.properties, strategy)
+                    # Vector and type are only borrowed when the survivor has none.
+                    if not survivor_vector and dup.vector:
+                        survivor_vector = dup.vector
+                    if not survivor_type and dup.type:
+                        survivor_type = dup.type
+                    dup_edges = cur.execute(
+                        "SELECT id, source_id, target_id, relation, weight FROM edges "
+                        "WHERE source_id = ? OR target_id = ?;",
+                        (dup_id, dup_id)
+                    ).fetchall()
+                    # Re-point each endpoint that was the duplicate at the survivor.
+                    # NOTE(review): compares stored ids with a str id -- assumes the
+                    # edge id columns come back as strings; confirm against the schema.
+                    for eid, src, tgt, rel, wt in dup_edges:
+                        new_src = nid if src == dup_id else src
+                        new_tgt = nid if tgt == dup_id else tgt
+                        edges_to_rewrite.append((eid, new_src, new_tgt, rel, wt))
+                # Delete edges referencing duplicates
+                for dup in duplicates:
+                    cur.execute(
+                        "DELETE FROM edges WHERE source_id = ? OR target_id = ?;",
+                        (str(dup.id), str(dup.id))
+                    )
+                # Also temporarily remove edges referencing survivor (DuckDB
+                # internally does delete+reinsert on UPDATE, triggering FK checks)
+                survivor_edges = cur.execute(
+                    "SELECT id, source_id, target_id, relation, weight FROM edges "
+                    "WHERE source_id = ? OR target_id = ?;",
+                    (nid, nid)
+                ).fetchall()
+                for eid, *_ in survivor_edges:
+                    cur.execute("DELETE FROM edges WHERE id = ?;", (eid,))
+                # Delete duplicate nodes
+                for dup in duplicates:
+                    cur.execute("DELETE FROM nodes WHERE id = ?;", (str(dup.id),))
+                # Update survivor with merged properties (safe now, no FK refs)
+                cur.execute(
+                    "UPDATE nodes SET type = ?, properties = ?, vector = ? WHERE id = ?;",
+                    (survivor_type, json.dumps(survivor_props), survivor_vector, nid)
+                )
+                # Re-insert all edges, verifying both endpoints still exist
+                rewritten_eids = {e[0] for e in edges_to_rewrite}
+                all_edges_to_insert = []
+                for eid, src, tgt, rel, wt in edges_to_rewrite:
+                    if src == tgt:
+                        continue  # skip self-loops
+                    all_edges_to_insert.append((eid, src, tgt, rel, wt))
+                for eid, src, tgt, rel, wt in survivor_edges:
+                    if eid in rewritten_eids:
+                        continue
+                    all_edges_to_insert.append((eid, src, tgt, rel, wt))
+                for eid, src, tgt, rel, wt in all_edges_to_insert:
+                    # An edge between two duplicates of the same cluster still names a
+                    # deleted node on one side; the existence check filters it out.
+                    src_exists = cur.execute(
+                        "SELECT 1 FROM nodes WHERE id = ?", (str(src),)
+                    ).fetchone()
+                    tgt_exists = cur.execute(
+                        "SELECT 1 FROM nodes WHERE id = ?", (str(tgt),)
+                    ).fetchone()
+                    if src_exists and tgt_exists:
+                        cur.execute(
+                            "INSERT INTO edges (id, source_id, target_id, relation, weight) "
+                            "VALUES (?, ?, ?, ?, ?);",
+                            (eid, src, tgt, rel, wt)
+                        )
+                survivor = Node(id=node.id, type=survivor_type, properties=survivor_props, vector=survivor_vector)
+                clusters.append(DuplicateCluster(survivor=survivor, merged=duplicates))
+            # Node contents changed, so flag the full-text index as stale.
+            if clusters:
+                self._fts_dirty = True
+            logger.info(
+                "Resolved %d duplicate clusters (%d nodes merged).",
+                len(clusters),
+                sum(len(c.merged) for c in clusters),
+            )
+            return clusters
+        except duckdb.Error as e:
+            logger.error(f"Error during resolve_duplicates: {e}")
+            raise
     @with_retry()
     def delete_edge(self, source_id: uuid.UUID, target_id: uuid.UUID):
         try:

{graphmemory-1.1.2 → graphmemory-1.2.0}/graphmemory/extraction.py RENAMED Viewed

@@ -115,8 +115,13 @@ def _get_signatures():
 # ---------------------------------------------------------------------------
+def _make_predictor(dspy, signature):
+    """Create a DSPy v3 predictor from a Signature.
+    Single place where the predictor class is chosen, shared by the
+    extraction functions in this module.
+    Args:
+        dspy: The imported ``dspy`` module (passed in rather than imported here).
+        signature: Signature class to build the predictor for.
+    Returns:
+        The result of ``dspy.Predict(signature)``.
+    """
+    return dspy.Predict(signature)
 def extract_nodes(text: str, sentences: list[str] | None = None) -> list[Node]:
-    """Extract entity nodes from text using a DSPy typed predictor.
+    """Extract entity nodes from text using a DSPy predictor.
     Args:
         text: Full text to extract from (used when *sentences* is ``None``).
@@ -129,7 +134,7 @@ def extract_nodes(text: str, sentences: list[str] | None = None) -> list[Node]:
     """
     dspy = _require_dspy()
     NodeSig, _ = _get_signatures()
-    predictor = dspy.TypedPredictor(NodeSig)
+    predictor = _make_predictor(dspy, NodeSig)
     if sentences is None:
         sentences = [s.strip() for s in text.split(".") if s.strip()]
@@ -166,7 +171,7 @@ def extract_edges(
     """
     dspy = _require_dspy()
     _, EdgeSig = _get_signatures()
-    predictor = dspy.TypedPredictor(EdgeSig)
+    predictor = _make_predictor(dspy, EdgeSig)
     if sentences is None:
         sentences = [s.strip() for s in text.split(".") if s.strip()]
@@ -253,6 +258,8 @@ def extract_and_merge(
     match_type: bool = True,
     strategy: MergeStrategy = MergeStrategy.UPDATE,
     sentences: list[str] | None = None,
+    similarity_threshold: float = 1.0,
+    vector_threshold: float | None = None,
 ) -> tuple[list[MergeResult], list[EdgeMergeResult]]:
     """Extract nodes and edges from text, merging with existing graph data.
@@ -278,6 +285,7 @@ def extract_and_merge(
     node_results = graph.bulk_merge_nodes(
         nodes, match_keys=match_keys, match_type=match_type, strategy=strategy,
+        similarity_threshold=similarity_threshold, vector_threshold=vector_threshold,
     ) if nodes else []
     edge_results = graph.bulk_merge_edges(edges) if edges else []

graphmemory 1.1.2__tar.gz → 1.2.0__tar.gz

graphmemory 1.1.2tar.gz → 1.2.0tar.gz