graphiti-core 0.17.4__py3-none-any.whl → 0.24.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- graphiti_core/cross_encoder/gemini_reranker_client.py +1 -1
- graphiti_core/cross_encoder/openai_reranker_client.py +1 -1
- graphiti_core/decorators.py +110 -0
- graphiti_core/driver/driver.py +62 -2
- graphiti_core/driver/falkordb_driver.py +215 -23
- graphiti_core/driver/graph_operations/graph_operations.py +191 -0
- graphiti_core/driver/kuzu_driver.py +182 -0
- graphiti_core/driver/neo4j_driver.py +61 -8
- graphiti_core/driver/neptune_driver.py +305 -0
- graphiti_core/driver/search_interface/search_interface.py +89 -0
- graphiti_core/edges.py +264 -132
- graphiti_core/embedder/azure_openai.py +10 -3
- graphiti_core/embedder/client.py +2 -1
- graphiti_core/graph_queries.py +114 -101
- graphiti_core/graphiti.py +582 -255
- graphiti_core/graphiti_types.py +2 -0
- graphiti_core/helpers.py +21 -14
- graphiti_core/llm_client/anthropic_client.py +142 -52
- graphiti_core/llm_client/azure_openai_client.py +57 -19
- graphiti_core/llm_client/client.py +83 -21
- graphiti_core/llm_client/config.py +1 -1
- graphiti_core/llm_client/gemini_client.py +75 -57
- graphiti_core/llm_client/openai_base_client.py +94 -50
- graphiti_core/llm_client/openai_client.py +28 -8
- graphiti_core/llm_client/openai_generic_client.py +91 -56
- graphiti_core/models/edges/edge_db_queries.py +259 -35
- graphiti_core/models/nodes/node_db_queries.py +311 -32
- graphiti_core/nodes.py +388 -164
- graphiti_core/prompts/dedupe_edges.py +42 -31
- graphiti_core/prompts/dedupe_nodes.py +56 -39
- graphiti_core/prompts/eval.py +4 -4
- graphiti_core/prompts/extract_edges.py +23 -14
- graphiti_core/prompts/extract_nodes.py +73 -32
- graphiti_core/prompts/prompt_helpers.py +39 -0
- graphiti_core/prompts/snippets.py +29 -0
- graphiti_core/prompts/summarize_nodes.py +23 -25
- graphiti_core/search/search.py +154 -74
- graphiti_core/search/search_config.py +39 -4
- graphiti_core/search/search_filters.py +109 -31
- graphiti_core/search/search_helpers.py +5 -6
- graphiti_core/search/search_utils.py +1360 -473
- graphiti_core/tracer.py +193 -0
- graphiti_core/utils/bulk_utils.py +216 -90
- graphiti_core/utils/datetime_utils.py +13 -0
- graphiti_core/utils/maintenance/community_operations.py +62 -38
- graphiti_core/utils/maintenance/dedup_helpers.py +262 -0
- graphiti_core/utils/maintenance/edge_operations.py +286 -126
- graphiti_core/utils/maintenance/graph_data_operations.py +44 -74
- graphiti_core/utils/maintenance/node_operations.py +320 -158
- graphiti_core/utils/maintenance/temporal_operations.py +11 -3
- graphiti_core/utils/ontology_utils/entity_types_utils.py +1 -1
- graphiti_core/utils/text_utils.py +53 -0
- {graphiti_core-0.17.4.dist-info → graphiti_core-0.24.3.dist-info}/METADATA +221 -87
- graphiti_core-0.24.3.dist-info/RECORD +86 -0
- {graphiti_core-0.17.4.dist-info → graphiti_core-0.24.3.dist-info}/WHEEL +1 -1
- graphiti_core-0.17.4.dist-info/RECORD +0 -77
- /graphiti_core/{utils/maintenance/utils.py → migrations/__init__.py} +0 -0
- {graphiti_core-0.17.4.dist-info → graphiti_core-0.24.3.dist-info}/licenses/LICENSE +0 -0
--- a/graphiti_core/utils/maintenance/community_operations.py
+++ b/graphiti_core/utils/maintenance/community_operations.py
@@ -4,11 +4,12 @@ from collections import defaultdict
 
 from pydantic import BaseModel
 
-from graphiti_core.driver.driver import GraphDriver
+from graphiti_core.driver.driver import GraphDriver, GraphProvider
 from graphiti_core.edges import CommunityEdge
 from graphiti_core.embedder import EmbedderClient
 from graphiti_core.helpers import semaphore_gather
 from graphiti_core.llm_client import LLMClient
+from graphiti_core.models.nodes.node_db_queries import COMMUNITY_NODE_RETURN
 from graphiti_core.nodes import CommunityNode, EntityNode, get_community_node_from_record
 from graphiti_core.prompts import prompt_library
 from graphiti_core.prompts.summarize_nodes import Summary, SummaryDescription
@@ -33,10 +34,11 @@ async def get_community_clusters(
     if group_ids is None:
         group_id_values, _, _ = await driver.execute_query(
             """
-            MATCH (n:Entity WHERE n.group_id IS NOT NULL)
-            RETURN
-                collect(DISTINCT n.group_id) AS group_ids
-            """
+            MATCH (n:Entity)
+            WHERE n.group_id IS NOT NULL
+            RETURN
+                collect(DISTINCT n.group_id) AS group_ids
+            """
         )
 
         group_ids = group_id_values[0]['group_ids'] if group_id_values else []
@@ -45,14 +47,21 @@ async def get_community_clusters(
         projection: dict[str, list[Neighbor]] = {}
         nodes = await EntityNode.get_by_group_ids(driver, [group_id])
         for node in nodes:
-            records, _, _ = await driver.execute_query(
+            match_query = """
+                MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[e:RELATES_TO]-(m: Entity {group_id: $group_id})
+            """
+            if driver.provider == GraphProvider.KUZU:
+                match_query = """
+                    MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-(m: Entity {group_id: $group_id})
                 """
-                MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[e:RELATES_TO]-(m: Entity {group_id: $group_id})
-                WITH count(e) AS count, m.uuid AS uuid
-                RETURN
-                    uuid,
-                    count
-                """,
+            records, _, _ = await driver.execute_query(
+                match_query
+                + """
+                WITH count(e) AS count, m.uuid AS uuid
+                RETURN
+                    uuid,
+                    count
+                """,
                 uuid=node.uuid,
                 group_id=group_id,
             )
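The Kuzu branch above exists because, as the added pattern suggests, Kuzu materializes each RELATES_TO edge as an intermediate RelatesToNode_ node, so the single-hop pattern used on other backends becomes a two-hop traversal. A minimal sketch of the selection logic; build_neighbor_match is a hypothetical helper name for illustration, since the diff inlines this directly in get_community_clusters:

    from graphiti_core.driver.driver import GraphDriver, GraphProvider


    def build_neighbor_match(driver: GraphDriver) -> str:
        # Pick the neighbor-count MATCH clause for the active graph backend.
        if driver.provider == GraphProvider.KUZU:
            # Kuzu stores RELATES_TO through an intermediate node, so the
            # pattern traverses Entity -> RelatesToNode_ -> Entity.
            return (
                'MATCH (n:Entity {group_id: $group_id, uuid: $uuid})'
                '-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-'
                '(m:Entity {group_id: $group_id})'
            )
        return (
            'MATCH (n:Entity {group_id: $group_id, uuid: $uuid})'
            '-[e:RELATES_TO]-(m:Entity {group_id: $group_id})'
        )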
@@ -124,10 +133,14 @@ def label_propagation(projection: dict[str, list[Neighbor]]) -> list[list[str]]:
 
 async def summarize_pair(llm_client: LLMClient, summary_pair: tuple[str, str]) -> str:
     # Prepare context for LLM
-    context = {'node_summaries': [{'summary': summary} for summary in summary_pair]}
+    context = {
+        'node_summaries': [{'summary': summary} for summary in summary_pair],
+    }
 
     llm_response = await llm_client.generate_response(
-        prompt_library.summarize_nodes.summarize_pair(context), response_model=Summary
+        prompt_library.summarize_nodes.summarize_pair(context),
+        response_model=Summary,
+        prompt_name='summarize_nodes.summarize_pair',
     )
 
     pair_summary = llm_response.get('summary', '')
@@ -136,11 +149,14 @@ async def summarize_pair(llm_client: LLMClient, summary_pair: tuple[str, str]) -
 
 
 async def generate_summary_description(llm_client: LLMClient, summary: str) -> str:
-    context = {'summary': summary}
+    context = {
+        'summary': summary,
+    }
 
     llm_response = await llm_client.generate_response(
         prompt_library.summarize_nodes.summary_description(context),
         response_model=SummaryDescription,
+        prompt_name='summarize_nodes.summary_description',
     )
 
     description = llm_response.get('description', '')
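Both summarization helpers now pass a prompt_name alongside response_model. Presumably this tags each LLM call for the tracing support added in this release (tracer.py is new in the file list above); the actual wiring is not shown in these hunks. A hypothetical sketch of how such a tag could be consumed; the wrapper name and logging are illustrative only:

    import time
    from typing import Any


    async def traced_generate_response(client: Any, messages: Any, **kwargs: Any) -> dict[str, Any]:
        # Illustrative wrapper only; the real plumbing lives in
        # graphiti_core/llm_client/client.py and graphiti_core/tracer.py.
        prompt_name = kwargs.get('prompt_name', 'unknown')
        start = time.monotonic()
        try:
            return await client.generate_response(messages, **kwargs)
        finally:
            elapsed_ms = (time.monotonic() - start) * 1000
            print(f'llm_call prompt={prompt_name} elapsed={elapsed_ms:.0f}ms')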
@@ -191,7 +207,9 @@ async def build_community(
 
 
 async def build_communities(
-    driver: GraphDriver, llm_client: LLMClient, group_ids: list[str] | None
+    driver: GraphDriver,
+    llm_client: LLMClient,
+    group_ids: list[str] | None,
 ) -> tuple[list[CommunityNode], list[CommunityEdge]]:
     community_clusters = await get_community_clusters(driver, group_ids)
 
@@ -219,9 +237,9 @@ async def build_communities(
 async def remove_communities(driver: GraphDriver):
     await driver.execute_query(
         """
-    MATCH (c:Community)
-    DETACH DELETE c
-    """
+        MATCH (c:Community)
+        DETACH DELETE c
+        """
     )
 
 
@@ -231,14 +249,10 @@ async def determine_entity_community(
     # Check if the node is already part of a community
     records, _, _ = await driver.execute_query(
         """
-        MATCH (c:Community)-[:HAS_MEMBER]->(n:Entity {uuid: $entity_uuid})
-        RETURN
-            c.uuid As uuid,
-            c.name AS name,
-            c.group_id AS group_id,
-            c.created_at AS created_at,
-            c.summary AS summary
-        """,
+        MATCH (c:Community)-[:HAS_MEMBER]->(n:Entity {uuid: $entity_uuid})
+        RETURN
+        """
+        + COMMUNITY_NODE_RETURN,
         entity_uuid=entity.uuid,
     )
 
@@ -246,16 +260,19 @@ async def determine_entity_community(
         return get_community_node_from_record(records[0]), False
 
     # If the node has no community, add it to the mode community of surrounding entities
+    match_query = """
+        MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
+    """
+    if driver.provider == GraphProvider.KUZU:
+        match_query = """
+            MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
+        """
     records, _, _ = await driver.execute_query(
+        match_query
+        + """
+        RETURN
         """
-        MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
-        RETURN
-            c.uuid As uuid,
-            c.name AS name,
-            c.group_id AS group_id,
-            c.created_at AS created_at,
-            c.summary AS summary
-        """,
+        + COMMUNITY_NODE_RETURN,
         entity_uuid=entity.uuid,
     )
 
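Both queries in determine_entity_community now splice in the shared COMMUNITY_NODE_RETURN constant (imported at the top of the file) instead of repeating the projection inline. Judging from the removed lines, the constant covers the same fields; the shape below is inferred for orientation only, and the authoritative definition lives in graphiti_core/models/nodes/node_db_queries.py:

    # Assumed shape, reconstructed from the inline RETURN lists removed above;
    # the real constant may differ or include additional fields.
    COMMUNITY_NODE_RETURN = """
        c.uuid AS uuid,
        c.name AS name,
        c.group_id AS group_id,
        c.created_at AS created_at,
        c.summary AS summary
    """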
@@ -285,12 +302,15 @@ async def determine_entity_community(
 
 
 async def update_community(
-    driver: GraphDriver, llm_client: LLMClient, embedder: EmbedderClient, entity: EntityNode
-):
+    driver: GraphDriver,
+    llm_client: LLMClient,
+    embedder: EmbedderClient,
+    entity: EntityNode,
+) -> tuple[list[CommunityNode], list[CommunityEdge]]:
     community, is_new = await determine_entity_community(driver, entity)
 
     if community is None:
-        return
+        return [], []
 
     new_summary = await summarize_pair(llm_client, (entity.summary, community.summary))
     new_name = await generate_summary_description(llm_client, new_summary)
@@ -298,10 +318,14 @@ async def update_community(
     community.summary = new_summary
     community.name = new_name
 
+    community_edges = []
     if is_new:
         community_edge = (build_community_edges([entity], community, utc_now()))[0]
         await community_edge.save(driver)
+        community_edges.append(community_edge)
 
     await community.generate_name_embedding(embedder)
 
     await community.save(driver)
+
+    return [community], community_edges
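update_community previously returned None on every path; it now always returns a (community_nodes, community_edges) tuple, so callers can collect everything the call touched without a None check. A minimal usage sketch under the new signatures; driver, llm_client, embedder, and entity are assumed to be already-configured graphiti-core objects, and the wrapper function is illustrative:

    from graphiti_core.utils.maintenance.community_operations import (
        build_communities,
        update_community,
    )


    async def refresh_communities(driver, llm_client, embedder, entity, group_ids=None):
        # Full rebuild across the requested group_ids.
        nodes, edges = await build_communities(driver, llm_client, group_ids)
        # Incremental touch-up for one entity; a miss now yields ([], [])
        # instead of None, so the results concatenate safely.
        touched_nodes, touched_edges = await update_community(driver, llm_client, embedder, entity)
        return nodes + touched_nodes, edges + touched_edges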
--- /dev/null
+++ b/graphiti_core/utils/maintenance/dedup_helpers.py
@@ -0,0 +1,262 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import annotations
+
+import math
+import re
+from collections import defaultdict
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from functools import lru_cache
+from hashlib import blake2b
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from graphiti_core.nodes import EntityNode
+
+_NAME_ENTROPY_THRESHOLD = 1.5
+_MIN_NAME_LENGTH = 6
+_MIN_TOKEN_COUNT = 2
+_FUZZY_JACCARD_THRESHOLD = 0.9
+_MINHASH_PERMUTATIONS = 32
+_MINHASH_BAND_SIZE = 4
+
+
+def _normalize_string_exact(name: str) -> str:
+    """Lowercase text and collapse whitespace so equal names map to the same key."""
+    normalized = re.sub(r'[\s]+', ' ', name.lower())
+    return normalized.strip()
+
+
+def _normalize_name_for_fuzzy(name: str) -> str:
+    """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
+    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
+    normalized = normalized.strip()
+    return re.sub(r'[\s]+', ' ', normalized)
+
+
+def _name_entropy(normalized_name: str) -> float:
+    """Approximate text specificity using Shannon entropy over characters.
+
+    We strip spaces, count how often each character appears, and sum
+    probability * -log2(probability). Short or repetitive names yield low
+    entropy, which signals we should defer resolution to the LLM instead of
+    trusting fuzzy similarity.
+    """
+    if not normalized_name:
+        return 0.0
+
+    counts: dict[str, int] = {}
+    for char in normalized_name.replace(' ', ''):
+        counts[char] = counts.get(char, 0) + 1
+
+    total = sum(counts.values())
+    if total == 0:
+        return 0.0
+
+    entropy = 0.0
+    for count in counts.values():
+        probability = count / total
+        entropy -= probability * math.log2(probability)
+
+    return entropy
+
+
+def _has_high_entropy(normalized_name: str) -> bool:
+    """Filter out very short or low-entropy names that are unreliable for fuzzy matching."""
+    token_count = len(normalized_name.split())
+    if len(normalized_name) < _MIN_NAME_LENGTH and token_count < _MIN_TOKEN_COUNT:
+        return False
+
+    return _name_entropy(normalized_name) >= _NAME_ENTROPY_THRESHOLD
+
+
+def _shingles(normalized_name: str) -> set[str]:
+    """Create 3-gram shingles from the normalized name for MinHash calculations."""
+    cleaned = normalized_name.replace(' ', '')
+    if len(cleaned) < 2:
+        return {cleaned} if cleaned else set()
+
+    return {cleaned[i : i + 3] for i in range(len(cleaned) - 2)}
+
+
+def _hash_shingle(shingle: str, seed: int) -> int:
+    """Generate a deterministic 64-bit hash for a shingle given the permutation seed."""
+    digest = blake2b(f'{seed}:{shingle}'.encode(), digest_size=8)
+    return int.from_bytes(digest.digest(), 'big')
+
+
+def _minhash_signature(shingles: Iterable[str]) -> tuple[int, ...]:
+    """Compute the MinHash signature for the shingle set across predefined permutations."""
+    if not shingles:
+        return tuple()
+
+    seeds = range(_MINHASH_PERMUTATIONS)
+    signature: list[int] = []
+    for seed in seeds:
+        min_hash = min(_hash_shingle(shingle, seed) for shingle in shingles)
+        signature.append(min_hash)
+
+    return tuple(signature)
+
+
+def _lsh_bands(signature: Iterable[int]) -> list[tuple[int, ...]]:
+    """Split the MinHash signature into fixed-size bands for locality-sensitive hashing."""
+    signature_list = list(signature)
+    if not signature_list:
+        return []
+
+    bands: list[tuple[int, ...]] = []
+    for start in range(0, len(signature_list), _MINHASH_BAND_SIZE):
+        band = tuple(signature_list[start : start + _MINHASH_BAND_SIZE])
+        if len(band) == _MINHASH_BAND_SIZE:
+            bands.append(band)
+    return bands
+
+
+def _jaccard_similarity(a: set[str], b: set[str]) -> float:
+    """Return the Jaccard similarity between two shingle sets, handling empty edge cases."""
+    if not a and not b:
+        return 1.0
+    if not a or not b:
+        return 0.0
+
+    intersection = len(a.intersection(b))
+    union = len(a.union(b))
+    return intersection / union if union else 0.0
+
+
+@lru_cache(maxsize=512)
+def _cached_shingles(name: str) -> set[str]:
+    """Cache shingle sets per normalized name to avoid recomputation within a worker."""
+    return _shingles(name)
+
+
+@dataclass
+class DedupCandidateIndexes:
+    """Precomputed lookup structures that drive entity deduplication heuristics."""
+
+    existing_nodes: list[EntityNode]
+    nodes_by_uuid: dict[str, EntityNode]
+    normalized_existing: defaultdict[str, list[EntityNode]]
+    shingles_by_candidate: dict[str, set[str]]
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
+
+
+@dataclass
+class DedupResolutionState:
+    """Mutable resolution bookkeeping shared across deterministic and LLM passes."""
+
+    resolved_nodes: list[EntityNode | None]
+    uuid_map: dict[str, str]
+    unresolved_indices: list[int]
+    duplicate_pairs: list[tuple[EntityNode, EntityNode]] = field(default_factory=list)
+
+
+def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
+    """Precompute exact and fuzzy lookup structures once per dedupe run."""
+    normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
+    nodes_by_uuid: dict[str, EntityNode] = {}
+    shingles_by_candidate: dict[str, set[str]] = {}
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
+
+    for candidate in existing_nodes:
+        normalized = _normalize_string_exact(candidate.name)
+        normalized_existing[normalized].append(candidate)
+        nodes_by_uuid[candidate.uuid] = candidate
+
+        shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
+        shingles_by_candidate[candidate.uuid] = shingles
+
+        signature = _minhash_signature(shingles)
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            lsh_buckets[(band_index, band)].append(candidate.uuid)
+
+    return DedupCandidateIndexes(
+        existing_nodes=existing_nodes,
+        nodes_by_uuid=nodes_by_uuid,
+        normalized_existing=normalized_existing,
+        shingles_by_candidate=shingles_by_candidate,
+        lsh_buckets=lsh_buckets,
+    )
+
+
+def _resolve_with_similarity(
+    extracted_nodes: list[EntityNode],
+    indexes: DedupCandidateIndexes,
+    state: DedupResolutionState,
+) -> None:
+    """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
+    for idx, node in enumerate(extracted_nodes):
+        normalized_exact = _normalize_string_exact(node.name)
+        normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
+
+        if not _has_high_entropy(normalized_fuzzy):
+            state.unresolved_indices.append(idx)
+            continue
+
+        existing_matches = indexes.normalized_existing.get(normalized_exact, [])
+        if len(existing_matches) == 1:
+            match = existing_matches[0]
+            state.resolved_nodes[idx] = match
+            state.uuid_map[node.uuid] = match.uuid
+            if match.uuid != node.uuid:
+                state.duplicate_pairs.append((node, match))
+            continue
+        if len(existing_matches) > 1:
+            state.unresolved_indices.append(idx)
+            continue
+
+        shingles = _cached_shingles(normalized_fuzzy)
+        signature = _minhash_signature(shingles)
+        candidate_ids: set[str] = set()
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            candidate_ids.update(indexes.lsh_buckets.get((band_index, band), []))
+
+        best_candidate: EntityNode | None = None
+        best_score = 0.0
+        for candidate_id in candidate_ids:
+            candidate_shingles = indexes.shingles_by_candidate.get(candidate_id, set())
+            score = _jaccard_similarity(shingles, candidate_shingles)
+            if score > best_score:
+                best_score = score
+                best_candidate = indexes.nodes_by_uuid.get(candidate_id)
+
+        if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
+            state.resolved_nodes[idx] = best_candidate
+            state.uuid_map[node.uuid] = best_candidate.uuid
+            if best_candidate.uuid != node.uuid:
+                state.duplicate_pairs.append((node, best_candidate))
+            continue
+
+        state.unresolved_indices.append(idx)
+
+
+__all__ = [
+    'DedupCandidateIndexes',
+    'DedupResolutionState',
+    '_normalize_string_exact',
+    '_normalize_name_for_fuzzy',
+    '_has_high_entropy',
+    '_minhash_signature',
+    '_lsh_bands',
+    '_jaccard_similarity',
+    '_cached_shingles',
+    '_FUZZY_JACCARD_THRESHOLD',
+    '_build_candidate_indexes',
+    '_resolve_with_similarity',
+]
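With 32 permutations and a band size of 4, each name gets 8 LSH bands; two names become fuzzy candidates when any band collides, and a candidate is accepted only if the exact Jaccard similarity of the 3-gram shingle sets reaches the 0.9 threshold. A usage sketch of the deterministic pass, using only names this module exports; the EntityNode construction is simplified and may need extra fields depending on the model's schema:

    from graphiti_core.nodes import EntityNode
    from graphiti_core.utils.maintenance.dedup_helpers import (
        DedupResolutionState,
        _build_candidate_indexes,
        _resolve_with_similarity,
    )

    # Field set shown here is illustrative; construct per your EntityNode schema.
    existing = [EntityNode(name='Alice Johnson', group_id='g1')]
    extracted = [EntityNode(name='alice  johnson', group_id='g1')]

    indexes = _build_candidate_indexes(existing)
    state = DedupResolutionState(
        resolved_nodes=[None] * len(extracted),
        uuid_map={},
        unresolved_indices=[],
    )
    _resolve_with_similarity(extracted, indexes, state)

    # 'alice  johnson' normalizes to 'alice johnson', hits the exact-name index,
    # and resolves deterministically; anything left in state.unresolved_indices
    # falls through to the LLM dedupe pass.
    print(state.uuid_map, state.unresolved_indices)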