graphiti-core 0.12.0rc1__py3-none-any.whl → 0.24.3__py3-none-any.whl
This diff shows the contents of publicly released package versions as published to a supported registry, and is provided for informational purposes only. In the hunks below, a `…` marks removed-line text that the original diff rendering truncated.
- graphiti_core/cross_encoder/bge_reranker_client.py +12 -2
- graphiti_core/cross_encoder/gemini_reranker_client.py +161 -0
- graphiti_core/cross_encoder/openai_reranker_client.py +7 -5
- graphiti_core/decorators.py +110 -0
- graphiti_core/driver/__init__.py +19 -0
- graphiti_core/driver/driver.py +124 -0
- graphiti_core/driver/falkordb_driver.py +362 -0
- graphiti_core/driver/graph_operations/graph_operations.py +191 -0
- graphiti_core/driver/kuzu_driver.py +182 -0
- graphiti_core/driver/neo4j_driver.py +117 -0
- graphiti_core/driver/neptune_driver.py +305 -0
- graphiti_core/driver/search_interface/search_interface.py +89 -0
- graphiti_core/edges.py +287 -172
- graphiti_core/embedder/azure_openai.py +71 -0
- graphiti_core/embedder/client.py +2 -1
- graphiti_core/embedder/gemini.py +116 -22
- graphiti_core/embedder/voyage.py +13 -2
- graphiti_core/errors.py +8 -0
- graphiti_core/graph_queries.py +162 -0
- graphiti_core/graphiti.py +705 -193
- graphiti_core/graphiti_types.py +4 -2
- graphiti_core/helpers.py +87 -10
- graphiti_core/llm_client/__init__.py +16 -0
- graphiti_core/llm_client/anthropic_client.py +159 -56
- graphiti_core/llm_client/azure_openai_client.py +115 -0
- graphiti_core/llm_client/client.py +98 -21
- graphiti_core/llm_client/config.py +1 -1
- graphiti_core/llm_client/gemini_client.py +290 -41
- graphiti_core/llm_client/groq_client.py +14 -3
- graphiti_core/llm_client/openai_base_client.py +261 -0
- graphiti_core/llm_client/openai_client.py +56 -132
- graphiti_core/llm_client/openai_generic_client.py +91 -56
- graphiti_core/models/edges/edge_db_queries.py +259 -35
- graphiti_core/models/nodes/node_db_queries.py +311 -32
- graphiti_core/nodes.py +420 -205
- graphiti_core/prompts/dedupe_edges.py +46 -32
- graphiti_core/prompts/dedupe_nodes.py +67 -42
- graphiti_core/prompts/eval.py +4 -4
- graphiti_core/prompts/extract_edges.py +27 -16
- graphiti_core/prompts/extract_nodes.py +74 -31
- graphiti_core/prompts/prompt_helpers.py +39 -0
- graphiti_core/prompts/snippets.py +29 -0
- graphiti_core/prompts/summarize_nodes.py +23 -25
- graphiti_core/search/search.py +158 -82
- graphiti_core/search/search_config.py +39 -4
- graphiti_core/search/search_filters.py +126 -35
- graphiti_core/search/search_helpers.py +5 -6
- graphiti_core/search/search_utils.py +1405 -485
- graphiti_core/telemetry/__init__.py +9 -0
- graphiti_core/telemetry/telemetry.py +117 -0
- graphiti_core/tracer.py +193 -0
- graphiti_core/utils/bulk_utils.py +364 -285
- graphiti_core/utils/datetime_utils.py +13 -0
- graphiti_core/utils/maintenance/community_operations.py +67 -49
- graphiti_core/utils/maintenance/dedup_helpers.py +262 -0
- graphiti_core/utils/maintenance/edge_operations.py +339 -197
- graphiti_core/utils/maintenance/graph_data_operations.py +50 -114
- graphiti_core/utils/maintenance/node_operations.py +319 -238
- graphiti_core/utils/maintenance/temporal_operations.py +11 -3
- graphiti_core/utils/ontology_utils/entity_types_utils.py +1 -1
- graphiti_core/utils/text_utils.py +53 -0
- graphiti_core-0.24.3.dist-info/METADATA +726 -0
- graphiti_core-0.24.3.dist-info/RECORD +86 -0
- {graphiti_core-0.12.0rc1.dist-info → graphiti_core-0.24.3.dist-info}/WHEEL +1 -1
- graphiti_core-0.12.0rc1.dist-info/METADATA +0 -350
- graphiti_core-0.12.0rc1.dist-info/RECORD +0 -66
- /graphiti_core/{utils/maintenance/utils.py → migrations/__init__.py} +0 -0
- {graphiti_core-0.12.0rc1.dist-info → graphiti_core-0.24.3.dist-info/licenses}/LICENSE +0 -0
graphiti_core/utils/datetime_utils.py

@@ -40,3 +40,16 @@ def ensure_utc(dt: datetime | None) -> datetime | None:
         return dt.astimezone(timezone.utc)
 
     return dt
+
+
+def convert_datetimes_to_strings(obj):
+    if isinstance(obj, dict):
+        return {k: convert_datetimes_to_strings(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_datetimes_to_strings(item) for item in obj]
+    elif isinstance(obj, tuple):
+        return tuple(convert_datetimes_to_strings(item) for item in obj)
+    elif isinstance(obj, datetime):
+        return obj.isoformat()
+    else:
+        return obj
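A quick sketch of the new helper in use (the payload below is illustrative): it walks dicts, lists, and tuples recursively and ISO-formats any datetime it finds.

```python
from datetime import datetime, timezone

from graphiti_core.utils.datetime_utils import convert_datetimes_to_strings

# Illustrative payload: a datetime nested inside ordinary containers.
payload = {'created_at': datetime(2024, 1, 1, tzinfo=timezone.utc), 'tags': ('a', 'b')}
print(convert_datetimes_to_strings(payload))
# {'created_at': '2024-01-01T00:00:00+00:00', 'tags': ('a', 'b')}
```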
graphiti_core/utils/maintenance/community_operations.py

@@ -2,13 +2,14 @@ import asyncio
 import logging
 from collections import defaultdict
 
-from neo4j import AsyncDriver
 from pydantic import BaseModel
 
+from graphiti_core.driver.driver import GraphDriver, GraphProvider
 from graphiti_core.edges import CommunityEdge
 from graphiti_core.embedder import EmbedderClient
-from graphiti_core.helpers import …
+from graphiti_core.helpers import semaphore_gather
 from graphiti_core.llm_client import LLMClient
+from graphiti_core.models.nodes.node_db_queries import COMMUNITY_NODE_RETURN
 from graphiti_core.nodes import CommunityNode, EntityNode, get_community_node_from_record
 from graphiti_core.prompts import prompt_library
 from graphiti_core.prompts.summarize_nodes import Summary, SummaryDescription
@@ -26,37 +27,43 @@ class Neighbor(BaseModel):
 
 
 async def get_community_clusters(
-    driver: …
+    driver: GraphDriver, group_ids: list[str] | None
 ) -> list[list[EntityNode]]:
     community_clusters: list[list[EntityNode]] = []
 
     if group_ids is None:
         group_id_values, _, _ = await driver.execute_query(
             """
-            …
-            …
-            …
-            …
-            …
+            MATCH (n:Entity)
+            WHERE n.group_id IS NOT NULL
+            RETURN
+                collect(DISTINCT n.group_id) AS group_ids
+            """
         )
 
-        group_ids = group_id_values[0]['group_ids']
+        group_ids = group_id_values[0]['group_ids'] if group_id_values else []
 
     for group_id in group_ids:
         projection: dict[str, list[Neighbor]] = {}
         nodes = await EntityNode.get_by_group_ids(driver, [group_id])
         for node in nodes:
-            …
+            match_query = """
+                MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[e:RELATES_TO]-(m: Entity {group_id: $group_id})
+            """
+            if driver.provider == GraphProvider.KUZU:
+                match_query = """
+                    MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-(m: Entity {group_id: $group_id})
                 """
-            …
-            …
-            …
-                    uuid
-            …
-            …
+            records, _, _ = await driver.execute_query(
+                match_query
+                + """
+                WITH count(e) AS count, m.uuid AS uuid
+                RETURN
+                    uuid,
+                    count
+                """,
                 uuid=node.uuid,
                 group_id=group_id,
-                database_=DEFAULT_DATABASE,
             )
 
             projection[node.uuid] = [
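The Kuzu branch exists because, per the queries above, Kuzu materializes each RELATES_TO fact as an intermediate `RelatesToNode_`, so neighbor traversal takes two hops instead of one relationship match. A sketch of the same branching pattern (the helper name is illustrative, not part of the package):

```python
from graphiti_core.driver.driver import GraphDriver, GraphProvider


def neighbor_match_query(driver: GraphDriver) -> str:
    # Same branching as get_community_clusters above: a two-hop pattern for
    # Kuzu's edge-as-node model, a direct relationship match everywhere else.
    if driver.provider == GraphProvider.KUZU:
        return (
            'MATCH (n:Entity {group_id: $group_id, uuid: $uuid})'
            '-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-'
            '(m:Entity {group_id: $group_id})'
        )
    return (
        'MATCH (n:Entity {group_id: $group_id, uuid: $uuid})'
        '-[e:RELATES_TO]-(m:Entity {group_id: $group_id})'
    )
```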
@@ -95,7 +102,6 @@ def label_propagation(projection: dict[str, list[Neighbor]]) -> list[list[str]]:
         community_candidates: dict[int, int] = defaultdict(int)
         for neighbor in neighbors:
             community_candidates[community_map[neighbor.node_uuid]] += neighbor.edge_count
-
         community_lst = [
             (count, community) for community, count in community_candidates.items()
         ]
@@ -127,10 +133,14 @@ def label_propagation(projection: dict[str, list[Neighbor]]) -> list[list[str]]:
 
 
 async def summarize_pair(llm_client: LLMClient, summary_pair: tuple[str, str]) -> str:
     # Prepare context for LLM
-    context = {…
+    context = {
+        'node_summaries': [{'summary': summary} for summary in summary_pair],
+    }
 
     llm_response = await llm_client.generate_response(
-        prompt_library.summarize_nodes.summarize_pair(context),…
+        prompt_library.summarize_nodes.summarize_pair(context),
+        response_model=Summary,
+        prompt_name='summarize_nodes.summarize_pair',
     )
 
     pair_summary = llm_response.get('summary', '')
@@ -139,11 +149,14 @@ async def summarize_pair(llm_client: LLMClient, summary_pair: tuple[str, str]) -> str:
 
 
 async def generate_summary_description(llm_client: LLMClient, summary: str) -> str:
-    context = {…
+    context = {
+        'summary': summary,
+    }
 
     llm_response = await llm_client.generate_response(
         prompt_library.summarize_nodes.summary_description(context),
         response_model=SummaryDescription,
+        prompt_name='summarize_nodes.summary_description',
     )
 
     description = llm_response.get('description', '')
@@ -194,7 +207,9 @@ async def build_community(
 
 
 async def build_communities(
-    driver: …
+    driver: GraphDriver,
+    llm_client: LLMClient,
+    group_ids: list[str] | None,
 ) -> tuple[list[CommunityNode], list[CommunityEdge]]:
     community_clusters = await get_community_clusters(driver, group_ids)
 
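A hedged call sketch for the new signature (the wrapper function is illustrative): passing `group_ids=None` now means "cluster every group_id found in the graph", per the query added to `get_community_clusters` above.

```python
from graphiti_core.driver.driver import GraphDriver
from graphiti_core.llm_client import LLMClient
from graphiti_core.utils.maintenance.community_operations import build_communities


async def rebuild_all_communities(driver: GraphDriver, llm_client: LLMClient):
    # Returns the freshly built community nodes and membership edges.
    community_nodes, community_edges = await build_communities(driver, llm_client, None)
    return community_nodes, community_edges
```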
@@ -219,50 +234,46 @@ async def build_communities(
     return community_nodes, community_edges
 
 
-async def remove_communities(driver: …
+async def remove_communities(driver: GraphDriver):
     await driver.execute_query(
         """
-        …
-        …
-        …
-        database_=DEFAULT_DATABASE,
+        MATCH (c:Community)
+        DETACH DELETE c
+        """
     )
 
 
 async def determine_entity_community(
-    driver: …
+    driver: GraphDriver, entity: EntityNode
 ) -> tuple[CommunityNode | None, bool]:
     # Check if the node is already part of a community
     records, _, _ = await driver.execute_query(
         """
-        …
-        …
-        …
-        …
-            c.group_id AS group_id,
-            c.created_at AS created_at,
-            c.summary AS summary
-        """,
+        MATCH (c:Community)-[:HAS_MEMBER]->(n:Entity {uuid: $entity_uuid})
+        RETURN
+        """
+        + COMMUNITY_NODE_RETURN,
         entity_uuid=entity.uuid,
-        database_=DEFAULT_DATABASE,
     )
 
     if len(records) > 0:
         return get_community_node_from_record(records[0]), False
 
     # If the node has no community, add it to the mode community of surrounding entities
+    match_query = """
+        MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
+    """
+    if driver.provider == GraphProvider.KUZU:
+        match_query = """
+            MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
+        """
     records, _, _ = await driver.execute_query(
+        match_query
+        + """
+        RETURN
         """
-        …
-        RETURN
-            c.uuid As uuid,
-            c.name AS name,
-            c.group_id AS group_id,
-            c.created_at AS created_at,
-            c.summary AS summary
-        """,
+        + COMMUNITY_NODE_RETURN,
         entity_uuid=entity.uuid,
-        database_=DEFAULT_DATABASE,
     )
 
     communities: list[CommunityNode] = [
@@ -291,12 +302,15 @@ async def determine_entity_community(
 
 
 async def update_community(
-    driver: …
-    …
+    driver: GraphDriver,
+    llm_client: LLMClient,
+    embedder: EmbedderClient,
+    entity: EntityNode,
+) -> tuple[list[CommunityNode], list[CommunityEdge]]:
     community, is_new = await determine_entity_community(driver, entity)
 
     if community is None:
-        return
+        return [], []
 
     new_summary = await summarize_pair(llm_client, (entity.summary, community.summary))
     new_name = await generate_summary_description(llm_client, new_summary)
@@ -304,10 +318,14 @@ async def update_community(
     community.summary = new_summary
     community.name = new_name
 
+    community_edges = []
     if is_new:
         community_edge = (build_community_edges([entity], community, utc_now()))[0]
         await community_edge.save(driver)
+        community_edges.append(community_edge)
 
     await community.generate_name_embedding(embedder)
 
     await community.save(driver)
+
+    return [community], community_edges
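With this change `update_community` returns a `(nodes, edges)` tuple even on the no-community path, so a caller can unpack unconditionally. A minimal sketch (the wrapper name is illustrative):

```python
from graphiti_core.driver.driver import GraphDriver
from graphiti_core.embedder import EmbedderClient
from graphiti_core.llm_client import LLMClient
from graphiti_core.nodes import EntityNode
from graphiti_core.utils.maintenance.community_operations import update_community


async def refresh_community_for(
    driver: GraphDriver,
    llm_client: LLMClient,
    embedder: EmbedderClient,
    entity: EntityNode,
) -> None:
    # No more bare `return`: both elements are always lists, possibly empty.
    communities, community_edges = await update_community(driver, llm_client, embedder, entity)
    if not communities:
        return  # entity had no surrounding community; nothing was saved
```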
graphiti_core/utils/maintenance/dedup_helpers.py (new file)

@@ -0,0 +1,262 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import annotations
+
+import math
+import re
+from collections import defaultdict
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from functools import lru_cache
+from hashlib import blake2b
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from graphiti_core.nodes import EntityNode
+
+_NAME_ENTROPY_THRESHOLD = 1.5
+_MIN_NAME_LENGTH = 6
+_MIN_TOKEN_COUNT = 2
+_FUZZY_JACCARD_THRESHOLD = 0.9
+_MINHASH_PERMUTATIONS = 32
+_MINHASH_BAND_SIZE = 4
+
+
+def _normalize_string_exact(name: str) -> str:
+    """Lowercase text and collapse whitespace so equal names map to the same key."""
+    normalized = re.sub(r'[\s]+', ' ', name.lower())
+    return normalized.strip()
+
+
+def _normalize_name_for_fuzzy(name: str) -> str:
+    """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
+    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
+    normalized = normalized.strip()
+    return re.sub(r'[\s]+', ' ', normalized)
+
+
+def _name_entropy(normalized_name: str) -> float:
+    """Approximate text specificity using Shannon entropy over characters.
+
+    We strip spaces, count how often each character appears, and sum
+    probability * -log2(probability). Short or repetitive names yield low
+    entropy, which signals we should defer resolution to the LLM instead of
+    trusting fuzzy similarity.
+    """
+    if not normalized_name:
+        return 0.0
+
+    counts: dict[str, int] = {}
+    for char in normalized_name.replace(' ', ''):
+        counts[char] = counts.get(char, 0) + 1
+
+    total = sum(counts.values())
+    if total == 0:
+        return 0.0
+
+    entropy = 0.0
+    for count in counts.values():
+        probability = count / total
+        entropy -= probability * math.log2(probability)
+
+    return entropy
+
+
+def _has_high_entropy(normalized_name: str) -> bool:
+    """Filter out very short or low-entropy names that are unreliable for fuzzy matching."""
+    token_count = len(normalized_name.split())
+    if len(normalized_name) < _MIN_NAME_LENGTH and token_count < _MIN_TOKEN_COUNT:
+        return False
+
+    return _name_entropy(normalized_name) >= _NAME_ENTROPY_THRESHOLD
+
+
+def _shingles(normalized_name: str) -> set[str]:
+    """Create 3-gram shingles from the normalized name for MinHash calculations."""
+    cleaned = normalized_name.replace(' ', '')
+    if len(cleaned) < 2:
+        return {cleaned} if cleaned else set()
+
+    return {cleaned[i : i + 3] for i in range(len(cleaned) - 2)}
+
+
+def _hash_shingle(shingle: str, seed: int) -> int:
+    """Generate a deterministic 64-bit hash for a shingle given the permutation seed."""
+    digest = blake2b(f'{seed}:{shingle}'.encode(), digest_size=8)
+    return int.from_bytes(digest.digest(), 'big')
+
+
+def _minhash_signature(shingles: Iterable[str]) -> tuple[int, ...]:
+    """Compute the MinHash signature for the shingle set across predefined permutations."""
+    if not shingles:
+        return tuple()
+
+    seeds = range(_MINHASH_PERMUTATIONS)
+    signature: list[int] = []
+    for seed in seeds:
+        min_hash = min(_hash_shingle(shingle, seed) for shingle in shingles)
+        signature.append(min_hash)
+
+    return tuple(signature)
+
+
+def _lsh_bands(signature: Iterable[int]) -> list[tuple[int, ...]]:
+    """Split the MinHash signature into fixed-size bands for locality-sensitive hashing."""
+    signature_list = list(signature)
+    if not signature_list:
+        return []
+
+    bands: list[tuple[int, ...]] = []
+    for start in range(0, len(signature_list), _MINHASH_BAND_SIZE):
+        band = tuple(signature_list[start : start + _MINHASH_BAND_SIZE])
+        if len(band) == _MINHASH_BAND_SIZE:
+            bands.append(band)
+    return bands
+
+
+def _jaccard_similarity(a: set[str], b: set[str]) -> float:
+    """Return the Jaccard similarity between two shingle sets, handling empty edge cases."""
+    if not a and not b:
+        return 1.0
+    if not a or not b:
+        return 0.0
+
+    intersection = len(a.intersection(b))
+    union = len(a.union(b))
+    return intersection / union if union else 0.0
+
+
+@lru_cache(maxsize=512)
+def _cached_shingles(name: str) -> set[str]:
+    """Cache shingle sets per normalized name to avoid recomputation within a worker."""
+    return _shingles(name)
+
+
+@dataclass
+class DedupCandidateIndexes:
+    """Precomputed lookup structures that drive entity deduplication heuristics."""
+
+    existing_nodes: list[EntityNode]
+    nodes_by_uuid: dict[str, EntityNode]
+    normalized_existing: defaultdict[str, list[EntityNode]]
+    shingles_by_candidate: dict[str, set[str]]
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
+
+
+@dataclass
+class DedupResolutionState:
+    """Mutable resolution bookkeeping shared across deterministic and LLM passes."""
+
+    resolved_nodes: list[EntityNode | None]
+    uuid_map: dict[str, str]
+    unresolved_indices: list[int]
+    duplicate_pairs: list[tuple[EntityNode, EntityNode]] = field(default_factory=list)
+
+
+def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
+    """Precompute exact and fuzzy lookup structures once per dedupe run."""
+    normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
+    nodes_by_uuid: dict[str, EntityNode] = {}
+    shingles_by_candidate: dict[str, set[str]] = {}
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
+
+    for candidate in existing_nodes:
+        normalized = _normalize_string_exact(candidate.name)
+        normalized_existing[normalized].append(candidate)
+        nodes_by_uuid[candidate.uuid] = candidate
+
+        shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
+        shingles_by_candidate[candidate.uuid] = shingles
+
+        signature = _minhash_signature(shingles)
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            lsh_buckets[(band_index, band)].append(candidate.uuid)
+
+    return DedupCandidateIndexes(
+        existing_nodes=existing_nodes,
+        nodes_by_uuid=nodes_by_uuid,
+        normalized_existing=normalized_existing,
+        shingles_by_candidate=shingles_by_candidate,
+        lsh_buckets=lsh_buckets,
+    )
+
+
+def _resolve_with_similarity(
+    extracted_nodes: list[EntityNode],
+    indexes: DedupCandidateIndexes,
+    state: DedupResolutionState,
+) -> None:
+    """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
+    for idx, node in enumerate(extracted_nodes):
+        normalized_exact = _normalize_string_exact(node.name)
+        normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
+
+        if not _has_high_entropy(normalized_fuzzy):
+            state.unresolved_indices.append(idx)
+            continue
+
+        existing_matches = indexes.normalized_existing.get(normalized_exact, [])
+        if len(existing_matches) == 1:
+            match = existing_matches[0]
+            state.resolved_nodes[idx] = match
+            state.uuid_map[node.uuid] = match.uuid
+            if match.uuid != node.uuid:
+                state.duplicate_pairs.append((node, match))
+            continue
+        if len(existing_matches) > 1:
+            state.unresolved_indices.append(idx)
+            continue
+
+        shingles = _cached_shingles(normalized_fuzzy)
+        signature = _minhash_signature(shingles)
+        candidate_ids: set[str] = set()
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            candidate_ids.update(indexes.lsh_buckets.get((band_index, band), []))
+
+        best_candidate: EntityNode | None = None
+        best_score = 0.0
+        for candidate_id in candidate_ids:
+            candidate_shingles = indexes.shingles_by_candidate.get(candidate_id, set())
+            score = _jaccard_similarity(shingles, candidate_shingles)
+            if score > best_score:
+                best_score = score
+                best_candidate = indexes.nodes_by_uuid.get(candidate_id)
+
+        if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
+            state.resolved_nodes[idx] = best_candidate
+            state.uuid_map[node.uuid] = best_candidate.uuid
+            if best_candidate.uuid != node.uuid:
+                state.duplicate_pairs.append((node, best_candidate))
+            continue
+
+        state.unresolved_indices.append(idx)
+
+
+__all__ = [
+    'DedupCandidateIndexes',
+    'DedupResolutionState',
+    '_normalize_string_exact',
+    '_normalize_name_for_fuzzy',
+    '_has_high_entropy',
+    '_minhash_signature',
+    '_lsh_bands',
+    '_jaccard_similarity',
+    '_cached_shingles',
+    '_FUZZY_JACCARD_THRESHOLD',
+    '_build_candidate_indexes',
+    '_resolve_with_similarity',
+]
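Because every helper above is exported via `__all__`, the deterministic layer can be exercised in isolation. A small demonstration of the shingle/MinHash pipeline (the names compared are illustrative):

```python
from graphiti_core.utils.maintenance.dedup_helpers import (
    _FUZZY_JACCARD_THRESHOLD,
    _cached_shingles,
    _jaccard_similarity,
    _lsh_bands,
    _minhash_signature,
    _normalize_name_for_fuzzy,
)

# Two near-identical names: normalize, shingle into 3-grams, and compare.
a = _cached_shingles(_normalize_name_for_fuzzy('Acme Corporation'))
b = _cached_shingles(_normalize_name_for_fuzzy('ACME Corporation.'))
print(_jaccard_similarity(a, b))  # 1.0 here: case and punctuation are stripped

# MinHash + banding approximates the same comparison without pairwise scans:
# names sharing any band land in the same LSH bucket and become candidates.
sig = _minhash_signature(a)
print(len(sig), len(_lsh_bands(sig)))  # 32 permutations -> 8 bands of 4
print(_FUZZY_JACCARD_THRESHOLD)  # 0.9: the score a candidate must reach
```

Candidates that clear `_FUZZY_JACCARD_THRESHOLD` are resolved deterministically by `_resolve_with_similarity`; everything left in `DedupResolutionState.unresolved_indices` falls through to the LLM-based dedupe pass.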