graphiti-core 0.21.0rc6__py3-none-any.whl → 0.21.0rc8__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

This version of graphiti-core has been flagged as a potentially problematic release.

@@ -0,0 +1,262 @@
+ """
+ Copyright 2024, Zep Software, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ from __future__ import annotations
+
+ import math
+ import re
+ from collections import defaultdict
+ from collections.abc import Iterable
+ from dataclasses import dataclass, field
+ from functools import lru_cache
+ from hashlib import blake2b
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from graphiti_core.nodes import EntityNode
+
+ _NAME_ENTROPY_THRESHOLD = 1.5
+ _MIN_NAME_LENGTH = 6
+ _MIN_TOKEN_COUNT = 2
+ _FUZZY_JACCARD_THRESHOLD = 0.9
+ _MINHASH_PERMUTATIONS = 32
+ _MINHASH_BAND_SIZE = 4
+
+
+ def _normalize_string_exact(name: str) -> str:
+     """Lowercase text and collapse whitespace so equal names map to the same key."""
+     normalized = re.sub(r'[\s]+', ' ', name.lower())
+     return normalized.strip()
+
+
+ def _normalize_name_for_fuzzy(name: str) -> str:
+     """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
+     normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
+     normalized = normalized.strip()
+     return re.sub(r'[\s]+', ' ', normalized)
+
+
+ def _name_entropy(normalized_name: str) -> float:
+     """Approximate text specificity using Shannon entropy over characters.
+
+     We strip spaces, count how often each character appears, and sum
+     probability * -log2(probability). Short or repetitive names yield low
+     entropy, which signals we should defer resolution to the LLM instead of
+     trusting fuzzy similarity.
+     """
+     if not normalized_name:
+         return 0.0
+
+     counts: dict[str, int] = {}
+     for char in normalized_name.replace(' ', ''):
+         counts[char] = counts.get(char, 0) + 1
+
+     total = sum(counts.values())
+     if total == 0:
+         return 0.0
+
+     entropy = 0.0
+     for count in counts.values():
+         probability = count / total
+         entropy -= probability * math.log2(probability)
+
+     return entropy
+
+
+ def _has_high_entropy(normalized_name: str) -> bool:
+     """Filter out very short or low-entropy names that are unreliable for fuzzy matching."""
+     token_count = len(normalized_name.split())
+     if len(normalized_name) < _MIN_NAME_LENGTH and token_count < _MIN_TOKEN_COUNT:
+         return False
+
+     return _name_entropy(normalized_name) >= _NAME_ENTROPY_THRESHOLD
+
+
+ def _shingles(normalized_name: str) -> set[str]:
+     """Create 3-gram shingles from the normalized name for MinHash calculations."""
+     cleaned = normalized_name.replace(' ', '')
+     if len(cleaned) < 2:
+         return {cleaned} if cleaned else set()
+
+     return {cleaned[i : i + 3] for i in range(len(cleaned) - 2)}
+
+
+ def _hash_shingle(shingle: str, seed: int) -> int:
+     """Generate a deterministic 64-bit hash for a shingle given the permutation seed."""
+     digest = blake2b(f'{seed}:{shingle}'.encode(), digest_size=8)
+     return int.from_bytes(digest.digest(), 'big')
+
+
+ def _minhash_signature(shingles: Iterable[str]) -> tuple[int, ...]:
+     """Compute the MinHash signature for the shingle set across predefined permutations."""
+     if not shingles:
+         return tuple()
+
+     seeds = range(_MINHASH_PERMUTATIONS)
+     signature: list[int] = []
+     for seed in seeds:
+         min_hash = min(_hash_shingle(shingle, seed) for shingle in shingles)
+         signature.append(min_hash)
+
+     return tuple(signature)
+
+
+ def _lsh_bands(signature: Iterable[int]) -> list[tuple[int, ...]]:
+     """Split the MinHash signature into fixed-size bands for locality-sensitive hashing."""
+     signature_list = list(signature)
+     if not signature_list:
+         return []
+
+     bands: list[tuple[int, ...]] = []
+     for start in range(0, len(signature_list), _MINHASH_BAND_SIZE):
+         band = tuple(signature_list[start : start + _MINHASH_BAND_SIZE])
+         if len(band) == _MINHASH_BAND_SIZE:
+             bands.append(band)
+     return bands
+
+
+ def _jaccard_similarity(a: set[str], b: set[str]) -> float:
+     """Return the Jaccard similarity between two shingle sets, handling empty edge cases."""
+     if not a and not b:
+         return 1.0
+     if not a or not b:
+         return 0.0
+
+     intersection = len(a.intersection(b))
+     union = len(a.union(b))
+     return intersection / union if union else 0.0
+
+
+ @lru_cache(maxsize=512)
+ def _cached_shingles(name: str) -> set[str]:
+     """Cache shingle sets per normalized name to avoid recomputation within a worker."""
+     return _shingles(name)
+
+
+ @dataclass
+ class DedupCandidateIndexes:
+     """Precomputed lookup structures that drive entity deduplication heuristics."""
+
+     existing_nodes: list[EntityNode]
+     nodes_by_uuid: dict[str, EntityNode]
+     normalized_existing: defaultdict[str, list[EntityNode]]
+     shingles_by_candidate: dict[str, set[str]]
+     lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
+
+
+ @dataclass
+ class DedupResolutionState:
+     """Mutable resolution bookkeeping shared across deterministic and LLM passes."""
+
+     resolved_nodes: list[EntityNode | None]
+     uuid_map: dict[str, str]
+     unresolved_indices: list[int]
+     duplicate_pairs: list[tuple[EntityNode, EntityNode]] = field(default_factory=list)
+
+
+ def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
+     """Precompute exact and fuzzy lookup structures once per dedupe run."""
+     normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
+     nodes_by_uuid: dict[str, EntityNode] = {}
+     shingles_by_candidate: dict[str, set[str]] = {}
+     lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
+
+     for candidate in existing_nodes:
+         normalized = _normalize_string_exact(candidate.name)
+         normalized_existing[normalized].append(candidate)
+         nodes_by_uuid[candidate.uuid] = candidate
+
+         shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
+         shingles_by_candidate[candidate.uuid] = shingles
+
+         signature = _minhash_signature(shingles)
+         for band_index, band in enumerate(_lsh_bands(signature)):
+             lsh_buckets[(band_index, band)].append(candidate.uuid)
+
+     return DedupCandidateIndexes(
+         existing_nodes=existing_nodes,
+         nodes_by_uuid=nodes_by_uuid,
+         normalized_existing=normalized_existing,
+         shingles_by_candidate=shingles_by_candidate,
+         lsh_buckets=lsh_buckets,
+     )
+
+
+ def _resolve_with_similarity(
+     extracted_nodes: list[EntityNode],
+     indexes: DedupCandidateIndexes,
+     state: DedupResolutionState,
+ ) -> None:
+     """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
+     for idx, node in enumerate(extracted_nodes):
+         normalized_exact = _normalize_string_exact(node.name)
+         normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
+
+         if not _has_high_entropy(normalized_fuzzy):
+             state.unresolved_indices.append(idx)
+             continue
+
+         existing_matches = indexes.normalized_existing.get(normalized_exact, [])
+         if len(existing_matches) == 1:
+             match = existing_matches[0]
+             state.resolved_nodes[idx] = match
+             state.uuid_map[node.uuid] = match.uuid
+             if match.uuid != node.uuid:
+                 state.duplicate_pairs.append((node, match))
+             continue
+         if len(existing_matches) > 1:
+             state.unresolved_indices.append(idx)
+             continue
+
+         shingles = _cached_shingles(normalized_fuzzy)
+         signature = _minhash_signature(shingles)
+         candidate_ids: set[str] = set()
+         for band_index, band in enumerate(_lsh_bands(signature)):
+             candidate_ids.update(indexes.lsh_buckets.get((band_index, band), []))
+
+         best_candidate: EntityNode | None = None
+         best_score = 0.0
+         for candidate_id in candidate_ids:
+             candidate_shingles = indexes.shingles_by_candidate.get(candidate_id, set())
+             score = _jaccard_similarity(shingles, candidate_shingles)
+             if score > best_score:
+                 best_score = score
+                 best_candidate = indexes.nodes_by_uuid.get(candidate_id)
+
+         if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
+             state.resolved_nodes[idx] = best_candidate
+             state.uuid_map[node.uuid] = best_candidate.uuid
+             if best_candidate.uuid != node.uuid:
+                 state.duplicate_pairs.append((node, best_candidate))
+             continue
+
+         state.unresolved_indices.append(idx)
+
+
+ __all__ = [
+     'DedupCandidateIndexes',
+     'DedupResolutionState',
+     '_normalize_string_exact',
+     '_normalize_name_for_fuzzy',
+     '_has_high_entropy',
+     '_minhash_signature',
+     '_lsh_bands',
+     '_jaccard_similarity',
+     '_cached_shingles',
+     '_FUZZY_JACCARD_THRESHOLD',
+     '_build_candidate_indexes',
+     '_resolve_with_similarity',
+ ]
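The new module is self-contained, so its flow can be sketched end to end. The example below is a minimal illustration, assuming only that `EntityNode` exposes `name` and `uuid` (the `FakeNode` dataclass is a hypothetical stand-in, not library code): build the indexes once, run the deterministic pass, and anything it cannot settle lands in `unresolved_indices` for the LLM. With 32 permutations split into 8 bands of 4 rows, two names at the 0.9 Jaccard threshold share at least one LSH bucket with probability about 1 - (1 - 0.9^4)^8 ≈ 0.9998.

```python
from dataclasses import dataclass, field
from uuid import uuid4

from graphiti_core.utils.maintenance.dedup_helpers import (
    DedupResolutionState,
    _build_candidate_indexes,
    _resolve_with_similarity,
)

@dataclass
class FakeNode:  # hypothetical stand-in; only .name and .uuid are touched here
    name: str
    uuid: str = field(default_factory=lambda: uuid4().hex)

existing = [FakeNode('Alan Turing'), FakeNode('Ada Lovelace')]
extracted = [FakeNode('alan  turing'), FakeNode('Grace Hopper')]

indexes = _build_candidate_indexes(existing)  # type: ignore[arg-type]
state = DedupResolutionState(
    resolved_nodes=[None] * len(extracted),  # pre-sized; the pass fills slots by index
    uuid_map={},
    unresolved_indices=[],
)
_resolve_with_similarity(extracted, indexes, state)  # type: ignore[arg-type]

# 'alan  turing' hits the exact normalized-name index and resolves to the
# existing 'Alan Turing' node; 'Grace Hopper' matches no LSH bucket and is
# queued in state.unresolved_indices for the LLM pass.
assert state.resolved_nodes[0] is existing[0]
assert state.unresolved_indices == [1]
```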
@@ -41,6 +41,9 @@ from graphiti_core.search.search_config import SearchResults
  from graphiti_core.search.search_config_recipes import EDGE_HYBRID_SEARCH_RRF
  from graphiti_core.search.search_filters import SearchFilters
  from graphiti_core.utils.datetime_utils import ensure_utc, utc_now
+ from graphiti_core.utils.maintenance.dedup_helpers import _normalize_string_exact
+
+ DEFAULT_EDGE_NAME = 'RELATES_TO'

  logger = logging.getLogger(__name__)
@@ -229,6 +232,22 @@ async def resolve_extracted_edges(
      edge_types: dict[str, type[BaseModel]],
      edge_type_map: dict[tuple[str, str], list[str]],
  ) -> tuple[list[EntityEdge], list[EntityEdge]]:
+     # Fast path: deduplicate exact matches within the extracted edges before parallel processing
+     seen: dict[tuple[str, str, str], EntityEdge] = {}
+     deduplicated_edges: list[EntityEdge] = []
+
+     for edge in extracted_edges:
+         key = (
+             edge.source_node_uuid,
+             edge.target_node_uuid,
+             _normalize_string_exact(edge.fact),
+         )
+         if key not in seen:
+             seen[key] = edge
+             deduplicated_edges.append(edge)
+
+     extracted_edges = deduplicated_edges
+
      driver = clients.driver
      llm_client = clients.llm_client
      embedder = clients.embedder
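As a quick illustration of the fast-path key, two extracted edges with the same endpoints whose facts differ only in case or spacing collapse to a single entry (the `'u1'`/`'u2'` uuids below are made up):

```python
from graphiti_core.utils.maintenance.dedup_helpers import _normalize_string_exact

# Both facts normalize to 'alice works at acme', so the second edge would be
# skipped by the seen-dict above.
key_a = ('u1', 'u2', _normalize_string_exact('Alice  WORKS at Acme'))
key_b = ('u1', 'u2', _normalize_string_exact('alice works at acme'))
assert key_a == key_b
```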
@@ -280,8 +299,12 @@ async def resolve_extracted_edges(
      # Build entity hash table
      uuid_entity_map: dict[str, EntityNode] = {entity.uuid: entity for entity in entities}

-     # Determine which edge types are relevant for each edge
+     # Determine which edge types are relevant for each edge.
+     # `edge_types_lst` stores the subset of custom edge definitions whose
+     # node signature matches each extracted edge. Anything outside this subset
+     # should only stay on the edge if it is a non-custom (LLM generated) label.
      edge_types_lst: list[dict[str, type[BaseModel]]] = []
+     custom_type_names = set(edge_types or {})
      for extracted_edge in extracted_edges:
          source_node = uuid_entity_map.get(extracted_edge.source_node_uuid)
          target_node = uuid_entity_map.get(extracted_edge.target_node_uuid)
@@ -309,6 +332,20 @@ async def resolve_extracted_edges(

          edge_types_lst.append(extracted_edge_types)

+     for extracted_edge, extracted_edge_types in zip(extracted_edges, edge_types_lst, strict=True):
+         allowed_type_names = set(extracted_edge_types)
+         is_custom_name = extracted_edge.name in custom_type_names
+         if not allowed_type_names:
+             # No custom types are valid for this node pairing. Keep LLM generated
+             # labels, but flip disallowed custom names back to the default.
+             if is_custom_name and extracted_edge.name != DEFAULT_EDGE_NAME:
+                 extracted_edge.name = DEFAULT_EDGE_NAME
+             continue
+         if is_custom_name and extracted_edge.name not in allowed_type_names:
+             # Custom name exists but it is not permitted for this source/target
+             # signature, so fall back to the default edge label.
+             extracted_edge.name = DEFAULT_EDGE_NAME
+
      # resolve edges with related edges in the graph and find invalidation candidates
      results: list[tuple[EntityEdge, list[EntityEdge], list[EntityEdge]]] = list(
          await semaphore_gather(
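Condensed, the gating loop above implements a small decision rule. The sketch below is an illustrative reimplementation (not library code) with the behavior spelled out as assertions; the type names are invented:

```python
DEFAULT_EDGE_NAME = 'RELATES_TO'

def gate_label(name: str, allowed: set[str], custom: set[str]) -> str:
    """Return the edge label the loop above would leave in place."""
    if name in custom and name not in allowed:
        return DEFAULT_EDGE_NAME  # disallowed custom type falls back to the default
    return name  # allowed custom types and ad-hoc LLM labels pass through

# Custom type not valid for this node pairing -> reset to the default label.
assert gate_label('WORKS_AT', allowed=set(), custom={'WORKS_AT'}) == 'RELATES_TO'
# Ad-hoc LLM label survives even when no custom types apply.
assert gate_label('MENTIONS', allowed=set(), custom={'WORKS_AT'}) == 'MENTIONS'
# Custom type that matches the signature is kept.
assert gate_label('WORKS_AT', allowed={'WORKS_AT'}, custom={'WORKS_AT'}) == 'WORKS_AT'
```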
@@ -320,6 +357,7 @@ async def resolve_extracted_edges(
                      existing_edges,
                      episode,
                      extracted_edge_types,
+                     custom_type_names,
                      clients.ensure_ascii,
                  )
                  for extracted_edge, related_edges, existing_edges, extracted_edge_types in zip(
@@ -391,17 +429,59 @@ async def resolve_extracted_edge(
      related_edges: list[EntityEdge],
      existing_edges: list[EntityEdge],
      episode: EpisodicNode,
-     edge_types: dict[str, type[BaseModel]] | None = None,
+     edge_type_candidates: dict[str, type[BaseModel]] | None = None,
+     custom_edge_type_names: set[str] | None = None,
      ensure_ascii: bool = True,
  ) -> tuple[EntityEdge, list[EntityEdge], list[EntityEdge]]:
+     """Resolve an extracted edge against existing graph context.
+
+     Parameters
+     ----------
+     llm_client : LLMClient
+         Client used to invoke the LLM for deduplication and attribute extraction.
+     extracted_edge : EntityEdge
+         Newly extracted edge whose canonical representation is being resolved.
+     related_edges : list[EntityEdge]
+         Candidate edges with identical endpoints used for duplicate detection.
+     existing_edges : list[EntityEdge]
+         Broader set of edges evaluated for contradiction / invalidation.
+     episode : EpisodicNode
+         Episode providing content context when extracting edge attributes.
+     edge_type_candidates : dict[str, type[BaseModel]] | None
+         Custom edge types permitted for the current source/target signature.
+     custom_edge_type_names : set[str] | None
+         Full catalog of registered custom edge names. Used to distinguish
+         between disallowed custom types (which fall back to the default label)
+         and ad-hoc labels emitted by the LLM.
+     ensure_ascii : bool
+         Whether prompt payloads should coerce ASCII output.
+
+     Returns
+     -------
+     tuple[EntityEdge, list[EntityEdge], list[EntityEdge]]
+         The resolved edge, any duplicates, and edges to invalidate.
+     """
      if len(related_edges) == 0 and len(existing_edges) == 0:
          return extracted_edge, [], []

+     # Fast path: if the fact text and endpoints already exist verbatim, reuse the matching edge.
+     normalized_fact = _normalize_string_exact(extracted_edge.fact)
+     for edge in related_edges:
+         if (
+             edge.source_node_uuid == extracted_edge.source_node_uuid
+             and edge.target_node_uuid == extracted_edge.target_node_uuid
+             and _normalize_string_exact(edge.fact) == normalized_fact
+         ):
+             resolved = edge
+             if episode is not None and episode.uuid not in resolved.episodes:
+                 resolved.episodes.append(episode.uuid)
+             return resolved, [], []
+
      start = time()

      # Prepare context for LLM
      related_edges_context = [
-         {'id': edge.uuid, 'fact': edge.fact} for i, edge in enumerate(related_edges)
+         {'id': i, 'fact': edge.fact} for i, edge in enumerate(related_edges)
      ]

      invalidation_edge_candidates_context = [
@@ -415,9 +495,9 @@ async def resolve_extracted_edge(
              'fact_type_name': type_name,
              'fact_type_description': type_model.__doc__,
          }
-         for i, (type_name, type_model) in enumerate(edge_types.items())
+         for i, (type_name, type_model) in enumerate(edge_type_candidates.items())
      ]
-     if edge_types is not None
+     if edge_type_candidates is not None
      else []
  )
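For reference, that comprehension yields one entry per candidate type. A hypothetical example with a stand-in Pydantic model (the `WorksAt` class and `WORKS_AT` key are invented for illustration):

```python
from pydantic import BaseModel

class WorksAt(BaseModel):
    """Employment relationship between a person and an organization."""

edge_type_candidates = {'WORKS_AT': WorksAt}
edge_types_context = [
    {
        'fact_type_id': i,
        'fact_type_name': type_name,
        'fact_type_description': type_model.__doc__,
    }
    for i, (type_name, type_model) in enumerate(edge_type_candidates.items())
]
# -> [{'fact_type_id': 0, 'fact_type_name': 'WORKS_AT',
#      'fact_type_description': 'Employment relationship between a person and an organization.'}]
```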
@@ -454,7 +534,16 @@ async def resolve_extracted_edge(
      ]

      fact_type: str = response_object.fact_type
-     if fact_type.upper() != 'DEFAULT' and edge_types is not None:
+     candidate_type_names = set(edge_type_candidates or {})
+     custom_type_names = custom_edge_type_names or set()
+
+     is_default_type = fact_type.upper() == 'DEFAULT'
+     is_custom_type = fact_type in custom_type_names
+     is_allowed_custom_type = fact_type in candidate_type_names
+
+     if is_allowed_custom_type:
+         # The LLM selected a custom type that is allowed for the node pair.
+         # Adopt the custom type and, if needed, extract its structured attributes.
          resolved_edge.name = fact_type

          edge_attributes_context = {
@@ -464,7 +553,7 @@ async def resolve_extracted_edge(
              'ensure_ascii': ensure_ascii,
          }

-         edge_model = edge_types.get(fact_type)
+         edge_model = edge_type_candidates.get(fact_type) if edge_type_candidates else None
          if edge_model is not None and len(edge_model.model_fields) != 0:
              edge_attributes_response = await llm_client.generate_response(
                  prompt_library.extract_edges.extract_attributes(edge_attributes_context),
@@ -473,6 +562,16 @@ async def resolve_extracted_edge(
              )

              resolved_edge.attributes = edge_attributes_response
+     elif not is_default_type and is_custom_type:
+         # The LLM picked a custom type that is not allowed for this signature.
+         # Reset to the default label and drop any structured attributes.
+         resolved_edge.name = DEFAULT_EDGE_NAME
+         resolved_edge.attributes = {}
+     elif not is_default_type:
+         # Non-custom labels are allowed to pass through so long as the LLM does
+         # not return the sentinel DEFAULT value.
+         resolved_edge.name = fact_type
+         resolved_edge.attributes = {}

      end = time()
      logger.debug(
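Taken together, the branches above reduce to a four-way rule on `fact_type`. The function below is an illustrative restatement (assuming the default label `'RELATES_TO'`), not the library API:

```python
def resolve_name(current: str, fact_type: str,
                 candidates: set[str], custom: set[str]) -> str:
    if fact_type in candidates:
        return fact_type      # allowed custom type: adopt it (attributes may be extracted)
    if fact_type.upper() == 'DEFAULT':
        return current        # sentinel DEFAULT: leave the edge name untouched
    if fact_type in custom:
        return 'RELATES_TO'   # disallowed custom type: reset, attributes dropped
    return fact_type          # any other ad-hoc label passes through, attributes dropped
```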