graphiti-core 0.21.0rc6__py3-none-any.whl → 0.21.0rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


graphiti_core/graphiti.py CHANGED
@@ -1070,6 +1070,7 @@ class Graphiti:
  group_id=edge.group_id,
  ),
  None,
+ None,
  self.ensure_ascii,
  )
graphiti_core/llm_client/client.py CHANGED
@@ -32,9 +32,19 @@ from .errors import RateLimitError
  DEFAULT_TEMPERATURE = 0
  DEFAULT_CACHE_DIR = './llm_cache'

- MULTILINGUAL_EXTRACTION_RESPONSES = (
- '\n\nAny extracted information should be returned in the same language as it was written in.'
- )
+
+ def get_extraction_language_instruction() -> str:
+ """Returns instruction for language extraction behavior.
+
+ Override this function to customize language extraction:
+ - Return empty string to disable multilingual instructions
+ - Return custom instructions for specific language requirements
+
+ Returns:
+ str: Language instruction to append to system messages
+ """
+ return '\n\nAny extracted information should be returned in the same language as it was written in.'
+

  logger = logging.getLogger(__name__)

@@ -145,7 +155,7 @@ class LLMClient(ABC):
  )

  # Add multilingual extraction instructions
- messages[0].content += MULTILINGUAL_EXTRACTION_RESPONSES
+ messages[0].content += get_extraction_language_instruction()

  if self.cache_enabled and self.cache_dir is not None:
  cache_key = self._get_cache_key(messages)
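The constant is now a module-level hook, so downstream code can swap the instruction at runtime. A minimal sketch of one way to do that, not library API: because the concrete clients bind the name at import time via `from .client import ...`, each client module you actually use must be patched individually, and the provider-specific import below assumes the corresponding extra (e.g. openai) is installed.

    # Hedged sketch: silence the multilingual suffix by replacing the hook.
    from graphiti_core.llm_client import client as base_client
    from graphiti_core.llm_client import openai_base_client  # patch each provider module you use


    def no_language_instruction() -> str:
        """Return no extra instruction so system prompts are left untouched."""
        return ''


    base_client.get_extraction_language_instruction = no_language_instruction
    openai_base_client.get_extraction_language_instruction = no_language_instruction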
graphiti_core/llm_client/gemini_client.py CHANGED
@@ -23,7 +23,7 @@ from typing import TYPE_CHECKING, ClassVar
  from pydantic import BaseModel

  from ..prompts.models import Message
- from .client import MULTILINGUAL_EXTRACTION_RESPONSES, LLMClient
+ from .client import LLMClient, get_extraction_language_instruction
  from .config import LLMConfig, ModelSize
  from .errors import RateLimitError

@@ -376,7 +376,7 @@ class GeminiClient(LLMClient):
  last_output = None

  # Add multilingual extraction instructions
- messages[0].content += MULTILINGUAL_EXTRACTION_RESPONSES
+ messages[0].content += get_extraction_language_instruction()

  while retry_count < self.MAX_RETRIES:
  try:
graphiti_core/llm_client/openai_base_client.py CHANGED
@@ -25,7 +25,7 @@ from openai.types.chat import ChatCompletionMessageParam
  from pydantic import BaseModel

  from ..prompts.models import Message
- from .client import MULTILINGUAL_EXTRACTION_RESPONSES, LLMClient
+ from .client import LLMClient, get_extraction_language_instruction
  from .config import DEFAULT_MAX_TOKENS, LLMConfig, ModelSize
  from .errors import RateLimitError, RefusalError

@@ -184,7 +184,7 @@ class BaseOpenAIClient(LLMClient):
  last_error = None

  # Add multilingual extraction instructions
- messages[0].content += MULTILINGUAL_EXTRACTION_RESPONSES
+ messages[0].content += get_extraction_language_instruction()

  while retry_count <= self.MAX_RETRIES:
  try:
graphiti_core/llm_client/openai_generic_client.py CHANGED
@@ -25,7 +25,7 @@ from openai.types.chat import ChatCompletionMessageParam
  from pydantic import BaseModel

  from ..prompts.models import Message
- from .client import MULTILINGUAL_EXTRACTION_RESPONSES, LLMClient
+ from .client import LLMClient, get_extraction_language_instruction
  from .config import DEFAULT_MAX_TOKENS, LLMConfig, ModelSize
  from .errors import RateLimitError, RefusalError

@@ -136,7 +136,7 @@ class OpenAIGenericClient(LLMClient):
  )

  # Add multilingual extraction instructions
- messages[0].content += MULTILINGUAL_EXTRACTION_RESPONSES
+ messages[0].content += get_extraction_language_instruction()

  while retry_count <= self.MAX_RETRIES:
  try:
graphiti_core/prompts/dedupe_nodes.py CHANGED
@@ -92,12 +92,23 @@ def node(context: dict[str, Any]) -> list[Message]:

  TASK:
  1. Compare `new_entity` against each item in `existing_entities`.
- 2. If it refers to the same realworld object or concept, collect its index.
- 3. Let `duplicate_idx` = the *first* collected index, or 1 if none.
- 4. Let `duplicates` = the list of *all* collected indices (empty list if none).
-
- Also return the full name of the NEW ENTITY (whether it is the name of the NEW ENTITY, a node it
- is a duplicate of, or a combination of the two).
+ 2. If it refers to the same real-world object or concept, collect its index.
+ 3. Let `duplicate_idx` = the smallest collected index, or -1 if none.
+ 4. Let `duplicates` = the sorted list of all collected indices (empty list if none).
+
+ Respond with a JSON object containing an "entity_resolutions" array with a single entry:
+ {{
+ "entity_resolutions": [
+ {{
+ "id": integer id from NEW ENTITY,
+ "name": the best full name for the entity,
+ "duplicate_idx": integer index of the best duplicate in EXISTING ENTITIES, or -1 if none,
+ "duplicates": sorted list of all duplicate indices you collected (deduplicate the list, use [] when none)
+ }}
+ ]
+ }}
+
+ Only reference indices that appear in EXISTING ENTITIES, and return [] / -1 when unsure.
  """,
  ),
  ]
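For reference, a response that satisfies the contract above would look like the following sketch (values are hypothetical):

    # Illustrative only: NEW ENTITY 0 duplicates the candidates at idx 1 and 3.
    expected_response = {
        'entity_resolutions': [
            {
                'id': 0,
                'name': 'Alice Smith',   # best full name for the entity
                'duplicate_idx': 1,      # smallest collected index
                'duplicates': [1, 3],    # all collected indices, sorted and deduplicated
            }
        ]
    }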
@@ -126,26 +137,26 @@ def nodes(context: dict[str, Any]) -> list[Message]:
  {{
  id: integer id of the entity,
  name: "name of the entity",
- entity_type: "ontological classification of the entity",
- entity_type_description: "Description of what the entity type represents",
- duplication_candidates: [
- {{
- idx: integer index of the candidate entity,
- name: "name of the candidate entity",
- entity_type: "ontological classification of the candidate entity",
- ...<additional attributes>
- }}
- ]
+ entity_type: ["Entity", "<optional additional label>", ...],
+ entity_type_description: "Description of what the entity type represents"
  }}
-
+
  <ENTITIES>
  {to_prompt_json(context['extracted_nodes'], ensure_ascii=context.get('ensure_ascii', True), indent=2)}
  </ENTITIES>
-
+
  <EXISTING ENTITIES>
  {to_prompt_json(context['existing_nodes'], ensure_ascii=context.get('ensure_ascii', True), indent=2)}
  </EXISTING ENTITIES>

+ Each entry in EXISTING ENTITIES is an object with the following structure:
+ {{
+ idx: integer index of the candidate entity (use this when referencing a duplicate),
+ name: "name of the candidate entity",
+ entity_types: ["Entity", "<optional additional label>", ...],
+ ...<additional attributes such as summaries or metadata>
+ }}
+
  For each of the above ENTITIES, determine if the entity is a duplicate of any of the EXISTING ENTITIES.

  Entities should only be considered duplicates if they refer to the *same real-world object or concept*.
@@ -155,14 +166,19 @@ def nodes(context: dict[str, Any]) -> list[Message]:
  - They have similar names or purposes but refer to separate instances or concepts.

  Task:
- Your response will be a list called entity_resolutions which contains one entry for each entity.
-
- For each entity, return the id of the entity as id, the name of the entity as name, and the duplicate_idx
- as an integer.
-
- - If an entity is a duplicate of one of the EXISTING ENTITIES, return the idx of the candidate it is a
- duplicate of.
- - If an entity is not a duplicate of one of the EXISTING ENTITIES, return the -1 as the duplication_idx
+ Respond with a JSON object that contains an "entity_resolutions" array with one entry for each entity in ENTITIES, ordered by the entity id.
+
+ For every entity, return an object with the following keys:
+ {{
+ "id": integer id from ENTITIES,
+ "name": the best full name for the entity (preserve the original name unless a duplicate has a more complete name),
+ "duplicate_idx": the idx of the EXISTING ENTITY that is the best duplicate match, or -1 if there is no duplicate,
+ "duplicates": a sorted list of all idx values from EXISTING ENTITIES that refer to duplicates (deduplicate the list, use [] when none or unsure)
+ }}
+
+ - Only use idx values that appear in EXISTING ENTITIES.
+ - Set duplicate_idx to the smallest idx you collected for that entity, or -1 if duplicates is empty.
+ - Never fabricate entities or indices.
  """,
  ),
  ]
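A small worked instance of this contract, with hypothetical context payloads shaped like the ENTITIES and EXISTING ENTITIES blocks above:

    # Illustrative values only, not taken from the library.
    extracted_nodes_context = [
        {'id': 0, 'name': 'Acme Corp', 'entity_type': ['Entity', 'Organization'],
         'entity_type_description': 'A company or institution'},
        {'id': 1, 'name': 'Berlin office', 'entity_type': ['Entity'],
         'entity_type_description': 'Default Entity Type'},
    ]
    existing_nodes_context = [
        {'idx': 0, 'name': 'ACME Corporation', 'entity_types': ['Entity', 'Organization']},
        {'idx': 1, 'name': 'Globex', 'entity_types': ['Entity', 'Organization']},
    ]
    expected = {
        'entity_resolutions': [
            {'id': 0, 'name': 'ACME Corporation', 'duplicate_idx': 0, 'duplicates': [0]},
            {'id': 1, 'name': 'Berlin office', 'duplicate_idx': -1, 'duplicates': []},
        ]
    }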
graphiti_core/prompts/extract_nodes.py CHANGED
@@ -152,7 +152,8 @@ Indicate the classified entity type by providing its entity_type_id.

  Guidelines:
  1. Always try to extract an entities that the JSON represents. This will often be something like a "name" or "user field
- 2. Do NOT extract any properties that contain dates
+ 2. Extract all entities mentioned in all other properties throughout the JSON structure
+ 3. Do NOT extract any properties that contain dates
  """
  return [
  Message(role='system', content=sys_prompt),
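As a hypothetical illustration of these guidelines, consider an episode body like the following; the mapping in the comments is only what the guidelines describe, not library output:

    episode_body = {
        'user': {'name': 'Jane Doe'},          # name/user field -> entity "Jane Doe"
        'employer': 'Acme Corp',                # other property -> entity "Acme Corp"
        'notes': 'Met Bob Jones in Berlin',     # other property -> entities "Bob Jones", "Berlin"
        'created_at': '2024-03-01T12:00:00Z',   # date property: not extracted
    }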
graphiti_core/utils/bulk_utils.py CHANGED
@@ -43,8 +43,14 @@ from graphiti_core.models.nodes.node_db_queries import (
  get_entity_node_save_bulk_query,
  get_episode_node_save_bulk_query,
  )
- from graphiti_core.nodes import EntityNode, EpisodeType, EpisodicNode, create_entity_node_embeddings
+ from graphiti_core.nodes import EntityNode, EpisodeType, EpisodicNode
  from graphiti_core.utils.datetime_utils import convert_datetimes_to_strings
+ from graphiti_core.utils.maintenance.dedup_helpers import (
+ DedupResolutionState,
+ _build_candidate_indexes,
+ _normalize_string_exact,
+ _resolve_with_similarity,
+ )
  from graphiti_core.utils.maintenance.edge_operations import (
  extract_edges,
  resolve_extracted_edge,
@@ -63,6 +69,38 @@ logger = logging.getLogger(__name__)
  CHUNK_SIZE = 10


+ def _build_directed_uuid_map(pairs: list[tuple[str, str]]) -> dict[str, str]:
+ """Collapse alias -> canonical chains while preserving direction.
+
+ The incoming pairs represent directed mappings discovered during node dedupe. We use a simple
+ union-find with iterative path compression to ensure every source UUID resolves to its ultimate
+ canonical target, even if aliases appear lexicographically smaller than the canonical UUID.
+ """
+
+ parent: dict[str, str] = {}
+
+ def find(uuid: str) -> str:
+ """Directed union-find lookup using iterative path compression."""
+ parent.setdefault(uuid, uuid)
+ root = uuid
+ while parent[root] != root:
+ root = parent[root]
+
+ while parent[uuid] != root:
+ next_uuid = parent[uuid]
+ parent[uuid] = root
+ uuid = next_uuid
+
+ return root
+
+ for source_uuid, target_uuid in pairs:
+ parent.setdefault(source_uuid, source_uuid)
+ parent.setdefault(target_uuid, target_uuid)
+ parent[find(source_uuid)] = find(target_uuid)
+
+ return {uuid: find(uuid) for uuid in parent}
+
+
  class RawEpisode(BaseModel):
  name: str
  uuid: str | None = Field(default=None)
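A quick sketch of the chain collapsing the docstring describes; the helper is private to bulk_utils, so the import is shown for illustration only:

    from graphiti_core.utils.bulk_utils import _build_directed_uuid_map

    pairs = [
        ('uuid-b', 'uuid-a'),  # b was deduped onto a
        ('uuid-c', 'uuid-b'),  # c was deduped onto b, which itself points at a
    ]
    mapping = _build_directed_uuid_map(pairs)
    assert mapping['uuid-c'] == 'uuid-a'  # the chain c -> b -> a is collapsed
    assert mapping['uuid-b'] == 'uuid-a'
    assert mapping['uuid-a'] == 'uuid-a'  # canonical targets map to themselves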
@@ -266,83 +304,111 @@ async def dedupe_nodes_bulk(
266
304
  episode_tuples: list[tuple[EpisodicNode, list[EpisodicNode]]],
267
305
  entity_types: dict[str, type[BaseModel]] | None = None,
268
306
  ) -> tuple[dict[str, list[EntityNode]], dict[str, str]]:
269
- embedder = clients.embedder
270
- min_score = 0.8
271
-
272
- # generate embeddings
273
- await semaphore_gather(
274
- *[create_entity_node_embeddings(embedder, nodes) for nodes in extracted_nodes]
275
- )
276
-
277
- # Find similar results
278
- dedupe_tuples: list[tuple[list[EntityNode], list[EntityNode]]] = []
279
- for i, nodes_i in enumerate(extracted_nodes):
280
- existing_nodes: list[EntityNode] = []
281
- for j, nodes_j in enumerate(extracted_nodes):
282
- if i == j:
283
- continue
284
- existing_nodes += nodes_j
285
-
286
- candidates_i: list[EntityNode] = []
287
- for node in nodes_i:
288
- for existing_node in existing_nodes:
289
- # Approximate BM25 by checking for word overlaps (this is faster than creating many in-memory indices)
290
- # This approach will cast a wider net than BM25, which is ideal for this use case
291
- node_words = set(node.name.lower().split())
292
- existing_node_words = set(existing_node.name.lower().split())
293
- has_overlap = not node_words.isdisjoint(existing_node_words)
294
- if has_overlap:
295
- candidates_i.append(existing_node)
296
- continue
307
+ """Resolve entity duplicates across an in-memory batch using a two-pass strategy.
297
308
 
298
- # Check for semantic similarity even if there is no overlap
299
- similarity = np.dot(
300
- normalize_l2(node.name_embedding or []),
301
- normalize_l2(existing_node.name_embedding or []),
302
- )
303
- if similarity >= min_score:
304
- candidates_i.append(existing_node)
305
-
306
- dedupe_tuples.append((nodes_i, candidates_i))
309
+ 1. Run :func:`resolve_extracted_nodes` for every episode in parallel so each batch item is
310
+ reconciled against the live graph just like the non-batch flow.
311
+ 2. Re-run the deterministic similarity heuristics across the union of resolved nodes to catch
312
+ duplicates that only co-occur inside this batch, emitting a canonical UUID map that callers
313
+ can apply to edges and persistence.
314
+ """
307
315
 
308
- # Determine Node Resolutions
309
- bulk_node_resolutions: list[
310
- tuple[list[EntityNode], dict[str, str], list[tuple[EntityNode, EntityNode]]]
311
- ] = await semaphore_gather(
316
+ first_pass_results = await semaphore_gather(
312
317
  *[
313
318
  resolve_extracted_nodes(
314
319
  clients,
315
- dedupe_tuple[0],
320
+ nodes,
316
321
  episode_tuples[i][0],
317
322
  episode_tuples[i][1],
318
323
  entity_types,
319
- existing_nodes_override=dedupe_tuples[i][1],
320
324
  )
321
- for i, dedupe_tuple in enumerate(dedupe_tuples)
325
+ for i, nodes in enumerate(extracted_nodes)
322
326
  ]
323
327
  )
324
328
 
325
- # Collect all duplicate pairs sorted by uuid
329
+ episode_resolutions: list[tuple[str, list[EntityNode]]] = []
330
+ per_episode_uuid_maps: list[dict[str, str]] = []
326
331
  duplicate_pairs: list[tuple[str, str]] = []
327
- for _, _, duplicates in bulk_node_resolutions:
328
- for duplicate in duplicates:
329
- n, m = duplicate
330
- duplicate_pairs.append((n.uuid, m.uuid))
331
332
 
332
- # Now we compress the duplicate_map, so that 3 -> 2 and 2 -> becomes 3 -> 1 (sorted by uuid)
333
- compressed_map: dict[str, str] = compress_uuid_map(duplicate_pairs)
333
+ for (resolved_nodes, uuid_map, duplicates), (episode, _) in zip(
334
+ first_pass_results, episode_tuples, strict=True
335
+ ):
336
+ episode_resolutions.append((episode.uuid, resolved_nodes))
337
+ per_episode_uuid_maps.append(uuid_map)
338
+ duplicate_pairs.extend((source.uuid, target.uuid) for source, target in duplicates)
339
+
340
+ canonical_nodes: dict[str, EntityNode] = {}
341
+ for _, resolved_nodes in episode_resolutions:
342
+ for node in resolved_nodes:
343
+ # NOTE: this loop is O(n^2) in the number of nodes inside the batch because we rebuild
344
+ # the MinHash index for the accumulated canonical pool each time. The LRU-backed
345
+ # shingle cache keeps the constant factors low for typical batch sizes (≤ CHUNK_SIZE),
346
+ # but if batches grow significantly we should switch to an incremental index or chunked
347
+ # processing.
348
+ if not canonical_nodes:
349
+ canonical_nodes[node.uuid] = node
350
+ continue
334
351
 
335
- node_uuid_map: dict[str, EntityNode] = {
336
- node.uuid: node for nodes in extracted_nodes for node in nodes
337
- }
352
+ existing_candidates = list(canonical_nodes.values())
353
+ normalized = _normalize_string_exact(node.name)
354
+ exact_match = next(
355
+ (
356
+ candidate
357
+ for candidate in existing_candidates
358
+ if _normalize_string_exact(candidate.name) == normalized
359
+ ),
360
+ None,
361
+ )
362
+ if exact_match is not None:
363
+ if exact_match.uuid != node.uuid:
364
+ duplicate_pairs.append((node.uuid, exact_match.uuid))
365
+ continue
366
+
367
+ indexes = _build_candidate_indexes(existing_candidates)
368
+ state = DedupResolutionState(
369
+ resolved_nodes=[None],
370
+ uuid_map={},
371
+ unresolved_indices=[],
372
+ )
373
+ _resolve_with_similarity([node], indexes, state)
374
+
375
+ resolved = state.resolved_nodes[0]
376
+ if resolved is None:
377
+ canonical_nodes[node.uuid] = node
378
+ continue
379
+
380
+ canonical_uuid = resolved.uuid
381
+ canonical_nodes.setdefault(canonical_uuid, resolved)
382
+ if canonical_uuid != node.uuid:
383
+ duplicate_pairs.append((node.uuid, canonical_uuid))
384
+
385
+ union_pairs: list[tuple[str, str]] = []
386
+ for uuid_map in per_episode_uuid_maps:
387
+ union_pairs.extend(uuid_map.items())
388
+ union_pairs.extend(duplicate_pairs)
389
+
390
+ compressed_map: dict[str, str] = _build_directed_uuid_map(union_pairs)
338
391
 
339
392
  nodes_by_episode: dict[str, list[EntityNode]] = {}
340
- for i, nodes in enumerate(extracted_nodes):
341
- episode = episode_tuples[i][0]
393
+ for episode_uuid, resolved_nodes in episode_resolutions:
394
+ deduped_nodes: list[EntityNode] = []
395
+ seen: set[str] = set()
396
+ for node in resolved_nodes:
397
+ canonical_uuid = compressed_map.get(node.uuid, node.uuid)
398
+ if canonical_uuid in seen:
399
+ continue
400
+ seen.add(canonical_uuid)
401
+ canonical_node = canonical_nodes.get(canonical_uuid)
402
+ if canonical_node is None:
403
+ logger.error(
404
+ 'Canonical node %s missing during batch dedupe; falling back to %s',
405
+ canonical_uuid,
406
+ node.uuid,
407
+ )
408
+ canonical_node = node
409
+ deduped_nodes.append(canonical_node)
342
410
 
343
- nodes_by_episode[episode.uuid] = [
344
- node_uuid_map[compressed_map.get(node.uuid, node.uuid)] for node in nodes
345
- ]
411
+ nodes_by_episode[episode_uuid] = deduped_nodes
346
412
 
347
413
  return nodes_by_episode, compressed_map
348
414
 
@@ -411,6 +477,7 @@ async def dedupe_edges_bulk(
  candidates,
  episode,
  edge_types,
+ set(edge_types),
  clients.ensure_ascii,
  )
  for episode, edge, candidates in dedupe_tuples
graphiti_core/utils/maintenance/dedup_helpers.py ADDED
@@ -0,0 +1,262 @@
1
+ """
2
+ Copyright 2024, Zep Software, Inc.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import math
20
+ import re
21
+ from collections import defaultdict
22
+ from collections.abc import Iterable
23
+ from dataclasses import dataclass, field
24
+ from functools import lru_cache
25
+ from hashlib import blake2b
26
+ from typing import TYPE_CHECKING
27
+
28
+ if TYPE_CHECKING:
29
+ from graphiti_core.nodes import EntityNode
30
+
31
+ _NAME_ENTROPY_THRESHOLD = 1.5
32
+ _MIN_NAME_LENGTH = 6
33
+ _MIN_TOKEN_COUNT = 2
34
+ _FUZZY_JACCARD_THRESHOLD = 0.9
35
+ _MINHASH_PERMUTATIONS = 32
36
+ _MINHASH_BAND_SIZE = 4
37
+
38
+
39
+ def _normalize_string_exact(name: str) -> str:
40
+ """Lowercase text and collapse whitespace so equal names map to the same key."""
41
+ normalized = re.sub(r'[\s]+', ' ', name.lower())
42
+ return normalized.strip()
43
+
44
+
45
+ def _normalize_name_for_fuzzy(name: str) -> str:
46
+ """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
47
+ normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
48
+ normalized = normalized.strip()
49
+ return re.sub(r'[\s]+', ' ', normalized)
50
+
51
+
52
+ def _name_entropy(normalized_name: str) -> float:
53
+ """Approximate text specificity using Shannon entropy over characters.
54
+
55
+ We strip spaces, count how often each character appears, and sum
56
+ probability * -log2(probability). Short or repetitive names yield low
57
+ entropy, which signals we should defer resolution to the LLM instead of
58
+ trusting fuzzy similarity.
59
+ """
60
+ if not normalized_name:
61
+ return 0.0
62
+
63
+ counts: dict[str, int] = {}
64
+ for char in normalized_name.replace(' ', ''):
65
+ counts[char] = counts.get(char, 0) + 1
66
+
67
+ total = sum(counts.values())
68
+ if total == 0:
69
+ return 0.0
70
+
71
+ entropy = 0.0
72
+ for count in counts.values():
73
+ probability = count / total
74
+ entropy -= probability * math.log2(probability)
75
+
76
+ return entropy
77
+
78
+
79
+ def _has_high_entropy(normalized_name: str) -> bool:
80
+ """Filter out very short or low-entropy names that are unreliable for fuzzy matching."""
81
+ token_count = len(normalized_name.split())
82
+ if len(normalized_name) < _MIN_NAME_LENGTH and token_count < _MIN_TOKEN_COUNT:
83
+ return False
84
+
85
+ return _name_entropy(normalized_name) >= _NAME_ENTROPY_THRESHOLD
86
+
87
+
88
+ def _shingles(normalized_name: str) -> set[str]:
89
+ """Create 3-gram shingles from the normalized name for MinHash calculations."""
90
+ cleaned = normalized_name.replace(' ', '')
91
+ if len(cleaned) < 2:
92
+ return {cleaned} if cleaned else set()
93
+
94
+ return {cleaned[i : i + 3] for i in range(len(cleaned) - 2)}
95
+
96
+
97
+ def _hash_shingle(shingle: str, seed: int) -> int:
98
+ """Generate a deterministic 64-bit hash for a shingle given the permutation seed."""
99
+ digest = blake2b(f'{seed}:{shingle}'.encode(), digest_size=8)
100
+ return int.from_bytes(digest.digest(), 'big')
101
+
102
+
103
+ def _minhash_signature(shingles: Iterable[str]) -> tuple[int, ...]:
104
+ """Compute the MinHash signature for the shingle set across predefined permutations."""
105
+ if not shingles:
106
+ return tuple()
107
+
108
+ seeds = range(_MINHASH_PERMUTATIONS)
109
+ signature: list[int] = []
110
+ for seed in seeds:
111
+ min_hash = min(_hash_shingle(shingle, seed) for shingle in shingles)
112
+ signature.append(min_hash)
113
+
114
+ return tuple(signature)
115
+
116
+
117
+ def _lsh_bands(signature: Iterable[int]) -> list[tuple[int, ...]]:
118
+ """Split the MinHash signature into fixed-size bands for locality-sensitive hashing."""
119
+ signature_list = list(signature)
120
+ if not signature_list:
121
+ return []
122
+
123
+ bands: list[tuple[int, ...]] = []
124
+ for start in range(0, len(signature_list), _MINHASH_BAND_SIZE):
125
+ band = tuple(signature_list[start : start + _MINHASH_BAND_SIZE])
126
+ if len(band) == _MINHASH_BAND_SIZE:
127
+ bands.append(band)
128
+ return bands
129
+
130
+
131
+ def _jaccard_similarity(a: set[str], b: set[str]) -> float:
132
+ """Return the Jaccard similarity between two shingle sets, handling empty edge cases."""
133
+ if not a and not b:
134
+ return 1.0
135
+ if not a or not b:
136
+ return 0.0
137
+
138
+ intersection = len(a.intersection(b))
139
+ union = len(a.union(b))
140
+ return intersection / union if union else 0.0
141
+
142
+
143
+ @lru_cache(maxsize=512)
144
+ def _cached_shingles(name: str) -> set[str]:
145
+ """Cache shingle sets per normalized name to avoid recomputation within a worker."""
146
+ return _shingles(name)
147
+
148
+
149
+ @dataclass
150
+ class DedupCandidateIndexes:
151
+ """Precomputed lookup structures that drive entity deduplication heuristics."""
152
+
153
+ existing_nodes: list[EntityNode]
154
+ nodes_by_uuid: dict[str, EntityNode]
155
+ normalized_existing: defaultdict[str, list[EntityNode]]
156
+ shingles_by_candidate: dict[str, set[str]]
157
+ lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
158
+
159
+
160
+ @dataclass
161
+ class DedupResolutionState:
162
+ """Mutable resolution bookkeeping shared across deterministic and LLM passes."""
163
+
164
+ resolved_nodes: list[EntityNode | None]
165
+ uuid_map: dict[str, str]
166
+ unresolved_indices: list[int]
167
+ duplicate_pairs: list[tuple[EntityNode, EntityNode]] = field(default_factory=list)
168
+
169
+
170
+ def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
171
+ """Precompute exact and fuzzy lookup structures once per dedupe run."""
172
+ normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
173
+ nodes_by_uuid: dict[str, EntityNode] = {}
174
+ shingles_by_candidate: dict[str, set[str]] = {}
175
+ lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
176
+
177
+ for candidate in existing_nodes:
178
+ normalized = _normalize_string_exact(candidate.name)
179
+ normalized_existing[normalized].append(candidate)
180
+ nodes_by_uuid[candidate.uuid] = candidate
181
+
182
+ shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
183
+ shingles_by_candidate[candidate.uuid] = shingles
184
+
185
+ signature = _minhash_signature(shingles)
186
+ for band_index, band in enumerate(_lsh_bands(signature)):
187
+ lsh_buckets[(band_index, band)].append(candidate.uuid)
188
+
189
+ return DedupCandidateIndexes(
190
+ existing_nodes=existing_nodes,
191
+ nodes_by_uuid=nodes_by_uuid,
192
+ normalized_existing=normalized_existing,
193
+ shingles_by_candidate=shingles_by_candidate,
194
+ lsh_buckets=lsh_buckets,
195
+ )
196
+
197
+
198
+ def _resolve_with_similarity(
199
+ extracted_nodes: list[EntityNode],
200
+ indexes: DedupCandidateIndexes,
201
+ state: DedupResolutionState,
202
+ ) -> None:
203
+ """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
204
+ for idx, node in enumerate(extracted_nodes):
205
+ normalized_exact = _normalize_string_exact(node.name)
206
+ normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
207
+
208
+ if not _has_high_entropy(normalized_fuzzy):
209
+ state.unresolved_indices.append(idx)
210
+ continue
211
+
212
+ existing_matches = indexes.normalized_existing.get(normalized_exact, [])
213
+ if len(existing_matches) == 1:
214
+ match = existing_matches[0]
215
+ state.resolved_nodes[idx] = match
216
+ state.uuid_map[node.uuid] = match.uuid
217
+ if match.uuid != node.uuid:
218
+ state.duplicate_pairs.append((node, match))
219
+ continue
220
+ if len(existing_matches) > 1:
221
+ state.unresolved_indices.append(idx)
222
+ continue
223
+
224
+ shingles = _cached_shingles(normalized_fuzzy)
225
+ signature = _minhash_signature(shingles)
226
+ candidate_ids: set[str] = set()
227
+ for band_index, band in enumerate(_lsh_bands(signature)):
228
+ candidate_ids.update(indexes.lsh_buckets.get((band_index, band), []))
229
+
230
+ best_candidate: EntityNode | None = None
231
+ best_score = 0.0
232
+ for candidate_id in candidate_ids:
233
+ candidate_shingles = indexes.shingles_by_candidate.get(candidate_id, set())
234
+ score = _jaccard_similarity(shingles, candidate_shingles)
235
+ if score > best_score:
236
+ best_score = score
237
+ best_candidate = indexes.nodes_by_uuid.get(candidate_id)
238
+
239
+ if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
240
+ state.resolved_nodes[idx] = best_candidate
241
+ state.uuid_map[node.uuid] = best_candidate.uuid
242
+ if best_candidate.uuid != node.uuid:
243
+ state.duplicate_pairs.append((node, best_candidate))
244
+ continue
245
+
246
+ state.unresolved_indices.append(idx)
247
+
248
+
249
+ __all__ = [
250
+ 'DedupCandidateIndexes',
251
+ 'DedupResolutionState',
252
+ '_normalize_string_exact',
253
+ '_normalize_name_for_fuzzy',
254
+ '_has_high_entropy',
255
+ '_minhash_signature',
256
+ '_lsh_bands',
257
+ '_jaccard_similarity',
258
+ '_cached_shingles',
259
+ '_FUZZY_JACCARD_THRESHOLD',
260
+ '_build_candidate_indexes',
261
+ '_resolve_with_similarity',
262
+ ]
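A hedged sketch of how these helpers behave in isolation. The resolver only reads `.name` and `.uuid`, so a stand-in dataclass is used here instead of EntityNode purely for demonstration:

    from dataclasses import dataclass

    from graphiti_core.utils.maintenance.dedup_helpers import (
        DedupResolutionState,
        _build_candidate_indexes,
        _cached_shingles,
        _jaccard_similarity,
        _normalize_name_for_fuzzy,
        _resolve_with_similarity,
    )


    @dataclass
    class FakeNode:  # stand-in for EntityNode, illustration only
        uuid: str
        name: str


    # Fuzzy primitives: identical names after normalization share all of their shingles.
    a = _cached_shingles(_normalize_name_for_fuzzy('Alice  Smith'))
    b = _cached_shingles(_normalize_name_for_fuzzy('alice smith'))
    assert _jaccard_similarity(a, b) == 1.0

    # Deterministic pass: an exact normalized-name hit resolves without calling the LLM.
    existing = [FakeNode('uuid-1', 'Alice Smith'), FakeNode('uuid-2', 'Bob Jones')]
    indexes = _build_candidate_indexes(existing)  # type: ignore[arg-type]
    state = DedupResolutionState(resolved_nodes=[None], uuid_map={}, unresolved_indices=[])
    _resolve_with_similarity([FakeNode('uuid-3', 'alice smith')], indexes, state)  # type: ignore[list-item]

    assert state.uuid_map == {'uuid-3': 'uuid-1'}
    assert state.unresolved_indices == []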
graphiti_core/utils/maintenance/edge_operations.py CHANGED
@@ -41,6 +41,9 @@ from graphiti_core.search.search_config import SearchResults
41
41
  from graphiti_core.search.search_config_recipes import EDGE_HYBRID_SEARCH_RRF
42
42
  from graphiti_core.search.search_filters import SearchFilters
43
43
  from graphiti_core.utils.datetime_utils import ensure_utc, utc_now
44
+ from graphiti_core.utils.maintenance.dedup_helpers import _normalize_string_exact
45
+
46
+ DEFAULT_EDGE_NAME = 'RELATES_TO'
44
47
 
45
48
  logger = logging.getLogger(__name__)
46
49
 
@@ -280,8 +283,12 @@ async def resolve_extracted_edges(
280
283
  # Build entity hash table
281
284
  uuid_entity_map: dict[str, EntityNode] = {entity.uuid: entity for entity in entities}
282
285
 
283
- # Determine which edge types are relevant for each edge
286
+ # Determine which edge types are relevant for each edge.
287
+ # `edge_types_lst` stores the subset of custom edge definitions whose
288
+ # node signature matches each extracted edge. Anything outside this subset
289
+ # should only stay on the edge if it is a non-custom (LLM generated) label.
284
290
  edge_types_lst: list[dict[str, type[BaseModel]]] = []
291
+ custom_type_names = set(edge_types or {})
285
292
  for extracted_edge in extracted_edges:
286
293
  source_node = uuid_entity_map.get(extracted_edge.source_node_uuid)
287
294
  target_node = uuid_entity_map.get(extracted_edge.target_node_uuid)
@@ -309,6 +316,20 @@ async def resolve_extracted_edges(
309
316
 
310
317
  edge_types_lst.append(extracted_edge_types)
311
318
 
319
+ for extracted_edge, extracted_edge_types in zip(extracted_edges, edge_types_lst, strict=True):
320
+ allowed_type_names = set(extracted_edge_types)
321
+ is_custom_name = extracted_edge.name in custom_type_names
322
+ if not allowed_type_names:
323
+ # No custom types are valid for this node pairing. Keep LLM generated
324
+ # labels, but flip disallowed custom names back to the default.
325
+ if is_custom_name and extracted_edge.name != DEFAULT_EDGE_NAME:
326
+ extracted_edge.name = DEFAULT_EDGE_NAME
327
+ continue
328
+ if is_custom_name and extracted_edge.name not in allowed_type_names:
329
+ # Custom name exists but it is not permitted for this source/target
330
+ # signature, so fall back to the default edge label.
331
+ extracted_edge.name = DEFAULT_EDGE_NAME
332
+
312
333
  # resolve edges with related edges in the graph and find invalidation candidates
313
334
  results: list[tuple[EntityEdge, list[EntityEdge], list[EntityEdge]]] = list(
314
335
  await semaphore_gather(
@@ -320,6 +341,7 @@ async def resolve_extracted_edges(
320
341
  existing_edges,
321
342
  episode,
322
343
  extracted_edge_types,
344
+ custom_type_names,
323
345
  clients.ensure_ascii,
324
346
  )
325
347
  for extracted_edge, related_edges, existing_edges, extracted_edge_types in zip(
@@ -391,12 +413,54 @@ async def resolve_extracted_edge(
391
413
  related_edges: list[EntityEdge],
392
414
  existing_edges: list[EntityEdge],
393
415
  episode: EpisodicNode,
394
- edge_types: dict[str, type[BaseModel]] | None = None,
416
+ edge_type_candidates: dict[str, type[BaseModel]] | None = None,
417
+ custom_edge_type_names: set[str] | None = None,
395
418
  ensure_ascii: bool = True,
396
419
  ) -> tuple[EntityEdge, list[EntityEdge], list[EntityEdge]]:
420
+ """Resolve an extracted edge against existing graph context.
421
+
422
+ Parameters
423
+ ----------
424
+ llm_client : LLMClient
425
+ Client used to invoke the LLM for deduplication and attribute extraction.
426
+ extracted_edge : EntityEdge
427
+ Newly extracted edge whose canonical representation is being resolved.
428
+ related_edges : list[EntityEdge]
429
+ Candidate edges with identical endpoints used for duplicate detection.
430
+ existing_edges : list[EntityEdge]
431
+ Broader set of edges evaluated for contradiction / invalidation.
432
+ episode : EpisodicNode
433
+ Episode providing content context when extracting edge attributes.
434
+ edge_type_candidates : dict[str, type[BaseModel]] | None
435
+ Custom edge types permitted for the current source/target signature.
436
+ custom_edge_type_names : set[str] | None
437
+ Full catalog of registered custom edge names. Used to distinguish
438
+ between disallowed custom types (which fall back to the default label)
439
+ and ad-hoc labels emitted by the LLM.
440
+ ensure_ascii : bool
441
+ Whether prompt payloads should coerce ASCII output.
442
+
443
+ Returns
444
+ -------
445
+ tuple[EntityEdge, list[EntityEdge], list[EntityEdge]]
446
+ The resolved edge, any duplicates, and edges to invalidate.
447
+ """
397
448
  if len(related_edges) == 0 and len(existing_edges) == 0:
398
449
  return extracted_edge, [], []
399
450
 
451
+ # Fast path: if the fact text and endpoints already exist verbatim, reuse the matching edge.
452
+ normalized_fact = _normalize_string_exact(extracted_edge.fact)
453
+ for edge in related_edges:
454
+ if (
455
+ edge.source_node_uuid == extracted_edge.source_node_uuid
456
+ and edge.target_node_uuid == extracted_edge.target_node_uuid
457
+ and _normalize_string_exact(edge.fact) == normalized_fact
458
+ ):
459
+ resolved = edge
460
+ if episode is not None and episode.uuid not in resolved.episodes:
461
+ resolved.episodes.append(episode.uuid)
462
+ return resolved, [], []
463
+
400
464
  start = time()
401
465
 
402
466
  # Prepare context for LLM
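A small sketch of the normalization behind this fast path: casing and extra whitespace in the fact text do not defeat the verbatim-reuse check.

    from graphiti_core.utils.maintenance.dedup_helpers import _normalize_string_exact

    assert _normalize_string_exact('Alice  WORKS at Acme') == _normalize_string_exact('alice works at acme')
    # With matching source/target UUIDs, the existing edge is reused and the current
    # episode UUID is appended to its `episodes` list instead of invoking the LLM.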
@@ -415,9 +479,9 @@ async def resolve_extracted_edge(
415
479
  'fact_type_name': type_name,
416
480
  'fact_type_description': type_model.__doc__,
417
481
  }
418
- for i, (type_name, type_model) in enumerate(edge_types.items())
482
+ for i, (type_name, type_model) in enumerate(edge_type_candidates.items())
419
483
  ]
420
- if edge_types is not None
484
+ if edge_type_candidates is not None
421
485
  else []
422
486
  )
423
487
 
@@ -454,7 +518,16 @@ async def resolve_extracted_edge(
454
518
  ]
455
519
 
456
520
  fact_type: str = response_object.fact_type
457
- if fact_type.upper() != 'DEFAULT' and edge_types is not None:
521
+ candidate_type_names = set(edge_type_candidates or {})
522
+ custom_type_names = custom_edge_type_names or set()
523
+
524
+ is_default_type = fact_type.upper() == 'DEFAULT'
525
+ is_custom_type = fact_type in custom_type_names
526
+ is_allowed_custom_type = fact_type in candidate_type_names
527
+
528
+ if is_allowed_custom_type:
529
+ # The LLM selected a custom type that is allowed for the node pair.
530
+ # Adopt the custom type and, if needed, extract its structured attributes.
458
531
  resolved_edge.name = fact_type
459
532
 
460
533
  edge_attributes_context = {
@@ -464,7 +537,7 @@ async def resolve_extracted_edge(
464
537
  'ensure_ascii': ensure_ascii,
465
538
  }
466
539
 
467
- edge_model = edge_types.get(fact_type)
540
+ edge_model = edge_type_candidates.get(fact_type) if edge_type_candidates else None
468
541
  if edge_model is not None and len(edge_model.model_fields) != 0:
469
542
  edge_attributes_response = await llm_client.generate_response(
470
543
  prompt_library.extract_edges.extract_attributes(edge_attributes_context),
@@ -473,6 +546,16 @@ async def resolve_extracted_edge(
473
546
  )
474
547
 
475
548
  resolved_edge.attributes = edge_attributes_response
549
+ elif not is_default_type and is_custom_type:
550
+ # The LLM picked a custom type that is not allowed for this signature.
551
+ # Reset to the default label and drop any structured attributes.
552
+ resolved_edge.name = DEFAULT_EDGE_NAME
553
+ resolved_edge.attributes = {}
554
+ elif not is_default_type:
555
+ # Non-custom labels are allowed to pass through so long as the LLM does
556
+ # not return the sentinel DEFAULT value.
557
+ resolved_edge.name = fact_type
558
+ resolved_edge.attributes = {}
476
559
 
477
560
  end = time()
478
561
  logger.debug(
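The branches above implement a single labeling policy. Restated here as a stand-alone illustration (this helper is not part of the library; DEFAULT_EDGE_NAME is the 'RELATES_TO' constant introduced earlier in this diff):

    DEFAULT_EDGE_NAME = 'RELATES_TO'


    def choose_edge_name(
        fact_type: str, current_name: str, allowed: set[str], registered: set[str]
    ) -> str:
        if fact_type in allowed:
            return fact_type              # permitted custom type: adopt it (attributes may also be extracted)
        if fact_type.upper() == 'DEFAULT':
            return current_name           # sentinel: keep whatever name the edge already resolved to
        if fact_type in registered:
            return DEFAULT_EDGE_NAME      # registered custom type not allowed for this node pair: fall back
        return fact_type                  # ad-hoc LLM label: allowed to pass through


    assert choose_edge_name('WORKS_AT', 'WORKS_AT', {'WORKS_AT'}, {'WORKS_AT', 'FOUNDED'}) == 'WORKS_AT'
    assert choose_edge_name('FOUNDED', 'FOUNDED', {'WORKS_AT'}, {'WORKS_AT', 'FOUNDED'}) == 'RELATES_TO'
    assert choose_edge_name('MENTIONS', 'MENTIONS', {'WORKS_AT'}, {'WORKS_AT', 'FOUNDED'}) == 'MENTIONS'
    assert choose_edge_name('DEFAULT', 'WORKS_AT', {'WORKS_AT'}, {'WORKS_AT'}) == 'WORKS_AT'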
graphiti_core/utils/maintenance/node_operations.py CHANGED
@@ -24,7 +24,12 @@ from graphiti_core.graphiti_types import GraphitiClients
24
24
  from graphiti_core.helpers import MAX_REFLEXION_ITERATIONS, semaphore_gather
25
25
  from graphiti_core.llm_client import LLMClient
26
26
  from graphiti_core.llm_client.config import ModelSize
27
- from graphiti_core.nodes import EntityNode, EpisodeType, EpisodicNode, create_entity_node_embeddings
27
+ from graphiti_core.nodes import (
28
+ EntityNode,
29
+ EpisodeType,
30
+ EpisodicNode,
31
+ create_entity_node_embeddings,
32
+ )
28
33
  from graphiti_core.prompts import prompt_library
29
34
  from graphiti_core.prompts.dedupe_nodes import NodeDuplicate, NodeResolutions
30
35
  from graphiti_core.prompts.extract_nodes import (
@@ -38,7 +43,15 @@ from graphiti_core.search.search_config import SearchResults
38
43
  from graphiti_core.search.search_config_recipes import NODE_HYBRID_SEARCH_RRF
39
44
  from graphiti_core.search.search_filters import SearchFilters
40
45
  from graphiti_core.utils.datetime_utils import utc_now
41
- from graphiti_core.utils.maintenance.edge_operations import filter_existing_duplicate_of_edges
46
+ from graphiti_core.utils.maintenance.dedup_helpers import (
47
+ DedupCandidateIndexes,
48
+ DedupResolutionState,
49
+ _build_candidate_indexes,
50
+ _resolve_with_similarity,
51
+ )
52
+ from graphiti_core.utils.maintenance.edge_operations import (
53
+ filter_existing_duplicate_of_edges,
54
+ )
42
55
 
43
56
  logger = logging.getLogger(__name__)
44
57
 
@@ -119,11 +132,13 @@ async def extract_nodes(
119
132
  )
120
133
  elif episode.source == EpisodeType.text:
121
134
  llm_response = await llm_client.generate_response(
122
- prompt_library.extract_nodes.extract_text(context), response_model=ExtractedEntities
135
+ prompt_library.extract_nodes.extract_text(context),
136
+ response_model=ExtractedEntities,
123
137
  )
124
138
  elif episode.source == EpisodeType.json:
125
139
  llm_response = await llm_client.generate_response(
126
- prompt_library.extract_nodes.extract_json(context), response_model=ExtractedEntities
140
+ prompt_library.extract_nodes.extract_json(context),
141
+ response_model=ExtractedEntities,
127
142
  )
128
143
 
129
144
  response_object = ExtractedEntities(**llm_response)
@@ -181,17 +196,12 @@ async def extract_nodes(
181
196
  return extracted_nodes
182
197
 
183
198
 
184
- async def resolve_extracted_nodes(
199
+ async def _collect_candidate_nodes(
185
200
  clients: GraphitiClients,
186
201
  extracted_nodes: list[EntityNode],
187
- episode: EpisodicNode | None = None,
188
- previous_episodes: list[EpisodicNode] | None = None,
189
- entity_types: dict[str, type[BaseModel]] | None = None,
190
- existing_nodes_override: list[EntityNode] | None = None,
191
- ) -> tuple[list[EntityNode], dict[str, str], list[tuple[EntityNode, EntityNode]]]:
192
- llm_client = clients.llm_client
193
- driver = clients.driver
194
-
202
+ existing_nodes_override: list[EntityNode] | None,
203
+ ) -> list[EntityNode]:
204
+ """Search per extracted name and return unique candidates with overrides honored in order."""
195
205
  search_results: list[SearchResults] = await semaphore_gather(
196
206
  *[
197
207
  search(
@@ -205,33 +215,44 @@ async def resolve_extracted_nodes(
205
215
  ]
206
216
  )
207
217
 
208
- candidate_nodes: list[EntityNode] = (
209
- [node for result in search_results for node in result.nodes]
210
- if existing_nodes_override is None
211
- else existing_nodes_override
212
- )
218
+ candidate_nodes: list[EntityNode] = [node for result in search_results for node in result.nodes]
213
219
 
214
- existing_nodes_dict: dict[str, EntityNode] = {node.uuid: node for node in candidate_nodes}
220
+ if existing_nodes_override is not None:
221
+ candidate_nodes.extend(existing_nodes_override)
215
222
 
216
- existing_nodes: list[EntityNode] = list(existing_nodes_dict.values())
223
+ seen_candidate_uuids: set[str] = set()
224
+ ordered_candidates: list[EntityNode] = []
225
+ for candidate in candidate_nodes:
226
+ if candidate.uuid in seen_candidate_uuids:
227
+ continue
228
+ seen_candidate_uuids.add(candidate.uuid)
229
+ ordered_candidates.append(candidate)
230
+
231
+ return ordered_candidates
217
232
 
218
- existing_nodes_context = (
219
- [
220
- {
221
- **{
222
- 'idx': i,
223
- 'name': candidate.name,
224
- 'entity_types': candidate.labels,
225
- },
226
- **candidate.attributes,
227
- }
228
- for i, candidate in enumerate(existing_nodes)
229
- ],
230
- )
233
+
234
+ async def _resolve_with_llm(
235
+ llm_client: LLMClient,
236
+ extracted_nodes: list[EntityNode],
237
+ indexes: DedupCandidateIndexes,
238
+ state: DedupResolutionState,
239
+ ensure_ascii: bool,
240
+ episode: EpisodicNode | None,
241
+ previous_episodes: list[EpisodicNode] | None,
242
+ entity_types: dict[str, type[BaseModel]] | None,
243
+ ) -> None:
244
+ """Escalate unresolved nodes to the dedupe prompt so the LLM can select or reject duplicates.
245
+
246
+ The guardrails below defensively ignore malformed or duplicate LLM responses so the
247
+ ingestion workflow remains deterministic even when the model misbehaves.
248
+ """
249
+ if not state.unresolved_indices:
250
+ return
231
251
 
232
252
  entity_types_dict: dict[str, type[BaseModel]] = entity_types if entity_types is not None else {}
233
253
 
234
- # Prepare context for LLM
254
+ llm_extracted_nodes = [extracted_nodes[i] for i in state.unresolved_indices]
255
+
235
256
  extracted_nodes_context = [
236
257
  {
237
258
  'id': i,
@@ -242,17 +263,29 @@ async def resolve_extracted_nodes(
242
263
  ).__doc__
243
264
  or 'Default Entity Type',
244
265
  }
245
- for i, node in enumerate(extracted_nodes)
266
+ for i, node in enumerate(llm_extracted_nodes)
267
+ ]
268
+
269
+ existing_nodes_context = [
270
+ {
271
+ **{
272
+ 'idx': i,
273
+ 'name': candidate.name,
274
+ 'entity_types': candidate.labels,
275
+ },
276
+ **candidate.attributes,
277
+ }
278
+ for i, candidate in enumerate(indexes.existing_nodes)
246
279
  ]
247
280
 
248
281
  context = {
249
282
  'extracted_nodes': extracted_nodes_context,
250
283
  'existing_nodes': existing_nodes_context,
251
284
  'episode_content': episode.content if episode is not None else '',
252
- 'previous_episodes': [ep.content for ep in previous_episodes]
253
- if previous_episodes is not None
254
- else [],
255
- 'ensure_ascii': clients.ensure_ascii,
285
+ 'previous_episodes': (
286
+ [ep.content for ep in previous_episodes] if previous_episodes is not None else []
287
+ ),
288
+ 'ensure_ascii': ensure_ascii,
256
289
  }
257
290
 
258
291
  llm_response = await llm_client.generate_response(
@@ -262,33 +295,105 @@ async def resolve_extracted_nodes(
262
295
 
263
296
  node_resolutions: list[NodeDuplicate] = NodeResolutions(**llm_response).entity_resolutions
264
297
 
265
- resolved_nodes: list[EntityNode] = []
266
- uuid_map: dict[str, str] = {}
267
- node_duplicates: list[tuple[EntityNode, EntityNode]] = []
298
+ valid_relative_range = range(len(state.unresolved_indices))
299
+ processed_relative_ids: set[int] = set()
300
+
268
301
  for resolution in node_resolutions:
269
- resolution_id: int = resolution.id
302
+ relative_id: int = resolution.id
270
303
  duplicate_idx: int = resolution.duplicate_idx
271
304
 
272
- extracted_node = extracted_nodes[resolution_id]
305
+ if relative_id not in valid_relative_range:
306
+ logger.warning(
307
+ 'Skipping invalid LLM dedupe id %s (unresolved indices: %s)',
308
+ relative_id,
309
+ state.unresolved_indices,
310
+ )
311
+ continue
312
+
313
+ if relative_id in processed_relative_ids:
314
+ logger.warning('Duplicate LLM dedupe id %s received; ignoring.', relative_id)
315
+ continue
316
+ processed_relative_ids.add(relative_id)
273
317
 
274
- resolved_node = (
275
- existing_nodes[duplicate_idx]
276
- if 0 <= duplicate_idx < len(existing_nodes)
277
- else extracted_node
278
- )
318
+ original_index = state.unresolved_indices[relative_id]
319
+ extracted_node = extracted_nodes[original_index]
279
320
 
280
- # resolved_node.name = resolution.get('name')
321
+ resolved_node: EntityNode
322
+ if duplicate_idx == -1:
323
+ resolved_node = extracted_node
324
+ elif 0 <= duplicate_idx < len(indexes.existing_nodes):
325
+ resolved_node = indexes.existing_nodes[duplicate_idx]
326
+ else:
327
+ logger.warning(
328
+ 'Invalid duplicate_idx %s for extracted node %s; treating as no duplicate.',
329
+ duplicate_idx,
330
+ extracted_node.uuid,
331
+ )
332
+ resolved_node = extracted_node
281
333
 
282
- resolved_nodes.append(resolved_node)
283
- uuid_map[extracted_node.uuid] = resolved_node.uuid
334
+ state.resolved_nodes[original_index] = resolved_node
335
+ state.uuid_map[extracted_node.uuid] = resolved_node.uuid
336
+ if resolved_node.uuid != extracted_node.uuid:
337
+ state.duplicate_pairs.append((extracted_node, resolved_node))
284
338
 
285
- logger.debug(f'Resolved nodes: {[(n.name, n.uuid) for n in resolved_nodes]}')
339
+
340
+ async def resolve_extracted_nodes(
341
+ clients: GraphitiClients,
342
+ extracted_nodes: list[EntityNode],
343
+ episode: EpisodicNode | None = None,
344
+ previous_episodes: list[EpisodicNode] | None = None,
345
+ entity_types: dict[str, type[BaseModel]] | None = None,
346
+ existing_nodes_override: list[EntityNode] | None = None,
347
+ ) -> tuple[list[EntityNode], dict[str, str], list[tuple[EntityNode, EntityNode]]]:
348
+ """Search for existing nodes, resolve deterministic matches, then escalate holdouts to the LLM dedupe prompt."""
349
+ llm_client = clients.llm_client
350
+ driver = clients.driver
351
+ existing_nodes = await _collect_candidate_nodes(
352
+ clients,
353
+ extracted_nodes,
354
+ existing_nodes_override,
355
+ )
356
+
357
+ indexes: DedupCandidateIndexes = _build_candidate_indexes(existing_nodes)
358
+
359
+ state = DedupResolutionState(
360
+ resolved_nodes=[None] * len(extracted_nodes),
361
+ uuid_map={},
362
+ unresolved_indices=[],
363
+ )
364
+
365
+ _resolve_with_similarity(extracted_nodes, indexes, state)
366
+
367
+ await _resolve_with_llm(
368
+ llm_client,
369
+ extracted_nodes,
370
+ indexes,
371
+ state,
372
+ clients.ensure_ascii,
373
+ episode,
374
+ previous_episodes,
375
+ entity_types,
376
+ )
377
+
378
+ for idx, node in enumerate(extracted_nodes):
379
+ if state.resolved_nodes[idx] is None:
380
+ state.resolved_nodes[idx] = node
381
+ state.uuid_map[node.uuid] = node.uuid
382
+
383
+ logger.debug(
384
+ 'Resolved nodes: %s',
385
+ [(node.name, node.uuid) for node in state.resolved_nodes if node is not None],
386
+ )
286
387
 
287
388
  new_node_duplicates: list[
288
389
  tuple[EntityNode, EntityNode]
289
- ] = await filter_existing_duplicate_of_edges(driver, node_duplicates)
390
+ ] = await filter_existing_duplicate_of_edges(driver, state.duplicate_pairs)
290
391
 
291
- return resolved_nodes, uuid_map, new_node_duplicates
392
+ return (
393
+ [node for node in state.resolved_nodes if node is not None],
394
+ state.uuid_map,
395
+ new_node_duplicates,
396
+ )
292
397
 
293
398
 
294
399
  async def extract_attributes_from_nodes(
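A compact illustration of the id mapping these guardrails rely on: the LLM only sees the unresolved sublist, so its ids are relative and must be mapped back to positions in extracted_nodes (values hypothetical):

    unresolved_indices = [2, 5]     # positions the deterministic pass could not resolve
    llm_resolution_ids = [0, 1, 7]  # ids returned by the LLM; 7 is out of range

    mapping = {}
    for relative_id in llm_resolution_ids:
        if relative_id not in range(len(unresolved_indices)):
            continue                # invalid ids are logged and ignored in the real code
        mapping[relative_id] = unresolved_indices[relative_id]

    assert mapping == {0: 2, 1: 5}  # relative id 0 -> extracted_nodes[2], 1 -> extracted_nodes[5]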
@@ -307,9 +412,11 @@ async def extract_attributes_from_nodes(
307
412
  node,
308
413
  episode,
309
414
  previous_episodes,
310
- entity_types.get(next((item for item in node.labels if item != 'Entity'), ''))
311
- if entity_types is not None
312
- else None,
415
+ (
416
+ entity_types.get(next((item for item in node.labels if item != 'Entity'), ''))
417
+ if entity_types is not None
418
+ else None
419
+ ),
313
420
  clients.ensure_ascii,
314
421
  )
315
422
  for node in nodes
@@ -339,18 +446,18 @@ async def extract_attributes_from_node(
339
446
  attributes_context: dict[str, Any] = {
340
447
  'node': node_context,
341
448
  'episode_content': episode.content if episode is not None else '',
342
- 'previous_episodes': [ep.content for ep in previous_episodes]
343
- if previous_episodes is not None
344
- else [],
449
+ 'previous_episodes': (
450
+ [ep.content for ep in previous_episodes] if previous_episodes is not None else []
451
+ ),
345
452
  'ensure_ascii': ensure_ascii,
346
453
  }
347
454
 
348
455
  summary_context: dict[str, Any] = {
349
456
  'node': node_context,
350
457
  'episode_content': episode.content if episode is not None else '',
351
- 'previous_episodes': [ep.content for ep in previous_episodes]
352
- if previous_episodes is not None
353
- else [],
458
+ 'previous_episodes': (
459
+ [ep.content for ep in previous_episodes] if previous_episodes is not None else []
460
+ ),
354
461
  'ensure_ascii': ensure_ascii,
355
462
  }
356
463
 
graphiti_core-0.21.0rc7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: graphiti-core
- Version: 0.21.0rc6
+ Version: 0.21.0rc7
  Summary: A temporal graph building library
  Project-URL: Homepage, https://help.getzep.com/graphiti/graphiti/overview
  Project-URL: Repository, https://github.com/getzep/graphiti
@@ -20,6 +20,7 @@ Provides-Extra: anthropic
  Requires-Dist: anthropic>=0.49.0; extra == 'anthropic'
  Provides-Extra: dev
  Requires-Dist: anthropic>=0.49.0; extra == 'dev'
+ Requires-Dist: boto3>=1.39.16; extra == 'dev'
  Requires-Dist: diskcache-stubs>=5.6.3.6.20240818; extra == 'dev'
  Requires-Dist: falkordb<2.0.0,>=1.1.2; extra == 'dev'
  Requires-Dist: google-genai>=1.8.0; extra == 'dev'
@@ -28,9 +29,11 @@ Requires-Dist: ipykernel>=6.29.5; extra == 'dev'
  Requires-Dist: jupyterlab>=4.2.4; extra == 'dev'
  Requires-Dist: kuzu>=0.11.2; extra == 'dev'
  Requires-Dist: langchain-anthropic>=0.2.4; extra == 'dev'
+ Requires-Dist: langchain-aws>=0.2.29; extra == 'dev'
  Requires-Dist: langchain-openai>=0.2.6; extra == 'dev'
  Requires-Dist: langgraph>=0.2.15; extra == 'dev'
  Requires-Dist: langsmith>=0.1.108; extra == 'dev'
+ Requires-Dist: opensearch-py>=3.0.0; extra == 'dev'
  Requires-Dist: pyright>=1.1.404; extra == 'dev'
  Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
  Requires-Dist: pytest-xdist>=3.6.1; extra == 'dev'
graphiti_core-0.21.0rc7.dist-info/RECORD CHANGED
@@ -2,7 +2,7 @@ graphiti_core/__init__.py,sha256=e5SWFkRiaUwfprYIeIgVIh7JDedNiloZvd3roU-0aDY,55
2
2
  graphiti_core/edges.py,sha256=PhJm_s28cHLEaIqcw66wP16hOq4P4bVQbC_sESHQkXU,20919
3
3
  graphiti_core/errors.py,sha256=cH_v9TPgEPeQE6GFOHIg5TvejpUCBddGarMY2Whxbwc,2707
4
4
  graphiti_core/graph_queries.py,sha256=ZWMqAo5pwb8PO5ddg4zZ0ArhHWuWV42g3R9ULIxsHOs,8058
5
- graphiti_core/graphiti.py,sha256=fC7CCGeZDNAGK2Bj2tW30QzHMAjqWvpYZiY1A0yUvHM,41917
5
+ graphiti_core/graphiti.py,sha256=5Y3SdcC_Ebhp-oqbbIxb0KGshWU24EQx4YYKvK8Id8g,41935
6
6
  graphiti_core/graphiti_types.py,sha256=C_p2XwScQlCzo7ets097TrSLs9ATxPZQ4WCsxDS7QHc,1066
7
7
  graphiti_core/helpers.py,sha256=q8kbL9gz8igdlh-oMUS-ylUyeMlXZb-ccf-HQkrES_0,5184
8
8
  graphiti_core/nodes.py,sha256=wYLQcVEXvQMxTpTc9LWSoPTzzaoUOm0rl07c9wS1XSY,30323
@@ -27,14 +27,14 @@ graphiti_core/embedder/voyage.py,sha256=oJHAZiNqjdEJOKgoKfGWcxK2-Ewqn5UB3vrBwIwP
27
27
  graphiti_core/llm_client/__init__.py,sha256=QgBWUiCeBp6YiA_xqyrDvJ9jIyy1hngH8g7FWahN3nw,776
28
28
  graphiti_core/llm_client/anthropic_client.py,sha256=xTFcrgMDK77BwnChBhYj51Jaa2mRNI850oJv2pKZI0A,12892
29
29
  graphiti_core/llm_client/azure_openai_client.py,sha256=ekERggAekbb7enes1RJqdRChf_mjaZTFXsnMbxO7azQ,2497
30
- graphiti_core/llm_client/client.py,sha256=cUwwCZEhP9jJAI04AhHxsFPecggajSgCRCM3frrYJqA,6473
30
+ graphiti_core/llm_client/client.py,sha256=KUWq7Gq9J4PdP06lLCBEb8OSZOE6luPqaQ3xgtpZwWg,6835
31
31
  graphiti_core/llm_client/config.py,sha256=pivp29CDIbDPqgw5NF9Ok2AwcqTV5z5_Q1bgNs1CDGs,2560
32
32
  graphiti_core/llm_client/errors.py,sha256=pn6brRiLW60DAUIXJYKBT6MInrS4ueuH1hNLbn_JbQo,1243
33
- graphiti_core/llm_client/gemini_client.py,sha256=m0-6SFUs8qqoR5rGTrASAcMtTbJKfZqO4-MaDr4CYCQ,17719
33
+ graphiti_core/llm_client/gemini_client.py,sha256=AxD7sqsPQdgfcZCBIGN302s1hFYlBN9FOQcDEV0tw08,17725
34
34
  graphiti_core/llm_client/groq_client.py,sha256=bYLE_cg1QEhugsJOXh4b1vPbxagKeMWqk48240GCzMs,2922
35
- graphiti_core/llm_client/openai_base_client.py,sha256=c_K9hMaSfBQuiG4Kq_4Zy04eh4_SrNtNQ0aMc2tmAoY,8482
35
+ graphiti_core/llm_client/openai_base_client.py,sha256=LeEBZ33Y_bIz-YSr6aCbYKMI9r0SNPeZkALXQ0iFsSE,8488
36
36
  graphiti_core/llm_client/openai_client.py,sha256=AuaCFQFMJEGzBkFVouccq3XentmWRIKW0RLRBCUMm7Y,3763
37
- graphiti_core/llm_client/openai_generic_client.py,sha256=WElMnPqdb1CxzYH4p2-m_9rVMr5M93-eXnc3yVxBgFg,7001
37
+ graphiti_core/llm_client/openai_generic_client.py,sha256=lyOQwzIMVb9pk3WWrU5zsG38J26QGKebxC40-lRYMJg,7007
38
38
  graphiti_core/llm_client/utils.py,sha256=zKpxXEbKa369m4W7RDEf-m56kH46V1Mx3RowcWZEWWs,1000
39
39
  graphiti_core/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
40
  graphiti_core/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -44,11 +44,11 @@ graphiti_core/models/nodes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
44
44
  graphiti_core/models/nodes/node_db_queries.py,sha256=TCHZKG5bQNarV9C5k4hOFFqc-LwTVQ8Pnd6okVVNKbo,12826
45
45
  graphiti_core/prompts/__init__.py,sha256=EA-x9xUki9l8wnu2l8ek_oNf75-do5tq5hVq7Zbv8Kw,101
46
46
  graphiti_core/prompts/dedupe_edges.py,sha256=WRXQi7JQZdIfKDICWyU7Wbs5WyD_KBblLBSeKdbLyuk,5914
47
- graphiti_core/prompts/dedupe_nodes.py,sha256=eYDk0axHEKLjZS2tKlT4Zy1fW9EJkn6EnrJLSN0fvAY,8235
47
+ graphiti_core/prompts/dedupe_nodes.py,sha256=H4sIzpi1gBwPedTMhdY175jnLj5JtnEeb_WNITitPLU,9171
48
48
  graphiti_core/prompts/eval.py,sha256=ijwxbE87G678imdhfPvRujepQMq_JZ3XHX4vOAcVnVI,5507
49
49
  graphiti_core/prompts/extract_edge_dates.py,sha256=3Drs3CmvP0gJN5BidWSxrNvLet3HPoTybU3BUIAoc0Y,4218
50
50
  graphiti_core/prompts/extract_edges.py,sha256=mnncxb6lyr3ufKajRAh09czmJawiEM54sSPNy9ukiio,6888
51
- graphiti_core/prompts/extract_nodes.py,sha256=YbdpFzVyMo7J0rPSbw4l5qqzoNQKsSfPKrDo75t2GWQ,11541
51
+ graphiti_core/prompts/extract_nodes.py,sha256=GYX97qlSSrR_3QLc48EGCti8tdC1_OKpEdAR0Y2wfVY,11629
52
52
  graphiti_core/prompts/invalidate_edges.py,sha256=yfpcs_pyctnoM77ULPZXEtKW0oHr1MeLsJzC5yrE-o4,3547
53
53
  graphiti_core/prompts/lib.py,sha256=DCyHePM4_q-CptTpEXGO_dBv9k7xDtclEaB1dGu7EcI,4092
54
54
  graphiti_core/prompts/models.py,sha256=NgxdbPHJpBEcpbXovKyScgpBc73Q-GIW-CBDlBtDjto,894
@@ -64,17 +64,18 @@ graphiti_core/search/search_utils.py,sha256=ak1aBeKNuxS7szydNHwva2ABWSRlQ0S_v8ZO
64
64
  graphiti_core/telemetry/__init__.py,sha256=5kALLDlU9bb2v19CdN7qVANsJWyfnL9E60J6FFgzm3o,226
65
65
  graphiti_core/telemetry/telemetry.py,sha256=47LrzOVBCcZxsYPsnSxWFiztHoxYKKxPwyRX0hnbDGc,3230
66
66
  graphiti_core/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
- graphiti_core/utils/bulk_utils.py,sha256=9XWXqjxiu2ydKMLKQRTbvzO6cO1o1HRjjpmaf5Ym51k,17633
67
+ graphiti_core/utils/bulk_utils.py,sha256=0rpBaPg1CBQu7djcSS9XWfv9T1unRRVW8_ge_Tf7lF0,20288
68
68
  graphiti_core/utils/datetime_utils.py,sha256=J-zYSq7-H-2n9hYOXNIun12kM10vNX9mMATGR_egTmY,1806
69
69
  graphiti_core/utils/maintenance/__init__.py,sha256=vW4H1KyapTl-OOz578uZABYcpND4wPx3Vt6aAPaXh78,301
70
70
  graphiti_core/utils/maintenance/community_operations.py,sha256=XMiokEemn96GlvjkOvbo9hIX04Fea3eVj408NHG5P4o,11042
71
- graphiti_core/utils/maintenance/edge_operations.py,sha256=rfFsqigWXNcUGKu1l1-RdSoFdEeioK78oo4VWOagqgs,20576
71
+ graphiti_core/utils/maintenance/dedup_helpers.py,sha256=B7k6KkB6Sii8PZCWNNTvsNiy4BNTNWpoLeGgrPLq6BE,9220
72
+ graphiti_core/utils/maintenance/edge_operations.py,sha256=9bRCI_3loKJX3EAMLpNULWLnhSDCHsCghiqbXPdicPM,24808
72
73
  graphiti_core/utils/maintenance/graph_data_operations.py,sha256=42icj3S_ELAJ-NK3jVS_rg_243dmnaZOyUitJj_uJ-M,6085
73
- graphiti_core/utils/maintenance/node_operations.py,sha256=xSeRK8cdit9GT9ZKNGpawg01Wcu1W3FsBH9moFH_mao,13423
74
+ graphiti_core/utils/maintenance/node_operations.py,sha256=TKpXPtnTVxxan8I1xQyVkGn3zyRdb_Q00cgUpLcloig,16860
74
75
  graphiti_core/utils/maintenance/temporal_operations.py,sha256=IIaVtShpVkOYe6haxz3a1x3v54-MzaEXG8VsxFUNeoY,3582
75
76
  graphiti_core/utils/maintenance/utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
77
  graphiti_core/utils/ontology_utils/entity_types_utils.py,sha256=4eVgxLWY6Q8k9cRJ5pW59IYF--U4nXZsZIGOVb_yHfQ,1285
77
- graphiti_core-0.21.0rc6.dist-info/METADATA,sha256=55pEDr5ujjIjMUKQTqbJ_YRvCSAuUi41ejVjYXsQiSA,26933
78
- graphiti_core-0.21.0rc6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
79
- graphiti_core-0.21.0rc6.dist-info/licenses/LICENSE,sha256=KCUwCyDXuVEgmDWkozHyniRyWjnWUWjkuDHfU6o3JlA,11325
80
- graphiti_core-0.21.0rc6.dist-info/RECORD,,
78
+ graphiti_core-0.21.0rc7.dist-info/METADATA,sha256=pAEEXoHTF8p8L1ds3kF_6KEjdhyB2iX9DPTuucxMe0o,27084
79
+ graphiti_core-0.21.0rc7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
80
+ graphiti_core-0.21.0rc7.dist-info/licenses/LICENSE,sha256=KCUwCyDXuVEgmDWkozHyniRyWjnWUWjkuDHfU6o3JlA,11325
81
+ graphiti_core-0.21.0rc7.dist-info/RECORD,,