graphiti-core 0.12.0rc1__py3-none-any.whl → 0.24.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. graphiti_core/cross_encoder/bge_reranker_client.py +12 -2
  2. graphiti_core/cross_encoder/gemini_reranker_client.py +161 -0
  3. graphiti_core/cross_encoder/openai_reranker_client.py +7 -5
  4. graphiti_core/decorators.py +110 -0
  5. graphiti_core/driver/__init__.py +19 -0
  6. graphiti_core/driver/driver.py +124 -0
  7. graphiti_core/driver/falkordb_driver.py +362 -0
  8. graphiti_core/driver/graph_operations/graph_operations.py +191 -0
  9. graphiti_core/driver/kuzu_driver.py +182 -0
  10. graphiti_core/driver/neo4j_driver.py +117 -0
  11. graphiti_core/driver/neptune_driver.py +305 -0
  12. graphiti_core/driver/search_interface/search_interface.py +89 -0
  13. graphiti_core/edges.py +287 -172
  14. graphiti_core/embedder/azure_openai.py +71 -0
  15. graphiti_core/embedder/client.py +2 -1
  16. graphiti_core/embedder/gemini.py +116 -22
  17. graphiti_core/embedder/voyage.py +13 -2
  18. graphiti_core/errors.py +8 -0
  19. graphiti_core/graph_queries.py +162 -0
  20. graphiti_core/graphiti.py +705 -193
  21. graphiti_core/graphiti_types.py +4 -2
  22. graphiti_core/helpers.py +87 -10
  23. graphiti_core/llm_client/__init__.py +16 -0
  24. graphiti_core/llm_client/anthropic_client.py +159 -56
  25. graphiti_core/llm_client/azure_openai_client.py +115 -0
  26. graphiti_core/llm_client/client.py +98 -21
  27. graphiti_core/llm_client/config.py +1 -1
  28. graphiti_core/llm_client/gemini_client.py +290 -41
  29. graphiti_core/llm_client/groq_client.py +14 -3
  30. graphiti_core/llm_client/openai_base_client.py +261 -0
  31. graphiti_core/llm_client/openai_client.py +56 -132
  32. graphiti_core/llm_client/openai_generic_client.py +91 -56
  33. graphiti_core/models/edges/edge_db_queries.py +259 -35
  34. graphiti_core/models/nodes/node_db_queries.py +311 -32
  35. graphiti_core/nodes.py +420 -205
  36. graphiti_core/prompts/dedupe_edges.py +46 -32
  37. graphiti_core/prompts/dedupe_nodes.py +67 -42
  38. graphiti_core/prompts/eval.py +4 -4
  39. graphiti_core/prompts/extract_edges.py +27 -16
  40. graphiti_core/prompts/extract_nodes.py +74 -31
  41. graphiti_core/prompts/prompt_helpers.py +39 -0
  42. graphiti_core/prompts/snippets.py +29 -0
  43. graphiti_core/prompts/summarize_nodes.py +23 -25
  44. graphiti_core/search/search.py +158 -82
  45. graphiti_core/search/search_config.py +39 -4
  46. graphiti_core/search/search_filters.py +126 -35
  47. graphiti_core/search/search_helpers.py +5 -6
  48. graphiti_core/search/search_utils.py +1405 -485
  49. graphiti_core/telemetry/__init__.py +9 -0
  50. graphiti_core/telemetry/telemetry.py +117 -0
  51. graphiti_core/tracer.py +193 -0
  52. graphiti_core/utils/bulk_utils.py +364 -285
  53. graphiti_core/utils/datetime_utils.py +13 -0
  54. graphiti_core/utils/maintenance/community_operations.py +67 -49
  55. graphiti_core/utils/maintenance/dedup_helpers.py +262 -0
  56. graphiti_core/utils/maintenance/edge_operations.py +339 -197
  57. graphiti_core/utils/maintenance/graph_data_operations.py +50 -114
  58. graphiti_core/utils/maintenance/node_operations.py +319 -238
  59. graphiti_core/utils/maintenance/temporal_operations.py +11 -3
  60. graphiti_core/utils/ontology_utils/entity_types_utils.py +1 -1
  61. graphiti_core/utils/text_utils.py +53 -0
  62. graphiti_core-0.24.3.dist-info/METADATA +726 -0
  63. graphiti_core-0.24.3.dist-info/RECORD +86 -0
  64. {graphiti_core-0.12.0rc1.dist-info → graphiti_core-0.24.3.dist-info}/WHEEL +1 -1
  65. graphiti_core-0.12.0rc1.dist-info/METADATA +0 -350
  66. graphiti_core-0.12.0rc1.dist-info/RECORD +0 -66
  67. /graphiti_core/{utils/maintenance/utils.py → migrations/__init__.py} +0 -0
  68. {graphiti_core-0.12.0rc1.dist-info → graphiti_core-0.24.3.dist-info/licenses}/LICENSE +0 -0
graphiti_core/utils/datetime_utils.py
@@ -40,3 +40,16 @@ def ensure_utc(dt: datetime | None) -> datetime | None:
         return dt.astimezone(timezone.utc)
 
     return dt
+
+
+def convert_datetimes_to_strings(obj):
+    if isinstance(obj, dict):
+        return {k: convert_datetimes_to_strings(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_datetimes_to_strings(item) for item in obj]
+    elif isinstance(obj, tuple):
+        return tuple(convert_datetimes_to_strings(item) for item in obj)
+    elif isinstance(obj, datetime):
+        return obj.isoformat()
+    else:
+        return obj
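For reference, a quick sketch of how the new helper behaves, assuming it is imported from graphiti_core/utils/datetime_utils.py (the file this hunk modifies): it walks dicts, lists, and tuples recursively and serializes only datetime leaves via isoformat().

    from datetime import datetime, timezone

    from graphiti_core.utils.datetime_utils import convert_datetimes_to_strings

    # Nested containers are walked recursively; only datetime leaves change.
    payload = {
        'created_at': datetime(2024, 5, 1, tzinfo=timezone.utc),
        'edges': [{'valid_at': datetime(2024, 5, 2, tzinfo=timezone.utc)}],
    }
    print(convert_datetimes_to_strings(payload))
    # {'created_at': '2024-05-01T00:00:00+00:00', 'edges': [{'valid_at': '2024-05-02T00:00:00+00:00'}]}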
graphiti_core/utils/maintenance/community_operations.py
@@ -2,13 +2,14 @@ import asyncio
 import logging
 from collections import defaultdict
 
-from neo4j import AsyncDriver
 from pydantic import BaseModel
 
+from graphiti_core.driver.driver import GraphDriver, GraphProvider
 from graphiti_core.edges import CommunityEdge
 from graphiti_core.embedder import EmbedderClient
-from graphiti_core.helpers import DEFAULT_DATABASE, semaphore_gather
+from graphiti_core.helpers import semaphore_gather
 from graphiti_core.llm_client import LLMClient
+from graphiti_core.models.nodes.node_db_queries import COMMUNITY_NODE_RETURN
 from graphiti_core.nodes import CommunityNode, EntityNode, get_community_node_from_record
 from graphiti_core.prompts import prompt_library
 from graphiti_core.prompts.summarize_nodes import Summary, SummaryDescription
@@ -26,37 +27,43 @@ class Neighbor(BaseModel):
 
 
 async def get_community_clusters(
-    driver: AsyncDriver, group_ids: list[str] | None
+    driver: GraphDriver, group_ids: list[str] | None
 ) -> list[list[EntityNode]]:
     community_clusters: list[list[EntityNode]] = []
 
     if group_ids is None:
         group_id_values, _, _ = await driver.execute_query(
             """
-        MATCH (n:Entity WHERE n.group_id IS NOT NULL)
-        RETURN
-        collect(DISTINCT n.group_id) AS group_ids
-        """,
-            database_=DEFAULT_DATABASE,
+            MATCH (n:Entity)
+            WHERE n.group_id IS NOT NULL
+            RETURN
+                collect(DISTINCT n.group_id) AS group_ids
+            """
         )
 
-        group_ids = group_id_values[0]['group_ids']
+        group_ids = group_id_values[0]['group_ids'] if group_id_values else []
 
     for group_id in group_ids:
         projection: dict[str, list[Neighbor]] = {}
         nodes = await EntityNode.get_by_group_ids(driver, [group_id])
         for node in nodes:
-            records, _, _ = await driver.execute_query(
+            match_query = """
+                MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[e:RELATES_TO]-(m: Entity {group_id: $group_id})
+            """
+            if driver.provider == GraphProvider.KUZU:
+                match_query = """
+                    MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-(m: Entity {group_id: $group_id})
                 """
-        MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[r:RELATES_TO]-(m: Entity {group_id: $group_id})
-        WITH count(r) AS count, m.uuid AS uuid
-        RETURN
-            uuid,
-            count
-        """,
+            records, _, _ = await driver.execute_query(
+                match_query
+                + """
+                WITH count(e) AS count, m.uuid AS uuid
+                RETURN
+                    uuid,
+                    count
+                """,
                 uuid=node.uuid,
                 group_id=group_id,
-                database_=DEFAULT_DATABASE,
             )
 
             projection[node.uuid] = [
@@ -95,7 +102,6 @@ def label_propagation(projection: dict[str, list[Neighbor]]) -> list[list[str]]:
             community_candidates: dict[int, int] = defaultdict(int)
             for neighbor in neighbors:
                 community_candidates[community_map[neighbor.node_uuid]] += neighbor.edge_count
-
             community_lst = [
                 (count, community) for community, count in community_candidates.items()
             ]
@@ -127,10 +133,14 @@ def label_propagation(projection: dict[str, list[Neighbor]]) -> list[list[str]]:
 
 async def summarize_pair(llm_client: LLMClient, summary_pair: tuple[str, str]) -> str:
     # Prepare context for LLM
-    context = {'node_summaries': [{'summary': summary} for summary in summary_pair]}
+    context = {
+        'node_summaries': [{'summary': summary} for summary in summary_pair],
+    }
 
     llm_response = await llm_client.generate_response(
-        prompt_library.summarize_nodes.summarize_pair(context), response_model=Summary
+        prompt_library.summarize_nodes.summarize_pair(context),
+        response_model=Summary,
+        prompt_name='summarize_nodes.summarize_pair',
     )
 
     pair_summary = llm_response.get('summary', '')
@@ -139,11 +149,14 @@ async def summarize_pair(llm_client: LLMClient, summary_pair: tuple[str, str]) -> str:
 
 
 async def generate_summary_description(llm_client: LLMClient, summary: str) -> str:
-    context = {'summary': summary}
+    context = {
+        'summary': summary,
+    }
 
     llm_response = await llm_client.generate_response(
         prompt_library.summarize_nodes.summary_description(context),
         response_model=SummaryDescription,
+        prompt_name='summarize_nodes.summary_description',
     )
 
     description = llm_response.get('description', '')
@@ -194,7 +207,9 @@ async def build_community(
 
 
 async def build_communities(
-    driver: AsyncDriver, llm_client: LLMClient, group_ids: list[str] | None
+    driver: GraphDriver,
+    llm_client: LLMClient,
+    group_ids: list[str] | None,
 ) -> tuple[list[CommunityNode], list[CommunityEdge]]:
     community_clusters = await get_community_clusters(driver, group_ids)
 
@@ -219,50 +234,46 @@ async def build_communities(
     return community_nodes, community_edges
 
 
-async def remove_communities(driver: AsyncDriver):
+async def remove_communities(driver: GraphDriver):
     await driver.execute_query(
         """
-    MATCH (c:Community)
-    DETACH DELETE c
-    """,
-        database_=DEFAULT_DATABASE,
+        MATCH (c:Community)
+        DETACH DELETE c
+        """
     )
 
 
 async def determine_entity_community(
-    driver: AsyncDriver, entity: EntityNode
+    driver: GraphDriver, entity: EntityNode
 ) -> tuple[CommunityNode | None, bool]:
     # Check if the node is already part of a community
     records, _, _ = await driver.execute_query(
         """
-    MATCH (c:Community)-[:HAS_MEMBER]->(n:Entity {uuid: $entity_uuid})
-    RETURN
-        c.uuid As uuid,
-        c.name AS name,
-        c.group_id AS group_id,
-        c.created_at AS created_at,
-        c.summary AS summary
-    """,
+        MATCH (c:Community)-[:HAS_MEMBER]->(n:Entity {uuid: $entity_uuid})
+        RETURN
+        """
+        + COMMUNITY_NODE_RETURN,
        entity_uuid=entity.uuid,
-        database_=DEFAULT_DATABASE,
    )
 
    if len(records) > 0:
        return get_community_node_from_record(records[0]), False
 
    # If the node has no community, add it to the mode community of surrounding entities
+    match_query = """
+        MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
+    """
+    if driver.provider == GraphProvider.KUZU:
+        match_query = """
+            MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
+        """
    records, _, _ = await driver.execute_query(
+        match_query
+        + """
+        RETURN
        """
-    MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
-    RETURN
-        c.uuid As uuid,
-        c.name AS name,
-        c.group_id AS group_id,
-        c.created_at AS created_at,
-        c.summary AS summary
-    """,
+        + COMMUNITY_NODE_RETURN,
        entity_uuid=entity.uuid,
-        database_=DEFAULT_DATABASE,
    )
 
    communities: list[CommunityNode] = [
@@ -291,12 +302,15 @@ async def determine_entity_community(
 
 
 async def update_community(
-    driver: AsyncDriver, llm_client: LLMClient, embedder: EmbedderClient, entity: EntityNode
-):
+    driver: GraphDriver,
+    llm_client: LLMClient,
+    embedder: EmbedderClient,
+    entity: EntityNode,
+) -> tuple[list[CommunityNode], list[CommunityEdge]]:
     community, is_new = await determine_entity_community(driver, entity)
 
     if community is None:
-        return
+        return [], []
 
     new_summary = await summarize_pair(llm_client, (entity.summary, community.summary))
     new_name = await generate_summary_description(llm_client, new_summary)
@@ -304,10 +318,14 @@ async def update_community(
     community.summary = new_summary
     community.name = new_name
 
+    community_edges = []
     if is_new:
         community_edge = (build_community_edges([entity], community, utc_now()))[0]
         await community_edge.save(driver)
+        community_edges.append(community_edge)
 
     await community.generate_name_embedding(embedder)
 
     await community.save(driver)
+
+    return [community], community_edges
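The projection assembled in get_community_clusters feeds label_propagation, which clusters entities by their weighted RELATES_TO neighborhoods. A minimal sketch of that input/output contract (toy uuids and weights; the Neighbor field names node_uuid and edge_count come from the hunks above, and the exact clustering shown is illustrative, since it depends on the propagation thresholds):

    from graphiti_core.utils.maintenance.community_operations import (
        Neighbor,
        label_propagation,
    )

    # Two strongly connected entities plus a weaker pair. edge_count is the
    # number of RELATES_TO edges between the pair, as counted by the Cypher above.
    projection = {
        'a': [Neighbor(node_uuid='b', edge_count=3)],
        'b': [Neighbor(node_uuid='a', edge_count=3)],
        'c': [Neighbor(node_uuid='d', edge_count=2)],
        'd': [Neighbor(node_uuid='c', edge_count=2)],
    }
    clusters = label_propagation(projection)
    # list[list[str]] of entity uuid clusters, e.g. [['a', 'b'], ['c', 'd']]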
graphiti_core/utils/maintenance/dedup_helpers.py (new file)
@@ -0,0 +1,262 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import annotations
+
+import math
+import re
+from collections import defaultdict
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from functools import lru_cache
+from hashlib import blake2b
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from graphiti_core.nodes import EntityNode
+
+_NAME_ENTROPY_THRESHOLD = 1.5
+_MIN_NAME_LENGTH = 6
+_MIN_TOKEN_COUNT = 2
+_FUZZY_JACCARD_THRESHOLD = 0.9
+_MINHASH_PERMUTATIONS = 32
+_MINHASH_BAND_SIZE = 4
+
+
+def _normalize_string_exact(name: str) -> str:
+    """Lowercase text and collapse whitespace so equal names map to the same key."""
+    normalized = re.sub(r'[\s]+', ' ', name.lower())
+    return normalized.strip()
+
+
+def _normalize_name_for_fuzzy(name: str) -> str:
+    """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
+    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
+    normalized = normalized.strip()
+    return re.sub(r'[\s]+', ' ', normalized)
+
+
+def _name_entropy(normalized_name: str) -> float:
+    """Approximate text specificity using Shannon entropy over characters.
+
+    We strip spaces, count how often each character appears, and sum
+    probability * -log2(probability). Short or repetitive names yield low
+    entropy, which signals we should defer resolution to the LLM instead of
+    trusting fuzzy similarity.
+    """
+    if not normalized_name:
+        return 0.0
+
+    counts: dict[str, int] = {}
+    for char in normalized_name.replace(' ', ''):
+        counts[char] = counts.get(char, 0) + 1
+
+    total = sum(counts.values())
+    if total == 0:
+        return 0.0
+
+    entropy = 0.0
+    for count in counts.values():
+        probability = count / total
+        entropy -= probability * math.log2(probability)
+
+    return entropy
+
+
+def _has_high_entropy(normalized_name: str) -> bool:
+    """Filter out very short or low-entropy names that are unreliable for fuzzy matching."""
+    token_count = len(normalized_name.split())
+    if len(normalized_name) < _MIN_NAME_LENGTH and token_count < _MIN_TOKEN_COUNT:
+        return False
+
+    return _name_entropy(normalized_name) >= _NAME_ENTROPY_THRESHOLD
+
+
+def _shingles(normalized_name: str) -> set[str]:
+    """Create 3-gram shingles from the normalized name for MinHash calculations."""
+    cleaned = normalized_name.replace(' ', '')
+    if len(cleaned) < 2:
+        return {cleaned} if cleaned else set()
+
+    return {cleaned[i : i + 3] for i in range(len(cleaned) - 2)}
+
+
+def _hash_shingle(shingle: str, seed: int) -> int:
+    """Generate a deterministic 64-bit hash for a shingle given the permutation seed."""
+    digest = blake2b(f'{seed}:{shingle}'.encode(), digest_size=8)
+    return int.from_bytes(digest.digest(), 'big')
+
+
+def _minhash_signature(shingles: Iterable[str]) -> tuple[int, ...]:
+    """Compute the MinHash signature for the shingle set across predefined permutations."""
+    if not shingles:
+        return tuple()
+
+    seeds = range(_MINHASH_PERMUTATIONS)
+    signature: list[int] = []
+    for seed in seeds:
+        min_hash = min(_hash_shingle(shingle, seed) for shingle in shingles)
+        signature.append(min_hash)
+
+    return tuple(signature)
+
+
+def _lsh_bands(signature: Iterable[int]) -> list[tuple[int, ...]]:
+    """Split the MinHash signature into fixed-size bands for locality-sensitive hashing."""
+    signature_list = list(signature)
+    if not signature_list:
+        return []
+
+    bands: list[tuple[int, ...]] = []
+    for start in range(0, len(signature_list), _MINHASH_BAND_SIZE):
+        band = tuple(signature_list[start : start + _MINHASH_BAND_SIZE])
+        if len(band) == _MINHASH_BAND_SIZE:
+            bands.append(band)
+    return bands
+
+
+def _jaccard_similarity(a: set[str], b: set[str]) -> float:
+    """Return the Jaccard similarity between two shingle sets, handling empty edge cases."""
+    if not a and not b:
+        return 1.0
+    if not a or not b:
+        return 0.0
+
+    intersection = len(a.intersection(b))
+    union = len(a.union(b))
+    return intersection / union if union else 0.0
+
+
+@lru_cache(maxsize=512)
+def _cached_shingles(name: str) -> set[str]:
+    """Cache shingle sets per normalized name to avoid recomputation within a worker."""
+    return _shingles(name)
+
+
+@dataclass
+class DedupCandidateIndexes:
+    """Precomputed lookup structures that drive entity deduplication heuristics."""
+
+    existing_nodes: list[EntityNode]
+    nodes_by_uuid: dict[str, EntityNode]
+    normalized_existing: defaultdict[str, list[EntityNode]]
+    shingles_by_candidate: dict[str, set[str]]
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
+
+
+@dataclass
+class DedupResolutionState:
+    """Mutable resolution bookkeeping shared across deterministic and LLM passes."""
+
+    resolved_nodes: list[EntityNode | None]
+    uuid_map: dict[str, str]
+    unresolved_indices: list[int]
+    duplicate_pairs: list[tuple[EntityNode, EntityNode]] = field(default_factory=list)
+
+
+def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
+    """Precompute exact and fuzzy lookup structures once per dedupe run."""
+    normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
+    nodes_by_uuid: dict[str, EntityNode] = {}
+    shingles_by_candidate: dict[str, set[str]] = {}
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
+
+    for candidate in existing_nodes:
+        normalized = _normalize_string_exact(candidate.name)
+        normalized_existing[normalized].append(candidate)
+        nodes_by_uuid[candidate.uuid] = candidate
+
+        shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
+        shingles_by_candidate[candidate.uuid] = shingles
+
+        signature = _minhash_signature(shingles)
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            lsh_buckets[(band_index, band)].append(candidate.uuid)
+
+    return DedupCandidateIndexes(
+        existing_nodes=existing_nodes,
+        nodes_by_uuid=nodes_by_uuid,
+        normalized_existing=normalized_existing,
+        shingles_by_candidate=shingles_by_candidate,
+        lsh_buckets=lsh_buckets,
+    )
+
+
+def _resolve_with_similarity(
+    extracted_nodes: list[EntityNode],
+    indexes: DedupCandidateIndexes,
+    state: DedupResolutionState,
+) -> None:
+    """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
+    for idx, node in enumerate(extracted_nodes):
+        normalized_exact = _normalize_string_exact(node.name)
+        normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
+
+        if not _has_high_entropy(normalized_fuzzy):
+            state.unresolved_indices.append(idx)
+            continue
+
+        existing_matches = indexes.normalized_existing.get(normalized_exact, [])
+        if len(existing_matches) == 1:
+            match = existing_matches[0]
+            state.resolved_nodes[idx] = match
+            state.uuid_map[node.uuid] = match.uuid
+            if match.uuid != node.uuid:
+                state.duplicate_pairs.append((node, match))
+            continue
+        if len(existing_matches) > 1:
+            state.unresolved_indices.append(idx)
+            continue
+
+        shingles = _cached_shingles(normalized_fuzzy)
+        signature = _minhash_signature(shingles)
+        candidate_ids: set[str] = set()
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            candidate_ids.update(indexes.lsh_buckets.get((band_index, band), []))
+
+        best_candidate: EntityNode | None = None
+        best_score = 0.0
+        for candidate_id in candidate_ids:
+            candidate_shingles = indexes.shingles_by_candidate.get(candidate_id, set())
+            score = _jaccard_similarity(shingles, candidate_shingles)
+            if score > best_score:
+                best_score = score
+                best_candidate = indexes.nodes_by_uuid.get(candidate_id)
+
+        if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
+            state.resolved_nodes[idx] = best_candidate
+            state.uuid_map[node.uuid] = best_candidate.uuid
+            if best_candidate.uuid != node.uuid:
+                state.duplicate_pairs.append((node, best_candidate))
+            continue
+
+        state.unresolved_indices.append(idx)
+
+
+__all__ = [
+    'DedupCandidateIndexes',
+    'DedupResolutionState',
+    '_normalize_string_exact',
+    '_normalize_name_for_fuzzy',
+    '_has_high_entropy',
+    '_minhash_signature',
+    '_lsh_bands',
+    '_jaccard_similarity',
+    '_cached_shingles',
+    '_FUZZY_JACCARD_THRESHOLD',
+    '_build_candidate_indexes',
+    '_resolve_with_similarity',
+]
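Taken together, these helpers implement a standard MinHash/LSH pipeline: 32 permutations split into bands of 4 hashes give 8 bands, two names become dedup candidates whenever any band collides, and only then is the exact Jaccard similarity checked against the 0.9 threshold. Names that fail _has_high_entropy skip the fuzzy path entirely and are left for the LLM pass. A small usage sketch of the string-level helpers (example names are invented; the module path comes from the file list above):

    from graphiti_core.utils.maintenance.dedup_helpers import (
        _cached_shingles,
        _jaccard_similarity,
        _lsh_bands,
        _minhash_signature,
        _normalize_name_for_fuzzy,
    )

    # Case and punctuation differences disappear during normalization, so the
    # two names yield identical 3-gram shingle sets and Jaccard similarity 1.0.
    a = _cached_shingles(_normalize_name_for_fuzzy('Acme Corporation'))
    b = _cached_shingles(_normalize_name_for_fuzzy('ACME corporation'))
    print(_jaccard_similarity(a, b))  # 1.0

    # Identical shingle sets produce identical MinHash signatures, so all
    # 8 LSH bands (32 permutations / band size 4) collide.
    bands_a = _lsh_bands(_minhash_signature(a))
    bands_b = _lsh_bands(_minhash_signature(b))
    print(len(bands_a), bands_a == bands_b)  # 8 True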