graphiti-core 0.17.4__py3-none-any.whl → 0.24.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- graphiti_core/cross_encoder/gemini_reranker_client.py +1 -1
- graphiti_core/cross_encoder/openai_reranker_client.py +1 -1
- graphiti_core/decorators.py +110 -0
- graphiti_core/driver/driver.py +62 -2
- graphiti_core/driver/falkordb_driver.py +215 -23
- graphiti_core/driver/graph_operations/graph_operations.py +191 -0
- graphiti_core/driver/kuzu_driver.py +182 -0
- graphiti_core/driver/neo4j_driver.py +61 -8
- graphiti_core/driver/neptune_driver.py +305 -0
- graphiti_core/driver/search_interface/search_interface.py +89 -0
- graphiti_core/edges.py +264 -132
- graphiti_core/embedder/azure_openai.py +10 -3
- graphiti_core/embedder/client.py +2 -1
- graphiti_core/graph_queries.py +114 -101
- graphiti_core/graphiti.py +582 -255
- graphiti_core/graphiti_types.py +2 -0
- graphiti_core/helpers.py +21 -14
- graphiti_core/llm_client/anthropic_client.py +142 -52
- graphiti_core/llm_client/azure_openai_client.py +57 -19
- graphiti_core/llm_client/client.py +83 -21
- graphiti_core/llm_client/config.py +1 -1
- graphiti_core/llm_client/gemini_client.py +75 -57
- graphiti_core/llm_client/openai_base_client.py +94 -50
- graphiti_core/llm_client/openai_client.py +28 -8
- graphiti_core/llm_client/openai_generic_client.py +91 -56
- graphiti_core/models/edges/edge_db_queries.py +259 -35
- graphiti_core/models/nodes/node_db_queries.py +311 -32
- graphiti_core/nodes.py +388 -164
- graphiti_core/prompts/dedupe_edges.py +42 -31
- graphiti_core/prompts/dedupe_nodes.py +56 -39
- graphiti_core/prompts/eval.py +4 -4
- graphiti_core/prompts/extract_edges.py +23 -14
- graphiti_core/prompts/extract_nodes.py +73 -32
- graphiti_core/prompts/prompt_helpers.py +39 -0
- graphiti_core/prompts/snippets.py +29 -0
- graphiti_core/prompts/summarize_nodes.py +23 -25
- graphiti_core/search/search.py +154 -74
- graphiti_core/search/search_config.py +39 -4
- graphiti_core/search/search_filters.py +109 -31
- graphiti_core/search/search_helpers.py +5 -6
- graphiti_core/search/search_utils.py +1360 -473
- graphiti_core/tracer.py +193 -0
- graphiti_core/utils/bulk_utils.py +216 -90
- graphiti_core/utils/datetime_utils.py +13 -0
- graphiti_core/utils/maintenance/community_operations.py +62 -38
- graphiti_core/utils/maintenance/dedup_helpers.py +262 -0
- graphiti_core/utils/maintenance/edge_operations.py +286 -126
- graphiti_core/utils/maintenance/graph_data_operations.py +44 -74
- graphiti_core/utils/maintenance/node_operations.py +320 -158
- graphiti_core/utils/maintenance/temporal_operations.py +11 -3
- graphiti_core/utils/ontology_utils/entity_types_utils.py +1 -1
- graphiti_core/utils/text_utils.py +53 -0
- {graphiti_core-0.17.4.dist-info → graphiti_core-0.24.3.dist-info}/METADATA +221 -87
- graphiti_core-0.24.3.dist-info/RECORD +86 -0
- {graphiti_core-0.17.4.dist-info → graphiti_core-0.24.3.dist-info}/WHEEL +1 -1
- graphiti_core-0.17.4.dist-info/RECORD +0 -77
- /graphiti_core/{utils/maintenance/utils.py → migrations/__init__.py} +0 -0
- {graphiti_core-0.17.4.dist-info → graphiti_core-0.24.3.dist-info}/licenses/LICENSE +0 -0
--- a/graphiti_core/utils/maintenance/community_operations.py
+++ b/graphiti_core/utils/maintenance/community_operations.py
@@ -4,11 +4,12 @@ from collections import defaultdict
 
 from pydantic import BaseModel
 
-from graphiti_core.driver.driver import GraphDriver
+from graphiti_core.driver.driver import GraphDriver, GraphProvider
 from graphiti_core.edges import CommunityEdge
 from graphiti_core.embedder import EmbedderClient
 from graphiti_core.helpers import semaphore_gather
 from graphiti_core.llm_client import LLMClient
+from graphiti_core.models.nodes.node_db_queries import COMMUNITY_NODE_RETURN
 from graphiti_core.nodes import CommunityNode, EntityNode, get_community_node_from_record
 from graphiti_core.prompts import prompt_library
 from graphiti_core.prompts.summarize_nodes import Summary, SummaryDescription
@@ -33,10 +34,11 @@ async def get_community_clusters(
     if group_ids is None:
         group_id_values, _, _ = await driver.execute_query(
             """
-            MATCH (n:Entity WHERE n.group_id IS NOT NULL)
-            RETURN
-                collect(DISTINCT n.group_id) AS group_ids
-            """
+            MATCH (n:Entity)
+            WHERE n.group_id IS NOT NULL
+            RETURN
+                collect(DISTINCT n.group_id) AS group_ids
+            """
         )
 
         group_ids = group_id_values[0]['group_ids'] if group_id_values else []
@@ -45,14 +47,21 @@ async def get_community_clusters(
         projection: dict[str, list[Neighbor]] = {}
         nodes = await EntityNode.get_by_group_ids(driver, [group_id])
         for node in nodes:
-            records, _, _ = await driver.execute_query(
+            match_query = """
+                MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[e:RELATES_TO]-(m: Entity {group_id: $group_id})
+            """
+            if driver.provider == GraphProvider.KUZU:
+                match_query = """
+                    MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-(m: Entity {group_id: $group_id})
                 """
-                MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[e:RELATES_TO]-(m: Entity {group_id: $group_id})
-                WITH count(e) AS count, m.uuid AS uuid
-                RETURN
-                    uuid,
-                    count
-                """,
+            records, _, _ = await driver.execute_query(
+                match_query
+                + """
+                WITH count(e) AS count, m.uuid AS uuid
+                RETURN
+                    uuid,
+                    count
+                """,
                 uuid=node.uuid,
                 group_id=group_id,
             )
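The Kuzu branch above exists because, as the added pattern suggests, Kuzu materializes each RELATES_TO edge as an intermediate RelatesToNode_ node, so the single-hop pattern used on other backends becomes a two-hop traversal. A minimal sketch of the selection logic; build_neighbor_match is a hypothetical helper name for illustration, since the diff inlines this directly in get_community_clusters:

    from graphiti_core.driver.driver import GraphDriver, GraphProvider


    def build_neighbor_match(driver: GraphDriver) -> str:
        # Pick the neighbor-count MATCH clause for the active graph backend.
        if driver.provider == GraphProvider.KUZU:
            # Kuzu stores RELATES_TO through an intermediate node, so the
            # pattern traverses Entity -> RelatesToNode_ -> Entity.
            return (
                'MATCH (n:Entity {group_id: $group_id, uuid: $uuid})'
                '-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-'
                '(m:Entity {group_id: $group_id})'
            )
        return (
            'MATCH (n:Entity {group_id: $group_id, uuid: $uuid})'
            '-[e:RELATES_TO]-(m:Entity {group_id: $group_id})'
        )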
@@ -124,10 +133,14 @@ def label_propagation(projection: dict[str, list[Neighbor]]) -> list[list[str]]:
 
 async def summarize_pair(llm_client: LLMClient, summary_pair: tuple[str, str]) -> str:
     # Prepare context for LLM
-    context = {'node_summaries': [{'summary': summary} for summary in summary_pair]}
+    context = {
+        'node_summaries': [{'summary': summary} for summary in summary_pair],
+    }
 
     llm_response = await llm_client.generate_response(
-        prompt_library.summarize_nodes.summarize_pair(context), response_model=Summary
+        prompt_library.summarize_nodes.summarize_pair(context),
+        response_model=Summary,
+        prompt_name='summarize_nodes.summarize_pair',
     )
 
     pair_summary = llm_response.get('summary', '')
@@ -136,11 +149,14 @@ async def summarize_pair(llm_client: LLMClient, summary_pair: tuple[str, str]) -
 
 
 async def generate_summary_description(llm_client: LLMClient, summary: str) -> str:
-    context = {'summary': summary}
+    context = {
+        'summary': summary,
+    }
 
     llm_response = await llm_client.generate_response(
         prompt_library.summarize_nodes.summary_description(context),
         response_model=SummaryDescription,
+        prompt_name='summarize_nodes.summary_description',
     )
 
     description = llm_response.get('description', '')
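Both summarization helpers now pass a prompt_name alongside response_model. Presumably this tags each LLM call for the tracing support added in this release (tracer.py is new in the file list above); the actual wiring is not shown in these hunks. A hypothetical sketch of how such a tag could be consumed; the wrapper name and logging are illustrative only:

    import time
    from typing import Any


    async def traced_generate_response(client: Any, messages: Any, **kwargs: Any) -> dict[str, Any]:
        # Illustrative wrapper only; the real plumbing lives in
        # graphiti_core/llm_client/client.py and graphiti_core/tracer.py.
        prompt_name = kwargs.get('prompt_name', 'unknown')
        start = time.monotonic()
        try:
            return await client.generate_response(messages, **kwargs)
        finally:
            elapsed_ms = (time.monotonic() - start) * 1000
            print(f'llm_call prompt={prompt_name} elapsed={elapsed_ms:.0f}ms')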
@@ -191,7 +207,9 @@ async def build_community(
 
 
 async def build_communities(
-    driver: GraphDriver, llm_client: LLMClient, group_ids: list[str] | None
+    driver: GraphDriver,
+    llm_client: LLMClient,
+    group_ids: list[str] | None,
 ) -> tuple[list[CommunityNode], list[CommunityEdge]]:
     community_clusters = await get_community_clusters(driver, group_ids)
 
@@ -219,9 +237,9 @@ async def build_communities(
 async def remove_communities(driver: GraphDriver):
     await driver.execute_query(
         """
-    MATCH (c:Community)
-    DETACH DELETE c
-    """
+        MATCH (c:Community)
+        DETACH DELETE c
+        """
     )
 
 
@@ -231,14 +249,10 @@ async def determine_entity_community(
     # Check if the node is already part of a community
     records, _, _ = await driver.execute_query(
         """
-        MATCH (c:Community)-[:HAS_MEMBER]->(n:Entity {uuid: $entity_uuid})
-        RETURN
-            c.uuid As uuid,
-            c.name AS name,
-            c.group_id AS group_id,
-            c.created_at AS created_at,
-            c.summary AS summary
-        """,
+        MATCH (c:Community)-[:HAS_MEMBER]->(n:Entity {uuid: $entity_uuid})
+        RETURN
+        """
+        + COMMUNITY_NODE_RETURN,
         entity_uuid=entity.uuid,
     )
 
@@ -246,16 +260,19 @@ async def determine_entity_community(
         return get_community_node_from_record(records[0]), False
 
     # If the node has no community, add it to the mode community of surrounding entities
+    match_query = """
+        MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
+    """
+    if driver.provider == GraphProvider.KUZU:
+        match_query = """
+            MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
+        """
     records, _, _ = await driver.execute_query(
+        match_query
+        + """
+        RETURN
         """
-        MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
-        RETURN
-            c.uuid As uuid,
-            c.name AS name,
-            c.group_id AS group_id,
-            c.created_at AS created_at,
-            c.summary AS summary
-        """,
+        + COMMUNITY_NODE_RETURN,
         entity_uuid=entity.uuid,
     )
 
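Both queries in determine_entity_community now splice in the shared COMMUNITY_NODE_RETURN constant (imported at the top of the file) instead of repeating the projection inline. Judging from the removed lines, the constant covers the same fields; the shape below is inferred for orientation only, and the authoritative definition lives in graphiti_core/models/nodes/node_db_queries.py:

    # Assumed shape, reconstructed from the inline RETURN lists removed above;
    # the real constant may differ or include additional fields.
    COMMUNITY_NODE_RETURN = """
        c.uuid AS uuid,
        c.name AS name,
        c.group_id AS group_id,
        c.created_at AS created_at,
        c.summary AS summary
    """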
@@ -285,12 +302,15 @@ async def determine_entity_community(
 
 
 async def update_community(
-    driver: GraphDriver, llm_client: LLMClient, embedder: EmbedderClient, entity: EntityNode
-):
+    driver: GraphDriver,
+    llm_client: LLMClient,
+    embedder: EmbedderClient,
+    entity: EntityNode,
+) -> tuple[list[CommunityNode], list[CommunityEdge]]:
     community, is_new = await determine_entity_community(driver, entity)
 
     if community is None:
-        return
+        return [], []
 
     new_summary = await summarize_pair(llm_client, (entity.summary, community.summary))
     new_name = await generate_summary_description(llm_client, new_summary)
@@ -298,10 +318,14 @@ async def update_community(
     community.summary = new_summary
     community.name = new_name
 
+    community_edges = []
     if is_new:
         community_edge = (build_community_edges([entity], community, utc_now()))[0]
         await community_edge.save(driver)
+        community_edges.append(community_edge)
 
     await community.generate_name_embedding(embedder)
 
     await community.save(driver)
+
+    return [community], community_edges
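update_community previously returned None on every path; it now always returns a (community_nodes, community_edges) tuple, so callers can collect everything the call touched without a None check. A minimal usage sketch under the new signatures; driver, llm_client, embedder, and entity are assumed to be already-configured graphiti-core objects, and the wrapper function is illustrative:

    from graphiti_core.utils.maintenance.community_operations import (
        build_communities,
        update_community,
    )


    async def refresh_communities(driver, llm_client, embedder, entity, group_ids=None):
        # Full rebuild across the requested group_ids.
        nodes, edges = await build_communities(driver, llm_client, group_ids)
        # Incremental touch-up for one entity; a miss now yields ([], [])
        # instead of None, so the results concatenate safely.
        touched_nodes, touched_edges = await update_community(driver, llm_client, embedder, entity)
        return nodes + touched_nodes, edges + touched_edges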
--- /dev/null
+++ b/graphiti_core/utils/maintenance/dedup_helpers.py
@@ -0,0 +1,262 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import annotations
+
+import math
+import re
+from collections import defaultdict
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from functools import lru_cache
+from hashlib import blake2b
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from graphiti_core.nodes import EntityNode
+
+_NAME_ENTROPY_THRESHOLD = 1.5
+_MIN_NAME_LENGTH = 6
+_MIN_TOKEN_COUNT = 2
+_FUZZY_JACCARD_THRESHOLD = 0.9
+_MINHASH_PERMUTATIONS = 32
+_MINHASH_BAND_SIZE = 4
+
+
+def _normalize_string_exact(name: str) -> str:
+    """Lowercase text and collapse whitespace so equal names map to the same key."""
+    normalized = re.sub(r'[\s]+', ' ', name.lower())
+    return normalized.strip()
+
+
+def _normalize_name_for_fuzzy(name: str) -> str:
+    """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
+    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
+    normalized = normalized.strip()
+    return re.sub(r'[\s]+', ' ', normalized)
+
+
+def _name_entropy(normalized_name: str) -> float:
+    """Approximate text specificity using Shannon entropy over characters.
+
+    We strip spaces, count how often each character appears, and sum
+    probability * -log2(probability). Short or repetitive names yield low
+    entropy, which signals we should defer resolution to the LLM instead of
+    trusting fuzzy similarity.
+    """
+    if not normalized_name:
+        return 0.0
+
+    counts: dict[str, int] = {}
+    for char in normalized_name.replace(' ', ''):
+        counts[char] = counts.get(char, 0) + 1
+
+    total = sum(counts.values())
+    if total == 0:
+        return 0.0
+
+    entropy = 0.0
+    for count in counts.values():
+        probability = count / total
+        entropy -= probability * math.log2(probability)
+
+    return entropy
+
+
+def _has_high_entropy(normalized_name: str) -> bool:
+    """Filter out very short or low-entropy names that are unreliable for fuzzy matching."""
+    token_count = len(normalized_name.split())
+    if len(normalized_name) < _MIN_NAME_LENGTH and token_count < _MIN_TOKEN_COUNT:
+        return False
+
+    return _name_entropy(normalized_name) >= _NAME_ENTROPY_THRESHOLD
+
+
+def _shingles(normalized_name: str) -> set[str]:
+    """Create 3-gram shingles from the normalized name for MinHash calculations."""
+    cleaned = normalized_name.replace(' ', '')
+    if len(cleaned) < 2:
+        return {cleaned} if cleaned else set()
+
+    return {cleaned[i : i + 3] for i in range(len(cleaned) - 2)}
+
+
+def _hash_shingle(shingle: str, seed: int) -> int:
+    """Generate a deterministic 64-bit hash for a shingle given the permutation seed."""
+    digest = blake2b(f'{seed}:{shingle}'.encode(), digest_size=8)
+    return int.from_bytes(digest.digest(), 'big')
+
+
+def _minhash_signature(shingles: Iterable[str]) -> tuple[int, ...]:
+    """Compute the MinHash signature for the shingle set across predefined permutations."""
+    if not shingles:
+        return tuple()
+
+    seeds = range(_MINHASH_PERMUTATIONS)
+    signature: list[int] = []
+    for seed in seeds:
+        min_hash = min(_hash_shingle(shingle, seed) for shingle in shingles)
+        signature.append(min_hash)
+
+    return tuple(signature)
+
+
+def _lsh_bands(signature: Iterable[int]) -> list[tuple[int, ...]]:
+    """Split the MinHash signature into fixed-size bands for locality-sensitive hashing."""
+    signature_list = list(signature)
+    if not signature_list:
+        return []
+
+    bands: list[tuple[int, ...]] = []
+    for start in range(0, len(signature_list), _MINHASH_BAND_SIZE):
+        band = tuple(signature_list[start : start + _MINHASH_BAND_SIZE])
+        if len(band) == _MINHASH_BAND_SIZE:
+            bands.append(band)
+    return bands
+
+
+def _jaccard_similarity(a: set[str], b: set[str]) -> float:
+    """Return the Jaccard similarity between two shingle sets, handling empty edge cases."""
+    if not a and not b:
+        return 1.0
+    if not a or not b:
+        return 0.0
+
+    intersection = len(a.intersection(b))
+    union = len(a.union(b))
+    return intersection / union if union else 0.0
+
+
+@lru_cache(maxsize=512)
+def _cached_shingles(name: str) -> set[str]:
+    """Cache shingle sets per normalized name to avoid recomputation within a worker."""
+    return _shingles(name)
+
+
+@dataclass
+class DedupCandidateIndexes:
+    """Precomputed lookup structures that drive entity deduplication heuristics."""
+
+    existing_nodes: list[EntityNode]
+    nodes_by_uuid: dict[str, EntityNode]
+    normalized_existing: defaultdict[str, list[EntityNode]]
+    shingles_by_candidate: dict[str, set[str]]
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
+
+
+@dataclass
+class DedupResolutionState:
+    """Mutable resolution bookkeeping shared across deterministic and LLM passes."""
+
+    resolved_nodes: list[EntityNode | None]
+    uuid_map: dict[str, str]
+    unresolved_indices: list[int]
+    duplicate_pairs: list[tuple[EntityNode, EntityNode]] = field(default_factory=list)
+
+
+def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
+    """Precompute exact and fuzzy lookup structures once per dedupe run."""
+    normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
+    nodes_by_uuid: dict[str, EntityNode] = {}
+    shingles_by_candidate: dict[str, set[str]] = {}
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
+
+    for candidate in existing_nodes:
+        normalized = _normalize_string_exact(candidate.name)
+        normalized_existing[normalized].append(candidate)
+        nodes_by_uuid[candidate.uuid] = candidate
+
+        shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
+        shingles_by_candidate[candidate.uuid] = shingles
+
+        signature = _minhash_signature(shingles)
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            lsh_buckets[(band_index, band)].append(candidate.uuid)
+
+    return DedupCandidateIndexes(
+        existing_nodes=existing_nodes,
+        nodes_by_uuid=nodes_by_uuid,
+        normalized_existing=normalized_existing,
+        shingles_by_candidate=shingles_by_candidate,
+        lsh_buckets=lsh_buckets,
+    )
+
+
+def _resolve_with_similarity(
+    extracted_nodes: list[EntityNode],
+    indexes: DedupCandidateIndexes,
+    state: DedupResolutionState,
+) -> None:
+    """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
+    for idx, node in enumerate(extracted_nodes):
+        normalized_exact = _normalize_string_exact(node.name)
+        normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
+
+        if not _has_high_entropy(normalized_fuzzy):
+            state.unresolved_indices.append(idx)
+            continue
+
+        existing_matches = indexes.normalized_existing.get(normalized_exact, [])
+        if len(existing_matches) == 1:
+            match = existing_matches[0]
+            state.resolved_nodes[idx] = match
+            state.uuid_map[node.uuid] = match.uuid
+            if match.uuid != node.uuid:
+                state.duplicate_pairs.append((node, match))
+            continue
+        if len(existing_matches) > 1:
+            state.unresolved_indices.append(idx)
+            continue
+
+        shingles = _cached_shingles(normalized_fuzzy)
+        signature = _minhash_signature(shingles)
+        candidate_ids: set[str] = set()
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            candidate_ids.update(indexes.lsh_buckets.get((band_index, band), []))
+
+        best_candidate: EntityNode | None = None
+        best_score = 0.0
+        for candidate_id in candidate_ids:
+            candidate_shingles = indexes.shingles_by_candidate.get(candidate_id, set())
+            score = _jaccard_similarity(shingles, candidate_shingles)
+            if score > best_score:
+                best_score = score
+                best_candidate = indexes.nodes_by_uuid.get(candidate_id)
+
+        if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
+            state.resolved_nodes[idx] = best_candidate
+            state.uuid_map[node.uuid] = best_candidate.uuid
+            if best_candidate.uuid != node.uuid:
+                state.duplicate_pairs.append((node, best_candidate))
+            continue
+
+        state.unresolved_indices.append(idx)
+
+
+__all__ = [
+    'DedupCandidateIndexes',
+    'DedupResolutionState',
+    '_normalize_string_exact',
+    '_normalize_name_for_fuzzy',
+    '_has_high_entropy',
+    '_minhash_signature',
+    '_lsh_bands',
+    '_jaccard_similarity',
+    '_cached_shingles',
+    '_FUZZY_JACCARD_THRESHOLD',
+    '_build_candidate_indexes',
+    '_resolve_with_similarity',
+]
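With 32 permutations and a band size of 4, each name gets 8 LSH bands; two names become fuzzy candidates when any band collides, and a candidate is accepted only if the exact Jaccard similarity of the 3-gram shingle sets reaches the 0.9 threshold. A usage sketch of the deterministic pass, using only names this module exports; the EntityNode construction is simplified and may need extra fields depending on the model's schema:

    from graphiti_core.nodes import EntityNode
    from graphiti_core.utils.maintenance.dedup_helpers import (
        DedupResolutionState,
        _build_candidate_indexes,
        _resolve_with_similarity,
    )

    # Field set shown here is illustrative; construct per your EntityNode schema.
    existing = [EntityNode(name='Alice Johnson', group_id='g1')]
    extracted = [EntityNode(name='alice  johnson', group_id='g1')]

    indexes = _build_candidate_indexes(existing)
    state = DedupResolutionState(
        resolved_nodes=[None] * len(extracted),
        uuid_map={},
        unresolved_indices=[],
    )
    _resolve_with_similarity(extracted, indexes, state)

    # 'alice  johnson' normalizes to 'alice johnson', hits the exact-name index,
    # and resolves deterministically; anything left in state.unresolved_indices
    # falls through to the LLM dedupe pass.
    print(state.uuid_map, state.unresolved_indices)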