graphiti-core 0.12.0rc1__py3-none-any.whl → 0.24.3__py3-none-any.whl
This diff shows the contents of publicly released package versions as published to a supported registry, and is provided for informational purposes only. In the hunks below, a `…` marks removed-line text that the original diff rendering truncated.
- graphiti_core/cross_encoder/bge_reranker_client.py +12 -2
- graphiti_core/cross_encoder/gemini_reranker_client.py +161 -0
- graphiti_core/cross_encoder/openai_reranker_client.py +7 -5
- graphiti_core/decorators.py +110 -0
- graphiti_core/driver/__init__.py +19 -0
- graphiti_core/driver/driver.py +124 -0
- graphiti_core/driver/falkordb_driver.py +362 -0
- graphiti_core/driver/graph_operations/graph_operations.py +191 -0
- graphiti_core/driver/kuzu_driver.py +182 -0
- graphiti_core/driver/neo4j_driver.py +117 -0
- graphiti_core/driver/neptune_driver.py +305 -0
- graphiti_core/driver/search_interface/search_interface.py +89 -0
- graphiti_core/edges.py +287 -172
- graphiti_core/embedder/azure_openai.py +71 -0
- graphiti_core/embedder/client.py +2 -1
- graphiti_core/embedder/gemini.py +116 -22
- graphiti_core/embedder/voyage.py +13 -2
- graphiti_core/errors.py +8 -0
- graphiti_core/graph_queries.py +162 -0
- graphiti_core/graphiti.py +705 -193
- graphiti_core/graphiti_types.py +4 -2
- graphiti_core/helpers.py +87 -10
- graphiti_core/llm_client/__init__.py +16 -0
- graphiti_core/llm_client/anthropic_client.py +159 -56
- graphiti_core/llm_client/azure_openai_client.py +115 -0
- graphiti_core/llm_client/client.py +98 -21
- graphiti_core/llm_client/config.py +1 -1
- graphiti_core/llm_client/gemini_client.py +290 -41
- graphiti_core/llm_client/groq_client.py +14 -3
- graphiti_core/llm_client/openai_base_client.py +261 -0
- graphiti_core/llm_client/openai_client.py +56 -132
- graphiti_core/llm_client/openai_generic_client.py +91 -56
- graphiti_core/models/edges/edge_db_queries.py +259 -35
- graphiti_core/models/nodes/node_db_queries.py +311 -32
- graphiti_core/nodes.py +420 -205
- graphiti_core/prompts/dedupe_edges.py +46 -32
- graphiti_core/prompts/dedupe_nodes.py +67 -42
- graphiti_core/prompts/eval.py +4 -4
- graphiti_core/prompts/extract_edges.py +27 -16
- graphiti_core/prompts/extract_nodes.py +74 -31
- graphiti_core/prompts/prompt_helpers.py +39 -0
- graphiti_core/prompts/snippets.py +29 -0
- graphiti_core/prompts/summarize_nodes.py +23 -25
- graphiti_core/search/search.py +158 -82
- graphiti_core/search/search_config.py +39 -4
- graphiti_core/search/search_filters.py +126 -35
- graphiti_core/search/search_helpers.py +5 -6
- graphiti_core/search/search_utils.py +1405 -485
- graphiti_core/telemetry/__init__.py +9 -0
- graphiti_core/telemetry/telemetry.py +117 -0
- graphiti_core/tracer.py +193 -0
- graphiti_core/utils/bulk_utils.py +364 -285
- graphiti_core/utils/datetime_utils.py +13 -0
- graphiti_core/utils/maintenance/community_operations.py +67 -49
- graphiti_core/utils/maintenance/dedup_helpers.py +262 -0
- graphiti_core/utils/maintenance/edge_operations.py +339 -197
- graphiti_core/utils/maintenance/graph_data_operations.py +50 -114
- graphiti_core/utils/maintenance/node_operations.py +319 -238
- graphiti_core/utils/maintenance/temporal_operations.py +11 -3
- graphiti_core/utils/ontology_utils/entity_types_utils.py +1 -1
- graphiti_core/utils/text_utils.py +53 -0
- graphiti_core-0.24.3.dist-info/METADATA +726 -0
- graphiti_core-0.24.3.dist-info/RECORD +86 -0
- {graphiti_core-0.12.0rc1.dist-info → graphiti_core-0.24.3.dist-info}/WHEEL +1 -1
- graphiti_core-0.12.0rc1.dist-info/METADATA +0 -350
- graphiti_core-0.12.0rc1.dist-info/RECORD +0 -66
- /graphiti_core/{utils/maintenance/utils.py → migrations/__init__.py} +0 -0
- {graphiti_core-0.12.0rc1.dist-info → graphiti_core-0.24.3.dist-info/licenses}/LICENSE +0 -0
graphiti_core/utils/datetime_utils.py

@@ -40,3 +40,16 @@ def ensure_utc(dt: datetime | None) -> datetime | None:
         return dt.astimezone(timezone.utc)
 
     return dt
+
+
+def convert_datetimes_to_strings(obj):
+    if isinstance(obj, dict):
+        return {k: convert_datetimes_to_strings(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_datetimes_to_strings(item) for item in obj]
+    elif isinstance(obj, tuple):
+        return tuple(convert_datetimes_to_strings(item) for item in obj)
+    elif isinstance(obj, datetime):
+        return obj.isoformat()
+    else:
+        return obj
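A quick sketch of the new helper in use (the payload below is illustrative): it walks dicts, lists, and tuples recursively and ISO-formats any datetime it finds.

```python
from datetime import datetime, timezone

from graphiti_core.utils.datetime_utils import convert_datetimes_to_strings

# Illustrative payload: a datetime nested inside ordinary containers.
payload = {'created_at': datetime(2024, 1, 1, tzinfo=timezone.utc), 'tags': ('a', 'b')}
print(convert_datetimes_to_strings(payload))
# {'created_at': '2024-01-01T00:00:00+00:00', 'tags': ('a', 'b')}
```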
graphiti_core/utils/maintenance/community_operations.py

@@ -2,13 +2,14 @@ import asyncio
 import logging
 from collections import defaultdict
 
-from neo4j import AsyncDriver
 from pydantic import BaseModel
 
+from graphiti_core.driver.driver import GraphDriver, GraphProvider
 from graphiti_core.edges import CommunityEdge
 from graphiti_core.embedder import EmbedderClient
-from graphiti_core.helpers import …
+from graphiti_core.helpers import semaphore_gather
 from graphiti_core.llm_client import LLMClient
+from graphiti_core.models.nodes.node_db_queries import COMMUNITY_NODE_RETURN
 from graphiti_core.nodes import CommunityNode, EntityNode, get_community_node_from_record
 from graphiti_core.prompts import prompt_library
 from graphiti_core.prompts.summarize_nodes import Summary, SummaryDescription
@@ -26,37 +27,43 @@ class Neighbor(BaseModel):
 
 
 async def get_community_clusters(
-    driver: …
+    driver: GraphDriver, group_ids: list[str] | None
 ) -> list[list[EntityNode]]:
     community_clusters: list[list[EntityNode]] = []
 
     if group_ids is None:
         group_id_values, _, _ = await driver.execute_query(
             """
-            …
-            …
-            …
-            …
-            …
+            MATCH (n:Entity)
+            WHERE n.group_id IS NOT NULL
+            RETURN
+                collect(DISTINCT n.group_id) AS group_ids
+            """
         )
 
-        group_ids = group_id_values[0]['group_ids']
+        group_ids = group_id_values[0]['group_ids'] if group_id_values else []
 
     for group_id in group_ids:
         projection: dict[str, list[Neighbor]] = {}
         nodes = await EntityNode.get_by_group_ids(driver, [group_id])
         for node in nodes:
-            …
+            match_query = """
+                MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[e:RELATES_TO]-(m: Entity {group_id: $group_id})
+            """
+            if driver.provider == GraphProvider.KUZU:
+                match_query = """
+                    MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-(m: Entity {group_id: $group_id})
                 """
-            …
-            …
-            …
-                    uuid
-            …
-            …
+            records, _, _ = await driver.execute_query(
+                match_query
+                + """
+                WITH count(e) AS count, m.uuid AS uuid
+                RETURN
+                    uuid,
+                    count
+                """,
                 uuid=node.uuid,
                 group_id=group_id,
-                database_=DEFAULT_DATABASE,
             )
 
             projection[node.uuid] = [
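The Kuzu branch exists because, per the queries above, Kuzu materializes each RELATES_TO fact as an intermediate `RelatesToNode_`, so neighbor traversal takes two hops instead of one relationship match. A sketch of the same branching pattern (the helper name is illustrative, not part of the package):

```python
from graphiti_core.driver.driver import GraphDriver, GraphProvider


def neighbor_match_query(driver: GraphDriver) -> str:
    # Same branching as get_community_clusters above: a two-hop pattern for
    # Kuzu's edge-as-node model, a direct relationship match everywhere else.
    if driver.provider == GraphProvider.KUZU:
        return (
            'MATCH (n:Entity {group_id: $group_id, uuid: $uuid})'
            '-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-'
            '(m:Entity {group_id: $group_id})'
        )
    return (
        'MATCH (n:Entity {group_id: $group_id, uuid: $uuid})'
        '-[e:RELATES_TO]-(m:Entity {group_id: $group_id})'
    )
```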
@@ -95,7 +102,6 @@ def label_propagation(projection: dict[str, list[Neighbor]]) -> list[list[str]]:
         community_candidates: dict[int, int] = defaultdict(int)
         for neighbor in neighbors:
             community_candidates[community_map[neighbor.node_uuid]] += neighbor.edge_count
-
         community_lst = [
             (count, community) for community, count in community_candidates.items()
         ]
@@ -127,10 +133,14 @@ def label_propagation(projection: dict[str, list[Neighbor]]) -> list[list[str]]:
 
 
 async def summarize_pair(llm_client: LLMClient, summary_pair: tuple[str, str]) -> str:
     # Prepare context for LLM
-    context = {…
+    context = {
+        'node_summaries': [{'summary': summary} for summary in summary_pair],
+    }
 
     llm_response = await llm_client.generate_response(
-        prompt_library.summarize_nodes.summarize_pair(context),…
+        prompt_library.summarize_nodes.summarize_pair(context),
+        response_model=Summary,
+        prompt_name='summarize_nodes.summarize_pair',
     )
 
     pair_summary = llm_response.get('summary', '')
@@ -139,11 +149,14 @@ async def summarize_pair(llm_client: LLMClient, summary_pair: tuple[str, str]) -> str:
 
 
 async def generate_summary_description(llm_client: LLMClient, summary: str) -> str:
-    context = {…
+    context = {
+        'summary': summary,
+    }
 
     llm_response = await llm_client.generate_response(
         prompt_library.summarize_nodes.summary_description(context),
         response_model=SummaryDescription,
+        prompt_name='summarize_nodes.summary_description',
     )
 
     description = llm_response.get('description', '')
@@ -194,7 +207,9 @@ async def build_community(
 
 
 async def build_communities(
-    driver: …
+    driver: GraphDriver,
+    llm_client: LLMClient,
+    group_ids: list[str] | None,
 ) -> tuple[list[CommunityNode], list[CommunityEdge]]:
     community_clusters = await get_community_clusters(driver, group_ids)
 
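A hedged call sketch for the new signature (the wrapper function is illustrative): passing `group_ids=None` now means "cluster every group_id found in the graph", per the query added to `get_community_clusters` above.

```python
from graphiti_core.driver.driver import GraphDriver
from graphiti_core.llm_client import LLMClient
from graphiti_core.utils.maintenance.community_operations import build_communities


async def rebuild_all_communities(driver: GraphDriver, llm_client: LLMClient):
    # Returns the freshly built community nodes and membership edges.
    community_nodes, community_edges = await build_communities(driver, llm_client, None)
    return community_nodes, community_edges
```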
@@ -219,50 +234,46 @@ async def build_communities(
     return community_nodes, community_edges
 
 
-async def remove_communities(driver: …
+async def remove_communities(driver: GraphDriver):
     await driver.execute_query(
         """
-        …
-        …
-        …
-        database_=DEFAULT_DATABASE,
+        MATCH (c:Community)
+        DETACH DELETE c
+        """
     )
 
 
 async def determine_entity_community(
-    driver: …
+    driver: GraphDriver, entity: EntityNode
 ) -> tuple[CommunityNode | None, bool]:
     # Check if the node is already part of a community
     records, _, _ = await driver.execute_query(
         """
-        …
-        …
-        …
-        …
-            c.group_id AS group_id,
-            c.created_at AS created_at,
-            c.summary AS summary
-        """,
+        MATCH (c:Community)-[:HAS_MEMBER]->(n:Entity {uuid: $entity_uuid})
+        RETURN
+        """
+        + COMMUNITY_NODE_RETURN,
         entity_uuid=entity.uuid,
-        database_=DEFAULT_DATABASE,
     )
 
     if len(records) > 0:
         return get_community_node_from_record(records[0]), False
 
     # If the node has no community, add it to the mode community of surrounding entities
+    match_query = """
+        MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
+    """
+    if driver.provider == GraphProvider.KUZU:
+        match_query = """
+            MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
+        """
     records, _, _ = await driver.execute_query(
+        match_query
+        + """
+        RETURN
         """
-        …
-        RETURN
-            c.uuid As uuid,
-            c.name AS name,
-            c.group_id AS group_id,
-            c.created_at AS created_at,
-            c.summary AS summary
-        """,
+        + COMMUNITY_NODE_RETURN,
         entity_uuid=entity.uuid,
-        database_=DEFAULT_DATABASE,
     )
 
     communities: list[CommunityNode] = [
@@ -291,12 +302,15 @@ async def determine_entity_community(
 
 
 async def update_community(
-    driver: …
-    …
+    driver: GraphDriver,
+    llm_client: LLMClient,
+    embedder: EmbedderClient,
+    entity: EntityNode,
+) -> tuple[list[CommunityNode], list[CommunityEdge]]:
     community, is_new = await determine_entity_community(driver, entity)
 
     if community is None:
-        return
+        return [], []
 
     new_summary = await summarize_pair(llm_client, (entity.summary, community.summary))
     new_name = await generate_summary_description(llm_client, new_summary)
@@ -304,10 +318,14 @@ async def update_community(
     community.summary = new_summary
     community.name = new_name
 
+    community_edges = []
     if is_new:
         community_edge = (build_community_edges([entity], community, utc_now()))[0]
         await community_edge.save(driver)
+        community_edges.append(community_edge)
 
     await community.generate_name_embedding(embedder)
 
     await community.save(driver)
+
+    return [community], community_edges
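With this change `update_community` returns a `(nodes, edges)` tuple even on the no-community path, so a caller can unpack unconditionally. A minimal sketch (the wrapper name is illustrative):

```python
from graphiti_core.driver.driver import GraphDriver
from graphiti_core.embedder import EmbedderClient
from graphiti_core.llm_client import LLMClient
from graphiti_core.nodes import EntityNode
from graphiti_core.utils.maintenance.community_operations import update_community


async def refresh_community_for(
    driver: GraphDriver,
    llm_client: LLMClient,
    embedder: EmbedderClient,
    entity: EntityNode,
) -> None:
    # No more bare `return`: both elements are always lists, possibly empty.
    communities, community_edges = await update_community(driver, llm_client, embedder, entity)
    if not communities:
        return  # entity had no surrounding community; nothing was saved
```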
graphiti_core/utils/maintenance/dedup_helpers.py (new file)

@@ -0,0 +1,262 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import annotations
+
+import math
+import re
+from collections import defaultdict
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from functools import lru_cache
+from hashlib import blake2b
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from graphiti_core.nodes import EntityNode
+
+_NAME_ENTROPY_THRESHOLD = 1.5
+_MIN_NAME_LENGTH = 6
+_MIN_TOKEN_COUNT = 2
+_FUZZY_JACCARD_THRESHOLD = 0.9
+_MINHASH_PERMUTATIONS = 32
+_MINHASH_BAND_SIZE = 4
+
+
+def _normalize_string_exact(name: str) -> str:
+    """Lowercase text and collapse whitespace so equal names map to the same key."""
+    normalized = re.sub(r'[\s]+', ' ', name.lower())
+    return normalized.strip()
+
+
+def _normalize_name_for_fuzzy(name: str) -> str:
+    """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
+    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
+    normalized = normalized.strip()
+    return re.sub(r'[\s]+', ' ', normalized)
+
+
+def _name_entropy(normalized_name: str) -> float:
+    """Approximate text specificity using Shannon entropy over characters.
+
+    We strip spaces, count how often each character appears, and sum
+    probability * -log2(probability). Short or repetitive names yield low
+    entropy, which signals we should defer resolution to the LLM instead of
+    trusting fuzzy similarity.
+    """
+    if not normalized_name:
+        return 0.0
+
+    counts: dict[str, int] = {}
+    for char in normalized_name.replace(' ', ''):
+        counts[char] = counts.get(char, 0) + 1
+
+    total = sum(counts.values())
+    if total == 0:
+        return 0.0
+
+    entropy = 0.0
+    for count in counts.values():
+        probability = count / total
+        entropy -= probability * math.log2(probability)
+
+    return entropy
+
+
+def _has_high_entropy(normalized_name: str) -> bool:
+    """Filter out very short or low-entropy names that are unreliable for fuzzy matching."""
+    token_count = len(normalized_name.split())
+    if len(normalized_name) < _MIN_NAME_LENGTH and token_count < _MIN_TOKEN_COUNT:
+        return False
+
+    return _name_entropy(normalized_name) >= _NAME_ENTROPY_THRESHOLD
+
+
+def _shingles(normalized_name: str) -> set[str]:
+    """Create 3-gram shingles from the normalized name for MinHash calculations."""
+    cleaned = normalized_name.replace(' ', '')
+    if len(cleaned) < 2:
+        return {cleaned} if cleaned else set()
+
+    return {cleaned[i : i + 3] for i in range(len(cleaned) - 2)}
+
+
+def _hash_shingle(shingle: str, seed: int) -> int:
+    """Generate a deterministic 64-bit hash for a shingle given the permutation seed."""
+    digest = blake2b(f'{seed}:{shingle}'.encode(), digest_size=8)
+    return int.from_bytes(digest.digest(), 'big')
+
+
+def _minhash_signature(shingles: Iterable[str]) -> tuple[int, ...]:
+    """Compute the MinHash signature for the shingle set across predefined permutations."""
+    if not shingles:
+        return tuple()
+
+    seeds = range(_MINHASH_PERMUTATIONS)
+    signature: list[int] = []
+    for seed in seeds:
+        min_hash = min(_hash_shingle(shingle, seed) for shingle in shingles)
+        signature.append(min_hash)
+
+    return tuple(signature)
+
+
+def _lsh_bands(signature: Iterable[int]) -> list[tuple[int, ...]]:
+    """Split the MinHash signature into fixed-size bands for locality-sensitive hashing."""
+    signature_list = list(signature)
+    if not signature_list:
+        return []
+
+    bands: list[tuple[int, ...]] = []
+    for start in range(0, len(signature_list), _MINHASH_BAND_SIZE):
+        band = tuple(signature_list[start : start + _MINHASH_BAND_SIZE])
+        if len(band) == _MINHASH_BAND_SIZE:
+            bands.append(band)
+    return bands
+
+
+def _jaccard_similarity(a: set[str], b: set[str]) -> float:
+    """Return the Jaccard similarity between two shingle sets, handling empty edge cases."""
+    if not a and not b:
+        return 1.0
+    if not a or not b:
+        return 0.0
+
+    intersection = len(a.intersection(b))
+    union = len(a.union(b))
+    return intersection / union if union else 0.0
+
+
+@lru_cache(maxsize=512)
+def _cached_shingles(name: str) -> set[str]:
+    """Cache shingle sets per normalized name to avoid recomputation within a worker."""
+    return _shingles(name)
+
+
+@dataclass
+class DedupCandidateIndexes:
+    """Precomputed lookup structures that drive entity deduplication heuristics."""
+
+    existing_nodes: list[EntityNode]
+    nodes_by_uuid: dict[str, EntityNode]
+    normalized_existing: defaultdict[str, list[EntityNode]]
+    shingles_by_candidate: dict[str, set[str]]
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
+
+
+@dataclass
+class DedupResolutionState:
+    """Mutable resolution bookkeeping shared across deterministic and LLM passes."""
+
+    resolved_nodes: list[EntityNode | None]
+    uuid_map: dict[str, str]
+    unresolved_indices: list[int]
+    duplicate_pairs: list[tuple[EntityNode, EntityNode]] = field(default_factory=list)
+
+
+def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
+    """Precompute exact and fuzzy lookup structures once per dedupe run."""
+    normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
+    nodes_by_uuid: dict[str, EntityNode] = {}
+    shingles_by_candidate: dict[str, set[str]] = {}
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
+
+    for candidate in existing_nodes:
+        normalized = _normalize_string_exact(candidate.name)
+        normalized_existing[normalized].append(candidate)
+        nodes_by_uuid[candidate.uuid] = candidate
+
+        shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
+        shingles_by_candidate[candidate.uuid] = shingles
+
+        signature = _minhash_signature(shingles)
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            lsh_buckets[(band_index, band)].append(candidate.uuid)
+
+    return DedupCandidateIndexes(
+        existing_nodes=existing_nodes,
+        nodes_by_uuid=nodes_by_uuid,
+        normalized_existing=normalized_existing,
+        shingles_by_candidate=shingles_by_candidate,
+        lsh_buckets=lsh_buckets,
+    )
+
+
+def _resolve_with_similarity(
+    extracted_nodes: list[EntityNode],
+    indexes: DedupCandidateIndexes,
+    state: DedupResolutionState,
+) -> None:
+    """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
+    for idx, node in enumerate(extracted_nodes):
+        normalized_exact = _normalize_string_exact(node.name)
+        normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
+
+        if not _has_high_entropy(normalized_fuzzy):
+            state.unresolved_indices.append(idx)
+            continue
+
+        existing_matches = indexes.normalized_existing.get(normalized_exact, [])
+        if len(existing_matches) == 1:
+            match = existing_matches[0]
+            state.resolved_nodes[idx] = match
+            state.uuid_map[node.uuid] = match.uuid
+            if match.uuid != node.uuid:
+                state.duplicate_pairs.append((node, match))
+            continue
+        if len(existing_matches) > 1:
+            state.unresolved_indices.append(idx)
+            continue
+
+        shingles = _cached_shingles(normalized_fuzzy)
+        signature = _minhash_signature(shingles)
+        candidate_ids: set[str] = set()
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            candidate_ids.update(indexes.lsh_buckets.get((band_index, band), []))
+
+        best_candidate: EntityNode | None = None
+        best_score = 0.0
+        for candidate_id in candidate_ids:
+            candidate_shingles = indexes.shingles_by_candidate.get(candidate_id, set())
+            score = _jaccard_similarity(shingles, candidate_shingles)
+            if score > best_score:
+                best_score = score
+                best_candidate = indexes.nodes_by_uuid.get(candidate_id)
+
+        if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
+            state.resolved_nodes[idx] = best_candidate
+            state.uuid_map[node.uuid] = best_candidate.uuid
+            if best_candidate.uuid != node.uuid:
+                state.duplicate_pairs.append((node, best_candidate))
+            continue
+
+        state.unresolved_indices.append(idx)
+
+
+__all__ = [
+    'DedupCandidateIndexes',
+    'DedupResolutionState',
+    '_normalize_string_exact',
+    '_normalize_name_for_fuzzy',
+    '_has_high_entropy',
+    '_minhash_signature',
+    '_lsh_bands',
+    '_jaccard_similarity',
+    '_cached_shingles',
+    '_FUZZY_JACCARD_THRESHOLD',
+    '_build_candidate_indexes',
+    '_resolve_with_similarity',
+]
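Because every helper above is exported via `__all__`, the deterministic layer can be exercised in isolation. A small demonstration of the shingle/MinHash pipeline (the names compared are illustrative):

```python
from graphiti_core.utils.maintenance.dedup_helpers import (
    _FUZZY_JACCARD_THRESHOLD,
    _cached_shingles,
    _jaccard_similarity,
    _lsh_bands,
    _minhash_signature,
    _normalize_name_for_fuzzy,
)

# Two near-identical names: normalize, shingle into 3-grams, and compare.
a = _cached_shingles(_normalize_name_for_fuzzy('Acme Corporation'))
b = _cached_shingles(_normalize_name_for_fuzzy('ACME Corporation.'))
print(_jaccard_similarity(a, b))  # 1.0 here: case and punctuation are stripped

# MinHash + banding approximates the same comparison without pairwise scans:
# names sharing any band land in the same LSH bucket and become candidates.
sig = _minhash_signature(a)
print(len(sig), len(_lsh_bands(sig)))  # 32 permutations -> 8 bands of 4
print(_FUZZY_JACCARD_THRESHOLD)  # 0.9: the score a candidate must reach
```

Candidates that clear `_FUZZY_JACCARD_THRESHOLD` are resolved deterministically by `_resolve_with_similarity`; everything left in `DedupResolutionState.unresolved_indices` falls through to the LLM-based dedupe pass.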