graphiti-core 0.21.0rc6__py3-none-any.whl → 0.30.0rc0__py3-none-any.whl
This diff shows the content changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
Potentially problematic release: this version of graphiti-core might be problematic.
- graphiti_core/utils/maintenance/dedup_helpers.py +257 -0
- graphiti_core/utils/maintenance/edge_operations.py +14 -0
- graphiti_core/utils/maintenance/node_operations.py +139 -60
- {graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.30.0rc0.dist-info}/METADATA +1 -1
- {graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.30.0rc0.dist-info}/RECORD +7 -6
- {graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.30.0rc0.dist-info}/WHEEL +0 -0
- {graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.30.0rc0.dist-info}/licenses/LICENSE +0 -0

graphiti_core/utils/maintenance/dedup_helpers.py

@@ -0,0 +1,257 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import annotations
+
+import math
+import re
+from collections import defaultdict
+from collections.abc import Iterable
+from dataclasses import dataclass
+from functools import lru_cache
+from hashlib import blake2b
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from graphiti_core.nodes import EntityNode
+
+_NAME_ENTROPY_THRESHOLD = 1.5
+_MIN_NAME_LENGTH = 6
+_MIN_TOKEN_COUNT = 2
+_FUZZY_JACCARD_THRESHOLD = 0.9
+_MINHASH_PERMUTATIONS = 32
+_MINHASH_BAND_SIZE = 4
+
+
+def _normalize_string_exact(name: str) -> str:
+    """Lowercase text and collapse whitespace so equal names map to the same key."""
+    normalized = re.sub(r'[\s]+', ' ', name.lower())
+    return normalized.strip()
+
+
+def _normalize_name_for_fuzzy(name: str) -> str:
+    """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
+    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
+    normalized = normalized.strip()
+    return re.sub(r'[\s]+', ' ', normalized)
+
+
+def _name_entropy(normalized_name: str) -> float:
+    """Approximate text specificity using Shannon entropy over characters.
+
+    We strip spaces, count how often each character appears, and sum
+    probability * -log2(probability). Short or repetitive names yield low
+    entropy, which signals we should defer resolution to the LLM instead of
+    trusting fuzzy similarity.
+    """
+    if not normalized_name:
+        return 0.0
+
+    counts: dict[str, int] = {}
+    for char in normalized_name.replace(' ', ''):
+        counts[char] = counts.get(char, 0) + 1
+
+    total = sum(counts.values())
+    if total == 0:
+        return 0.0
+
+    entropy = 0.0
+    for count in counts.values():
+        probability = count / total
+        entropy -= probability * math.log2(probability)
+
+    return entropy
+
+
+def _has_high_entropy(normalized_name: str) -> bool:
+    """Filter out very short or low-entropy names that are unreliable for fuzzy matching."""
+    token_count = len(normalized_name.split())
+    if len(normalized_name) < _MIN_NAME_LENGTH and token_count < _MIN_TOKEN_COUNT:
+        return False
+
+    return _name_entropy(normalized_name) >= _NAME_ENTROPY_THRESHOLD
+
+
+def _shingles(normalized_name: str) -> set[str]:
+    """Create 3-gram shingles from the normalized name for MinHash calculations."""
+    cleaned = normalized_name.replace(' ', '')
+    if len(cleaned) < 2:
+        return {cleaned} if cleaned else set()
+
+    return {cleaned[i : i + 3] for i in range(len(cleaned) - 2)}
+
+
+def _hash_shingle(shingle: str, seed: int) -> int:
+    """Generate a deterministic 64-bit hash for a shingle given the permutation seed."""
+    digest = blake2b(f'{seed}:{shingle}'.encode(), digest_size=8)
+    return int.from_bytes(digest.digest(), 'big')
+
+
+def _minhash_signature(shingles: Iterable[str]) -> tuple[int, ...]:
+    """Compute the MinHash signature for the shingle set across predefined permutations."""
+    if not shingles:
+        return tuple()
+
+    seeds = range(_MINHASH_PERMUTATIONS)
+    signature: list[int] = []
+    for seed in seeds:
+        min_hash = min(_hash_shingle(shingle, seed) for shingle in shingles)
+        signature.append(min_hash)
+
+    return tuple(signature)
+
+
+def _lsh_bands(signature: Iterable[int]) -> list[tuple[int, ...]]:
+    """Split the MinHash signature into fixed-size bands for locality-sensitive hashing."""
+    signature_list = list(signature)
+    if not signature_list:
+        return []
+
+    bands: list[tuple[int, ...]] = []
+    for start in range(0, len(signature_list), _MINHASH_BAND_SIZE):
+        band = tuple(signature_list[start : start + _MINHASH_BAND_SIZE])
+        if len(band) == _MINHASH_BAND_SIZE:
+            bands.append(band)
+    return bands
+
+
+def _jaccard_similarity(a: set[str], b: set[str]) -> float:
+    """Return the Jaccard similarity between two shingle sets, handling empty edge cases."""
+    if not a and not b:
+        return 1.0
+    if not a or not b:
+        return 0.0
+
+    intersection = len(a.intersection(b))
+    union = len(a.union(b))
+    return intersection / union if union else 0.0
+
+
+@lru_cache(maxsize=512)
+def _cached_shingles(name: str) -> set[str]:
+    """Cache shingle sets per normalized name to avoid recomputation within a worker."""
+    return _shingles(name)
+
+
+@dataclass
+class DedupCandidateIndexes:
+    """Precomputed lookup structures that drive entity deduplication heuristics."""
+
+    existing_nodes: list[EntityNode]
+    nodes_by_uuid: dict[str, EntityNode]
+    normalized_existing: defaultdict[str, list[EntityNode]]
+    shingles_by_candidate: dict[str, set[str]]
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
+
+
+@dataclass
+class DedupResolutionState:
+    """Mutable resolution bookkeeping shared across deterministic and LLM passes."""
+
+    resolved_nodes: list[EntityNode | None]
+    uuid_map: dict[str, str]
+    unresolved_indices: list[int]
+
+
+def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
+    """Precompute exact and fuzzy lookup structures once per dedupe run."""
+    normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
+    nodes_by_uuid: dict[str, EntityNode] = {}
+    shingles_by_candidate: dict[str, set[str]] = {}
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
+
+    for candidate in existing_nodes:
+        normalized = _normalize_string_exact(candidate.name)
+        normalized_existing[normalized].append(candidate)
+        nodes_by_uuid[candidate.uuid] = candidate
+
+        shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
+        shingles_by_candidate[candidate.uuid] = shingles
+
+        signature = _minhash_signature(shingles)
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            lsh_buckets[(band_index, band)].append(candidate.uuid)
+
+    return DedupCandidateIndexes(
+        existing_nodes=existing_nodes,
+        nodes_by_uuid=nodes_by_uuid,
+        normalized_existing=normalized_existing,
+        shingles_by_candidate=shingles_by_candidate,
+        lsh_buckets=lsh_buckets,
+    )
+
+
+def _resolve_with_similarity(
+    extracted_nodes: list[EntityNode],
+    indexes: DedupCandidateIndexes,
+    state: DedupResolutionState,
+) -> None:
+    """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
+    for idx, node in enumerate(extracted_nodes):
+        normalized_exact = _normalize_string_exact(node.name)
+        normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
+
+        if not _has_high_entropy(normalized_fuzzy):
+            state.unresolved_indices.append(idx)
+            continue
+
+        existing_matches = indexes.normalized_existing.get(normalized_exact, [])
+        if len(existing_matches) == 1:
+            match = existing_matches[0]
+            state.resolved_nodes[idx] = match
+            state.uuid_map[node.uuid] = match.uuid
+            continue
+        if len(existing_matches) > 1:
+            state.unresolved_indices.append(idx)
+            continue
+
+        shingles = _cached_shingles(normalized_fuzzy)
+        signature = _minhash_signature(shingles)
+        candidate_ids: set[str] = set()
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            candidate_ids.update(indexes.lsh_buckets.get((band_index, band), []))
+
+        best_candidate: EntityNode | None = None
+        best_score = 0.0
+        for candidate_id in candidate_ids:
+            candidate_shingles = indexes.shingles_by_candidate.get(candidate_id, set())
+            score = _jaccard_similarity(shingles, candidate_shingles)
+            if score > best_score:
+                best_score = score
+                best_candidate = indexes.nodes_by_uuid.get(candidate_id)
+
+        if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
+            state.resolved_nodes[idx] = best_candidate
+            state.uuid_map[node.uuid] = best_candidate.uuid
+            continue
+
+        state.unresolved_indices.append(idx)
+
+
+__all__ = [
+    'DedupCandidateIndexes',
+    'DedupResolutionState',
+    '_normalize_string_exact',
+    '_normalize_name_for_fuzzy',
+    '_has_high_entropy',
+    '_minhash_signature',
+    '_lsh_bands',
+    '_jaccard_similarity',
+    '_cached_shingles',
+    '_FUZZY_JACCARD_THRESHOLD',
+    '_build_candidate_indexes',
+    '_resolve_with_similarity',
+]
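
For a sense of how the new helpers behave, here is a rough, self-contained sketch of the same MinHash/LSH banding scheme on plain strings, reusing the module's constants (32 permutations, band size 4, Jaccard threshold 0.9). The function names and demo data below are illustrative stand-ins, not package code:

# Standalone sketch of the MinHash/LSH banding used by dedup_helpers (illustrative only).
from hashlib import blake2b

PERMUTATIONS = 32  # mirrors _MINHASH_PERMUTATIONS
BAND_SIZE = 4      # mirrors _MINHASH_BAND_SIZE: 32 hashes -> 8 bands
THRESHOLD = 0.9    # mirrors _FUZZY_JACCARD_THRESHOLD


def shingles(name: str) -> set[str]:
    # 3-gram shingles over the space-stripped, lowercased name.
    cleaned = name.lower().replace(' ', '')
    return {cleaned[i:i + 3] for i in range(len(cleaned) - 2)}


def hash_shingle(shingle: str, seed: int) -> int:
    # Deterministic 64-bit hash per (seed, shingle) pair, as in _hash_shingle.
    return int.from_bytes(blake2b(f'{seed}:{shingle}'.encode(), digest_size=8).digest(), 'big')


def signature(shingle_set: set[str]) -> tuple[int, ...]:
    # One minimum hash per simulated permutation.
    return tuple(min(hash_shingle(s, seed) for s in shingle_set) for seed in range(PERMUTATIONS))


def bands(sig: tuple[int, ...]) -> list[tuple[int, ...]]:
    # Fixed-size bands; two names sharing any band land in the same LSH bucket.
    return [sig[i:i + BAND_SIZE] for i in range(0, len(sig), BAND_SIZE)]


a, b = shingles('Acme Corporation'), shingles('ACME corporation')
shared_bands = sum(1 for x, y in zip(bands(signature(a)), bands(signature(b))) if x == y)
jaccard = len(a & b) / len(a | b)
print(shared_bands, jaccard)  # both normalize identically -> 8 shared bands, Jaccard 1.0

# Band math: a pair with true Jaccard similarity s shares a given 4-row band with
# probability s**4, so it shares at least one of the 8 buckets with 1 - (1 - s**4) ** 8.
print(round(1 - (1 - THRESHOLD**4) ** 8, 4))  # ~0.9998 at the 0.9 threshold
print(round(1 - (1 - 0.5**4) ** 8, 4))        # ~0.4033 for a clearly dissimilar pair

The band geometry makes the bucket lookup a cheap prefilter: genuine near-duplicates at the 0.9 threshold almost always collide in some band, and the exact Jaccard comparison then discards the occasional dissimilar pair that slips through.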
graphiti_core/utils/maintenance/edge_operations.py

@@ -41,6 +41,7 @@ from graphiti_core.search.search_config import SearchResults
 from graphiti_core.search.search_config_recipes import EDGE_HYBRID_SEARCH_RRF
 from graphiti_core.search.search_filters import SearchFilters
 from graphiti_core.utils.datetime_utils import ensure_utc, utc_now
+from graphiti_core.utils.maintenance.dedup_helpers import _normalize_string_exact
 
 logger = logging.getLogger(__name__)
 
@@ -397,6 +398,19 @@ async def resolve_extracted_edge(
     if len(related_edges) == 0 and len(existing_edges) == 0:
         return extracted_edge, [], []
 
+    # Fast path: if the fact text and endpoints already exist verbatim, reuse the matching edge.
+    normalized_fact = _normalize_string_exact(extracted_edge.fact)
+    for edge in related_edges:
+        if (
+            edge.source_node_uuid == extracted_edge.source_node_uuid
+            and edge.target_node_uuid == extracted_edge.target_node_uuid
+            and _normalize_string_exact(edge.fact) == normalized_fact
+        ):
+            resolved = edge
+            if episode is not None and episode.uuid not in resolved.episodes:
+                resolved.episodes.append(episode.uuid)
+            return resolved, [], []
+
     start = time()
 
     # Prepare context for LLM
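
The fast path above short-circuits the LLM call when an extracted edge duplicates a stored one verbatim. A minimal sketch of the same check, using hypothetical stand-in types rather than graphiti's edge and episode classes:

# Sketch of the verbatim-duplicate fast path (stand-in types, not graphiti's classes).
import re
from dataclasses import dataclass, field


def normalize_exact(text: str) -> str:
    # Mirrors _normalize_string_exact: lowercase, collapse whitespace, strip.
    return re.sub(r'[\s]+', ' ', text.lower()).strip()


@dataclass
class FakeEdge:
    source_node_uuid: str
    target_node_uuid: str
    fact: str
    episodes: list[str] = field(default_factory=list)


def fast_path(extracted: FakeEdge, related: list[FakeEdge], episode_uuid: str | None) -> FakeEdge | None:
    normalized_fact = normalize_exact(extracted.fact)
    for edge in related:
        if (
            edge.source_node_uuid == extracted.source_node_uuid
            and edge.target_node_uuid == extracted.target_node_uuid
            and normalize_exact(edge.fact) == normalized_fact
        ):
            # Verbatim duplicate: reuse the stored edge and tag the new episode onto it.
            if episode_uuid is not None and episode_uuid not in edge.episodes:
                edge.episodes.append(episode_uuid)
            return edge
    return None


stored = FakeEdge('u1', 'u2', 'Alice  works at ACME', ['ep1'])
new = FakeEdge('u1', 'u2', 'alice works at acme')
assert fast_path(new, [stored], 'ep2') is stored and stored.episodes == ['ep1', 'ep2']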
graphiti_core/utils/maintenance/node_operations.py

@@ -24,7 +24,12 @@ from graphiti_core.graphiti_types import GraphitiClients
 from graphiti_core.helpers import MAX_REFLEXION_ITERATIONS, semaphore_gather
 from graphiti_core.llm_client import LLMClient
 from graphiti_core.llm_client.config import ModelSize
-from graphiti_core.nodes import
+from graphiti_core.nodes import (
+    EntityNode,
+    EpisodeType,
+    EpisodicNode,
+    create_entity_node_embeddings,
+)
 from graphiti_core.prompts import prompt_library
 from graphiti_core.prompts.dedupe_nodes import NodeDuplicate, NodeResolutions
 from graphiti_core.prompts.extract_nodes import (
@@ -38,7 +43,15 @@ from graphiti_core.search.search_config import SearchResults
 from graphiti_core.search.search_config_recipes import NODE_HYBRID_SEARCH_RRF
 from graphiti_core.search.search_filters import SearchFilters
 from graphiti_core.utils.datetime_utils import utc_now
-from graphiti_core.utils.maintenance.
+from graphiti_core.utils.maintenance.dedup_helpers import (
+    DedupCandidateIndexes,
+    DedupResolutionState,
+    _build_candidate_indexes,
+    _resolve_with_similarity,
+)
+from graphiti_core.utils.maintenance.edge_operations import (
+    filter_existing_duplicate_of_edges,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -119,11 +132,13 @@ async def extract_nodes(
         )
     elif episode.source == EpisodeType.text:
         llm_response = await llm_client.generate_response(
-            prompt_library.extract_nodes.extract_text(context),
+            prompt_library.extract_nodes.extract_text(context),
+            response_model=ExtractedEntities,
         )
     elif episode.source == EpisodeType.json:
         llm_response = await llm_client.generate_response(
-            prompt_library.extract_nodes.extract_json(context),
+            prompt_library.extract_nodes.extract_json(context),
+            response_model=ExtractedEntities,
         )
 
     response_object = ExtractedEntities(**llm_response)
@@ -181,17 +196,12 @@ async def extract_nodes(
     return extracted_nodes
 
 
-async def resolve_extracted_nodes(
+async def _collect_candidate_nodes(
     clients: GraphitiClients,
     extracted_nodes: list[EntityNode],
-
-
-
-    existing_nodes_override: list[EntityNode] | None = None,
-) -> tuple[list[EntityNode], dict[str, str], list[tuple[EntityNode, EntityNode]]]:
-    llm_client = clients.llm_client
-    driver = clients.driver
-
+    existing_nodes_override: list[EntityNode] | None,
+) -> list[EntityNode]:
+    """Search per extracted name and return unique candidates with overrides honored in order."""
     search_results: list[SearchResults] = await semaphore_gather(
         *[
             search(
@@ -205,33 +215,40 @@ async def resolve_extracted_nodes(
         ]
     )
 
-    candidate_nodes: list[EntityNode] =
-        [node for result in search_results for node in result.nodes]
-        if existing_nodes_override is None
-        else existing_nodes_override
-    )
+    candidate_nodes: list[EntityNode] = [node for result in search_results for node in result.nodes]
 
-
+    if existing_nodes_override is not None:
+        candidate_nodes.extend(existing_nodes_override)
 
-
+    seen_candidate_uuids: set[str] = set()
+    ordered_candidates: list[EntityNode] = []
+    for candidate in candidate_nodes:
+        if candidate.uuid in seen_candidate_uuids:
+            continue
+        seen_candidate_uuids.add(candidate.uuid)
+        ordered_candidates.append(candidate)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    return ordered_candidates
+
+
+async def _resolve_with_llm(
+    llm_client: LLMClient,
+    extracted_nodes: list[EntityNode],
+    indexes: DedupCandidateIndexes,
+    state: DedupResolutionState,
+    ensure_ascii: bool,
+    episode: EpisodicNode | None,
+    previous_episodes: list[EpisodicNode] | None,
+    entity_types: dict[str, type[BaseModel]] | None,
+) -> None:
+    """Escalate unresolved nodes to the dedupe prompt so the LLM can select or reject duplicates."""
+    if not state.unresolved_indices:
+        return
 
     entity_types_dict: dict[str, type[BaseModel]] = entity_types if entity_types is not None else {}
 
-
+    llm_extracted_nodes = [extracted_nodes[i] for i in state.unresolved_indices]
+
     extracted_nodes_context = [
         {
             'id': i,
@@ -242,17 +259,29 @@ async def resolve_extracted_nodes(
             ).__doc__
             or 'Default Entity Type',
         }
-        for i, node in enumerate(
+        for i, node in enumerate(llm_extracted_nodes)
+    ]
+
+    existing_nodes_context = [
+        {
+            **{
+                'idx': i,
+                'name': candidate.name,
+                'entity_types': candidate.labels,
+            },
+            **candidate.attributes,
+        }
+        for i, candidate in enumerate(indexes.existing_nodes)
     ]
 
     context = {
         'extracted_nodes': extracted_nodes_context,
         'existing_nodes': existing_nodes_context,
         'episode_content': episode.content if episode is not None else '',
-        'previous_episodes':
-
-
-        'ensure_ascii':
+        'previous_episodes': (
+            [ep.content for ep in previous_episodes] if previous_episodes is not None else []
+        ),
+        'ensure_ascii': ensure_ascii,
     }
 
     llm_response = await llm_client.generate_response(
@@ -262,33 +291,81 @@ async def resolve_extracted_nodes(
 
     node_resolutions: list[NodeDuplicate] = NodeResolutions(**llm_response).entity_resolutions
 
-    resolved_nodes: list[EntityNode] = []
-    uuid_map: dict[str, str] = {}
-    node_duplicates: list[tuple[EntityNode, EntityNode]] = []
     for resolution in node_resolutions:
-
+        relative_id: int = resolution.id
         duplicate_idx: int = resolution.duplicate_idx
 
-
+        original_index = state.unresolved_indices[relative_id]
+        extracted_node = extracted_nodes[original_index]
 
         resolved_node = (
-            existing_nodes[duplicate_idx]
-            if 0 <= duplicate_idx < len(existing_nodes)
+            indexes.existing_nodes[duplicate_idx]
+            if 0 <= duplicate_idx < len(indexes.existing_nodes)
            else extracted_node
        )
 
-
+        state.resolved_nodes[original_index] = resolved_node
+        state.uuid_map[extracted_node.uuid] = resolved_node.uuid
 
-        resolved_nodes.append(resolved_node)
-        uuid_map[extracted_node.uuid] = resolved_node.uuid
 
-
+async def resolve_extracted_nodes(
+    clients: GraphitiClients,
+    extracted_nodes: list[EntityNode],
+    episode: EpisodicNode | None = None,
+    previous_episodes: list[EpisodicNode] | None = None,
+    entity_types: dict[str, type[BaseModel]] | None = None,
+    existing_nodes_override: list[EntityNode] | None = None,
+) -> tuple[list[EntityNode], dict[str, str], list[tuple[EntityNode, EntityNode]]]:
+    """Search for existing nodes, resolve deterministic matches, then escalate holdouts to the LLM dedupe prompt."""
+    llm_client = clients.llm_client
+    driver = clients.driver
+    existing_nodes = await _collect_candidate_nodes(
+        clients,
+        extracted_nodes,
+        existing_nodes_override,
+    )
+
+    indexes: DedupCandidateIndexes = _build_candidate_indexes(existing_nodes)
+
+    state = DedupResolutionState(
+        resolved_nodes=[None] * len(extracted_nodes),
+        uuid_map={},
+        unresolved_indices=[],
+    )
+    node_duplicates: list[tuple[EntityNode, EntityNode]] = []
+
+    _resolve_with_similarity(extracted_nodes, indexes, state)
+
+    await _resolve_with_llm(
+        llm_client,
+        extracted_nodes,
+        indexes,
+        state,
+        clients.ensure_ascii,
+        episode,
+        previous_episodes,
+        entity_types,
+    )
+
+    for idx, node in enumerate(extracted_nodes):
+        if state.resolved_nodes[idx] is None:
+            state.resolved_nodes[idx] = node
+            state.uuid_map[node.uuid] = node.uuid
+
+    logger.debug(
+        'Resolved nodes: %s',
+        [(node.name, node.uuid) for node in state.resolved_nodes if node is not None],
+    )
 
     new_node_duplicates: list[
         tuple[EntityNode, EntityNode]
     ] = await filter_existing_duplicate_of_edges(driver, node_duplicates)
 
-    return
+    return (
+        [node for node in state.resolved_nodes if node is not None],
+        state.uuid_map,
+        new_node_duplicates,
+    )
 
 
 async def extract_attributes_from_nodes(
@@ -307,9 +384,11 @@ async def extract_attributes_from_nodes(
                 node,
                 episode,
                 previous_episodes,
-
-
-
+                (
+                    entity_types.get(next((item for item in node.labels if item != 'Entity'), ''))
+                    if entity_types is not None
+                    else None
+                ),
                 clients.ensure_ascii,
             )
             for node in nodes
@@ -339,18 +418,18 @@ async def extract_attributes_from_node(
     attributes_context: dict[str, Any] = {
         'node': node_context,
         'episode_content': episode.content if episode is not None else '',
-        'previous_episodes':
-
-
+        'previous_episodes': (
+            [ep.content for ep in previous_episodes] if previous_episodes is not None else []
+        ),
         'ensure_ascii': ensure_ascii,
     }
 
     summary_context: dict[str, Any] = {
         'node': node_context,
         'episode_content': episode.content if episode is not None else '',
-        'previous_episodes':
-
-
+        'previous_episodes': (
+            [ep.content for ep in previous_episodes] if previous_episodes is not None else []
+        ),
         'ensure_ascii': ensure_ascii,
     }
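
The refactor above splits node resolution into a deterministic similarity pass and an LLM escalation that only sees the holdouts. One subtle point worth illustrating: the LLM answers in terms of positions within the unresolved subset, so state.unresolved_indices is what maps each resolution.id back to the original extraction index. A rough walk-through with plain strings standing in for EntityNode and the LLM call (hypothetical data, not package code):

# Toy walk-through of the DedupResolutionState bookkeeping (strings stand in for EntityNode).
extracted = ['Alice Johnson', 'Bob', 'ACME Corporation']
existing = ['alice johnson', 'acme corporation']

resolved: list[str | None] = [None] * len(extracted)
unresolved_indices: list[int] = []

# Pass 1 (deterministic): exact-normalized hits resolve immediately; 'Bob' is too
# short/low-entropy for fuzzy matching, so it is deferred to the LLM.
for idx, name in enumerate(extracted):
    matches = [e for e in existing if e == name.lower()]
    if len(matches) == 1:
        resolved[idx] = matches[0]
    else:
        unresolved_indices.append(idx)

# Pass 2 (LLM escalation): the prompt only lists the holdouts, so a resolution id
# of 0 means "the first unresolved node", not "extracted[0]".
llm_resolutions = [(0, -1)]  # (relative id, duplicate_idx); -1 = no duplicate found
for relative_id, duplicate_idx in llm_resolutions:
    original_index = unresolved_indices[relative_id]
    resolved[original_index] = (
        existing[duplicate_idx] if 0 <= duplicate_idx < len(existing) else extracted[original_index]
    )

assert resolved == ['alice johnson', 'Bob', 'acme corporation']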
{graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.30.0rc0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: graphiti-core
-Version: 0.21.0rc6
+Version: 0.30.0rc0
 Summary: A temporal graph building library
 Project-URL: Homepage, https://help.getzep.com/graphiti/graphiti/overview
 Project-URL: Repository, https://github.com/getzep/graphiti
{graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.30.0rc0.dist-info}/RECORD

@@ -68,13 +68,14 @@ graphiti_core/utils/bulk_utils.py,sha256=9XWXqjxiu2ydKMLKQRTbvzO6cO1o1HRjjpmaf5Y
 graphiti_core/utils/datetime_utils.py,sha256=J-zYSq7-H-2n9hYOXNIun12kM10vNX9mMATGR_egTmY,1806
 graphiti_core/utils/maintenance/__init__.py,sha256=vW4H1KyapTl-OOz578uZABYcpND4wPx3Vt6aAPaXh78,301
 graphiti_core/utils/maintenance/community_operations.py,sha256=XMiokEemn96GlvjkOvbo9hIX04Fea3eVj408NHG5P4o,11042
-graphiti_core/utils/maintenance/
+graphiti_core/utils/maintenance/dedup_helpers.py,sha256=vlH_vr0OSdlAMRN2MqxXJO0ktZm1tV3Ua-piNEXlusM,8908
+graphiti_core/utils/maintenance/edge_operations.py,sha256=fvWKJWzz4_d2Y8bOfZFjJpLnGmsFwnrutFW25LX-S08,21287
 graphiti_core/utils/maintenance/graph_data_operations.py,sha256=42icj3S_ELAJ-NK3jVS_rg_243dmnaZOyUitJj_uJ-M,6085
-graphiti_core/utils/maintenance/node_operations.py,sha256=
+graphiti_core/utils/maintenance/node_operations.py,sha256=uETM0536c2BAwr7pSdpiUHER6Nt7piWcKKK8O8AoBz4,15716
 graphiti_core/utils/maintenance/temporal_operations.py,sha256=IIaVtShpVkOYe6haxz3a1x3v54-MzaEXG8VsxFUNeoY,3582
 graphiti_core/utils/maintenance/utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 graphiti_core/utils/ontology_utils/entity_types_utils.py,sha256=4eVgxLWY6Q8k9cRJ5pW59IYF--U4nXZsZIGOVb_yHfQ,1285
-graphiti_core-0.
-graphiti_core-0.
-graphiti_core-0.
-graphiti_core-0.
+graphiti_core-0.30.0rc0.dist-info/METADATA,sha256=y7E8yac8any2aLVv4QUJj6fKrEt3i0-Madq6tMUMpEg,26933
+graphiti_core-0.30.0rc0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+graphiti_core-0.30.0rc0.dist-info/licenses/LICENSE,sha256=KCUwCyDXuVEgmDWkozHyniRyWjnWUWjkuDHfU6o3JlA,11325
+graphiti_core-0.30.0rc0.dist-info/RECORD,,
{graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.30.0rc0.dist-info}/WHEEL: file without changes

{graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.30.0rc0.dist-info}/licenses/LICENSE: file without changes