graphiti-core 0.21.0rc6__py3-none-any.whl → 0.21.0rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of graphiti-core might be problematic.
- graphiti_core/graphiti.py +1 -0
- graphiti_core/llm_client/client.py +14 -4
- graphiti_core/llm_client/gemini_client.py +2 -2
- graphiti_core/llm_client/openai_base_client.py +2 -2
- graphiti_core/llm_client/openai_generic_client.py +2 -2
- graphiti_core/prompts/dedupe_nodes.py +42 -26
- graphiti_core/prompts/extract_nodes.py +2 -1
- graphiti_core/utils/bulk_utils.py +127 -60
- graphiti_core/utils/maintenance/dedup_helpers.py +262 -0
- graphiti_core/utils/maintenance/edge_operations.py +89 -6
- graphiti_core/utils/maintenance/node_operations.py +171 -64
- {graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.21.0rc7.dist-info}/METADATA +4 -1
- {graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.21.0rc7.dist-info}/RECORD +15 -14
- {graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.21.0rc7.dist-info}/WHEEL +0 -0
- {graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.21.0rc7.dist-info}/licenses/LICENSE +0 -0
graphiti_core/graphiti.py CHANGED

graphiti_core/llm_client/client.py CHANGED

@@ -32,9 +32,19 @@ from .errors import RateLimitError
 DEFAULT_TEMPERATURE = 0
 DEFAULT_CACHE_DIR = './llm_cache'
 
-
-
-
+
+def get_extraction_language_instruction() -> str:
+    """Returns instruction for language extraction behavior.
+
+    Override this function to customize language extraction:
+    - Return empty string to disable multilingual instructions
+    - Return custom instructions for specific language requirements
+
+    Returns:
+        str: Language instruction to append to system messages
+    """
+    return '\n\nAny extracted information should be returned in the same language as it was written in.'
+
 
 logger = logging.getLogger(__name__)
 
@@ -145,7 +155,7 @@ class LLMClient(ABC):
         )
 
         # Add multilingual extraction instructions
-        messages[0].content +=
+        messages[0].content += get_extraction_language_instruction()
 
         if self.cache_enabled and self.cache_dir is not None:
             cache_key = self._get_cache_key(messages)
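Note (illustrative, not part of the diff): `get_extraction_language_instruction` is a plain module-level function, so an application can replace it to change or disable the multilingual instruction. A minimal sketch under the assumption that monkeypatching is acceptable; because the Gemini/OpenAI clients import the function by name, the patch has to target the module whose client you actually instantiate:

    # Hypothetical override that disables the multilingual instruction entirely.
    import graphiti_core.llm_client.client as llm_client_module
    import graphiti_core.llm_client.openai_base_client as openai_base_module

    def no_language_instruction() -> str:
        # Returning an empty string appends nothing to the system message.
        return ''

    # Patch both the base client module and the module your client imports it into.
    llm_client_module.get_extraction_language_instruction = no_language_instruction
    openai_base_module.get_extraction_language_instruction = no_language_instruction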
graphiti_core/llm_client/gemini_client.py CHANGED

@@ -23,7 +23,7 @@ from typing import TYPE_CHECKING, ClassVar
 from pydantic import BaseModel
 
 from ..prompts.models import Message
-from .client import
+from .client import LLMClient, get_extraction_language_instruction
 from .config import LLMConfig, ModelSize
 from .errors import RateLimitError
 
@@ -376,7 +376,7 @@ class GeminiClient(LLMClient):
         last_output = None
 
         # Add multilingual extraction instructions
-        messages[0].content +=
+        messages[0].content += get_extraction_language_instruction()
 
         while retry_count < self.MAX_RETRIES:
             try:
graphiti_core/llm_client/openai_base_client.py CHANGED

@@ -25,7 +25,7 @@ from openai.types.chat import ChatCompletionMessageParam
 from pydantic import BaseModel
 
 from ..prompts.models import Message
-from .client import
+from .client import LLMClient, get_extraction_language_instruction
 from .config import DEFAULT_MAX_TOKENS, LLMConfig, ModelSize
 from .errors import RateLimitError, RefusalError
 
@@ -184,7 +184,7 @@ class BaseOpenAIClient(LLMClient):
         last_error = None
 
         # Add multilingual extraction instructions
-        messages[0].content +=
+        messages[0].content += get_extraction_language_instruction()
 
         while retry_count <= self.MAX_RETRIES:
             try:
graphiti_core/llm_client/openai_generic_client.py CHANGED

@@ -25,7 +25,7 @@ from openai.types.chat import ChatCompletionMessageParam
 from pydantic import BaseModel
 
 from ..prompts.models import Message
-from .client import
+from .client import LLMClient, get_extraction_language_instruction
 from .config import DEFAULT_MAX_TOKENS, LLMConfig, ModelSize
 from .errors import RateLimitError, RefusalError
 
@@ -136,7 +136,7 @@ class OpenAIGenericClient(LLMClient):
         )
 
         # Add multilingual extraction instructions
-        messages[0].content +=
+        messages[0].content += get_extraction_language_instruction()
 
         while retry_count <= self.MAX_RETRIES:
             try:
graphiti_core/prompts/dedupe_nodes.py CHANGED

@@ -92,12 +92,23 @@ def node(context: dict[str, Any]) -> list[Message]:
 
 TASK:
 1. Compare `new_entity` against each item in `existing_entities`.
-2. If it refers to the same real
-3. Let `duplicate_idx` = the
-4. Let `duplicates` = the list of
-
-
-
+2. If it refers to the same real-world object or concept, collect its index.
+3. Let `duplicate_idx` = the smallest collected index, or -1 if none.
+4. Let `duplicates` = the sorted list of all collected indices (empty list if none).
+
+Respond with a JSON object containing an "entity_resolutions" array with a single entry:
+{{
+    "entity_resolutions": [
+        {{
+            "id": integer id from NEW ENTITY,
+            "name": the best full name for the entity,
+            "duplicate_idx": integer index of the best duplicate in EXISTING ENTITIES, or -1 if none,
+            "duplicates": sorted list of all duplicate indices you collected (deduplicate the list, use [] when none)
+        }}
+    ]
+}}
+
+Only reference indices that appear in EXISTING ENTITIES, and return [] / -1 when unsure.
 """,
     ),
 ]

@@ -126,26 +137,26 @@ def nodes(context: dict[str, Any]) -> list[Message]:
 {{
     id: integer id of the entity,
     name: "name of the entity",
-    entity_type: "
-    entity_type_description: "Description of what the entity type represents"
-    duplication_candidates: [
-        {{
-            idx: integer index of the candidate entity,
-            name: "name of the candidate entity",
-            entity_type: "ontological classification of the candidate entity",
-            ...<additional attributes>
-        }}
-    ]
+    entity_type: ["Entity", "<optional additional label>", ...],
+    entity_type_description: "Description of what the entity type represents"
 }}
-
+
 <ENTITIES>
 {to_prompt_json(context['extracted_nodes'], ensure_ascii=context.get('ensure_ascii', True), indent=2)}
 </ENTITIES>
-
+
 <EXISTING ENTITIES>
 {to_prompt_json(context['existing_nodes'], ensure_ascii=context.get('ensure_ascii', True), indent=2)}
 </EXISTING ENTITIES>
 
+Each entry in EXISTING ENTITIES is an object with the following structure:
+{{
+    idx: integer index of the candidate entity (use this when referencing a duplicate),
+    name: "name of the candidate entity",
+    entity_types: ["Entity", "<optional additional label>", ...],
+    ...<additional attributes such as summaries or metadata>
+}}
+
 For each of the above ENTITIES, determine if the entity is a duplicate of any of the EXISTING ENTITIES.
 
 Entities should only be considered duplicates if they refer to the *same real-world object or concept*.

@@ -155,14 +166,19 @@ def nodes(context: dict[str, Any]) -> list[Message]:
 - They have similar names or purposes but refer to separate instances or concepts.
 
 Task:
-
-
-For
-
-
-
-
-
+Respond with a JSON object that contains an "entity_resolutions" array with one entry for each entity in ENTITIES, ordered by the entity id.
+
+For every entity, return an object with the following keys:
+{{
+    "id": integer id from ENTITIES,
+    "name": the best full name for the entity (preserve the original name unless a duplicate has a more complete name),
+    "duplicate_idx": the idx of the EXISTING ENTITY that is the best duplicate match, or -1 if there is no duplicate,
+    "duplicates": a sorted list of all idx values from EXISTING ENTITIES that refer to duplicates (deduplicate the list, use [] when none or unsure)
+}}
+
+- Only use idx values that appear in EXISTING ENTITIES.
+- Set duplicate_idx to the smallest idx you collected for that entity, or -1 if duplicates is empty.
+- Never fabricate entities or indices.
 """,
     ),
 ]
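Note (illustrative, not part of the diff): a response that satisfies the contract above could look like the following, with made-up names and indices; entity 0 matches the existing candidate at idx 3, entity 1 has no duplicate:

    example_response = {
        'entity_resolutions': [
            {'id': 0, 'name': 'Acme Corporation', 'duplicate_idx': 3, 'duplicates': [3]},
            {'id': 1, 'name': 'Jane Doe', 'duplicate_idx': -1, 'duplicates': []},
        ]
    }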
graphiti_core/prompts/extract_nodes.py CHANGED

@@ -152,7 +152,8 @@ Indicate the classified entity type by providing its entity_type_id.
 
 Guidelines:
 1. Always try to extract an entities that the JSON represents. This will often be something like a "name" or "user field
-2.
+2. Extract all entities mentioned in all other properties throughout the JSON structure
+3. Do NOT extract any properties that contain dates
 """
     return [
         Message(role='system', content=sys_prompt),
graphiti_core/utils/bulk_utils.py CHANGED

@@ -43,8 +43,14 @@ from graphiti_core.models.nodes.node_db_queries import (
     get_entity_node_save_bulk_query,
     get_episode_node_save_bulk_query,
 )
-from graphiti_core.nodes import EntityNode, EpisodeType, EpisodicNode
+from graphiti_core.nodes import EntityNode, EpisodeType, EpisodicNode
 from graphiti_core.utils.datetime_utils import convert_datetimes_to_strings
+from graphiti_core.utils.maintenance.dedup_helpers import (
+    DedupResolutionState,
+    _build_candidate_indexes,
+    _normalize_string_exact,
+    _resolve_with_similarity,
+)
 from graphiti_core.utils.maintenance.edge_operations import (
     extract_edges,
     resolve_extracted_edge,

@@ -63,6 +69,38 @@ logger = logging.getLogger(__name__)
 CHUNK_SIZE = 10
 
 
+def _build_directed_uuid_map(pairs: list[tuple[str, str]]) -> dict[str, str]:
+    """Collapse alias -> canonical chains while preserving direction.
+
+    The incoming pairs represent directed mappings discovered during node dedupe. We use a simple
+    union-find with iterative path compression to ensure every source UUID resolves to its ultimate
+    canonical target, even if aliases appear lexicographically smaller than the canonical UUID.
+    """
+
+    parent: dict[str, str] = {}
+
+    def find(uuid: str) -> str:
+        """Directed union-find lookup using iterative path compression."""
+        parent.setdefault(uuid, uuid)
+        root = uuid
+        while parent[root] != root:
+            root = parent[root]
+
+        while parent[uuid] != root:
+            next_uuid = parent[uuid]
+            parent[uuid] = root
+            uuid = next_uuid
+
+        return root
+
+    for source_uuid, target_uuid in pairs:
+        parent.setdefault(source_uuid, source_uuid)
+        parent.setdefault(target_uuid, target_uuid)
+        parent[find(source_uuid)] = find(target_uuid)
+
+    return {uuid: find(uuid) for uuid in parent}
+
+
 class RawEpisode(BaseModel):
     name: str
     uuid: str | None = Field(default=None)
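Note (illustrative, not part of the diff): a minimal usage sketch of the union-find helper above, assuming the private function remains importable from `graphiti_core.utils.bulk_utils`:

    from graphiti_core.utils.bulk_utils import _build_directed_uuid_map

    # Directed alias chains collapse to the final canonical UUID.
    pairs = [('alias-1', 'alias-2'), ('alias-2', 'canonical')]
    mapping = _build_directed_uuid_map(pairs)

    assert mapping['alias-1'] == 'canonical'
    assert mapping['alias-2'] == 'canonical'
    assert mapping['canonical'] == 'canonical'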
@@ -266,83 +304,111 @@ async def dedupe_nodes_bulk(
     episode_tuples: list[tuple[EpisodicNode, list[EpisodicNode]]],
     entity_types: dict[str, type[BaseModel]] | None = None,
 ) -> tuple[dict[str, list[EntityNode]], dict[str, str]]:
-
-    min_score = 0.8
-
-    # generate embeddings
-    await semaphore_gather(
-        *[create_entity_node_embeddings(embedder, nodes) for nodes in extracted_nodes]
-    )
-
-    # Find similar results
-    dedupe_tuples: list[tuple[list[EntityNode], list[EntityNode]]] = []
-    for i, nodes_i in enumerate(extracted_nodes):
-        existing_nodes: list[EntityNode] = []
-        for j, nodes_j in enumerate(extracted_nodes):
-            if i == j:
-                continue
-            existing_nodes += nodes_j
-
-        candidates_i: list[EntityNode] = []
-        for node in nodes_i:
-            for existing_node in existing_nodes:
-                # Approximate BM25 by checking for word overlaps (this is faster than creating many in-memory indices)
-                # This approach will cast a wider net than BM25, which is ideal for this use case
-                node_words = set(node.name.lower().split())
-                existing_node_words = set(existing_node.name.lower().split())
-                has_overlap = not node_words.isdisjoint(existing_node_words)
-                if has_overlap:
-                    candidates_i.append(existing_node)
-                    continue
-
-
-
-
-
-
-
-                    candidates_i.append(existing_node)
-
-        dedupe_tuples.append((nodes_i, candidates_i))
-
-    bulk_node_resolutions: list[
-        tuple[list[EntityNode], dict[str, str], list[tuple[EntityNode, EntityNode]]]
-    ] = await semaphore_gather(
+    """Resolve entity duplicates across an in-memory batch using a two-pass strategy.
+
+    1. Run :func:`resolve_extracted_nodes` for every episode in parallel so each batch item is
+       reconciled against the live graph just like the non-batch flow.
+    2. Re-run the deterministic similarity heuristics across the union of resolved nodes to catch
+       duplicates that only co-occur inside this batch, emitting a canonical UUID map that callers
+       can apply to edges and persistence.
+    """
+    first_pass_results = await semaphore_gather(
         *[
             resolve_extracted_nodes(
                 clients,
-
+                nodes,
                 episode_tuples[i][0],
                 episode_tuples[i][1],
                 entity_types,
-                existing_nodes_override=dedupe_tuples[i][1],
             )
-            for i,
+            for i, nodes in enumerate(extracted_nodes)
         ]
     )
 
+    episode_resolutions: list[tuple[str, list[EntityNode]]] = []
+    per_episode_uuid_maps: list[dict[str, str]] = []
     duplicate_pairs: list[tuple[str, str]] = []
-    for _, _, duplicates in bulk_node_resolutions:
-        for duplicate in duplicates:
-            n, m = duplicate
-            duplicate_pairs.append((n.uuid, m.uuid))
-
+
+    for (resolved_nodes, uuid_map, duplicates), (episode, _) in zip(
+        first_pass_results, episode_tuples, strict=True
+    ):
+        episode_resolutions.append((episode.uuid, resolved_nodes))
+        per_episode_uuid_maps.append(uuid_map)
+        duplicate_pairs.extend((source.uuid, target.uuid) for source, target in duplicates)
+
+    canonical_nodes: dict[str, EntityNode] = {}
+    for _, resolved_nodes in episode_resolutions:
+        for node in resolved_nodes:
+            # NOTE: this loop is O(n^2) in the number of nodes inside the batch because we rebuild
+            # the MinHash index for the accumulated canonical pool each time. The LRU-backed
+            # shingle cache keeps the constant factors low for typical batch sizes (≤ CHUNK_SIZE),
+            # but if batches grow significantly we should switch to an incremental index or chunked
+            # processing.
+            if not canonical_nodes:
+                canonical_nodes[node.uuid] = node
+                continue
+
+            existing_candidates = list(canonical_nodes.values())
+            normalized = _normalize_string_exact(node.name)
+            exact_match = next(
+                (
+                    candidate
+                    for candidate in existing_candidates
+                    if _normalize_string_exact(candidate.name) == normalized
+                ),
+                None,
+            )
+            if exact_match is not None:
+                if exact_match.uuid != node.uuid:
+                    duplicate_pairs.append((node.uuid, exact_match.uuid))
+                continue
+
+            indexes = _build_candidate_indexes(existing_candidates)
+            state = DedupResolutionState(
+                resolved_nodes=[None],
+                uuid_map={},
+                unresolved_indices=[],
+            )
+            _resolve_with_similarity([node], indexes, state)
+
+            resolved = state.resolved_nodes[0]
+            if resolved is None:
+                canonical_nodes[node.uuid] = node
+                continue
+
+            canonical_uuid = resolved.uuid
+            canonical_nodes.setdefault(canonical_uuid, resolved)
+            if canonical_uuid != node.uuid:
+                duplicate_pairs.append((node.uuid, canonical_uuid))
+
+    union_pairs: list[tuple[str, str]] = []
+    for uuid_map in per_episode_uuid_maps:
+        union_pairs.extend(uuid_map.items())
+    union_pairs.extend(duplicate_pairs)
+
+    compressed_map: dict[str, str] = _build_directed_uuid_map(union_pairs)
 
     nodes_by_episode: dict[str, list[EntityNode]] = {}
-    for
-
-        nodes_by_episode[
-            node_uuid_map[compressed_map.get(node.uuid, node.uuid)] for node in nodes
-        ]
+    for episode_uuid, resolved_nodes in episode_resolutions:
+        deduped_nodes: list[EntityNode] = []
+        seen: set[str] = set()
+        for node in resolved_nodes:
+            canonical_uuid = compressed_map.get(node.uuid, node.uuid)
+            if canonical_uuid in seen:
+                continue
+            seen.add(canonical_uuid)
+            canonical_node = canonical_nodes.get(canonical_uuid)
+            if canonical_node is None:
+                logger.error(
+                    'Canonical node %s missing during batch dedupe; falling back to %s',
+                    canonical_uuid,
+                    node.uuid,
+                )
+                canonical_node = node
+            deduped_nodes.append(canonical_node)
 
+        nodes_by_episode[episode_uuid] = deduped_nodes
 
     return nodes_by_episode, compressed_map

@@ -411,6 +477,7 @@ async def dedupe_edges_bulk(
             candidates,
             episode,
             edge_types,
+            set(edge_types),
             clients.ensure_ascii,
         )
         for episode, edge, candidates in dedupe_tuples
graphiti_core/utils/maintenance/dedup_helpers.py ADDED

@@ -0,0 +1,262 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import annotations
+
+import math
+import re
+from collections import defaultdict
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from functools import lru_cache
+from hashlib import blake2b
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from graphiti_core.nodes import EntityNode
+
+_NAME_ENTROPY_THRESHOLD = 1.5
+_MIN_NAME_LENGTH = 6
+_MIN_TOKEN_COUNT = 2
+_FUZZY_JACCARD_THRESHOLD = 0.9
+_MINHASH_PERMUTATIONS = 32
+_MINHASH_BAND_SIZE = 4
+
+
+def _normalize_string_exact(name: str) -> str:
+    """Lowercase text and collapse whitespace so equal names map to the same key."""
+    normalized = re.sub(r'[\s]+', ' ', name.lower())
+    return normalized.strip()
+
+
+def _normalize_name_for_fuzzy(name: str) -> str:
+    """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
+    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
+    normalized = normalized.strip()
+    return re.sub(r'[\s]+', ' ', normalized)
+
+
+def _name_entropy(normalized_name: str) -> float:
+    """Approximate text specificity using Shannon entropy over characters.
+
+    We strip spaces, count how often each character appears, and sum
+    probability * -log2(probability). Short or repetitive names yield low
+    entropy, which signals we should defer resolution to the LLM instead of
+    trusting fuzzy similarity.
+    """
+    if not normalized_name:
+        return 0.0
+
+    counts: dict[str, int] = {}
+    for char in normalized_name.replace(' ', ''):
+        counts[char] = counts.get(char, 0) + 1
+
+    total = sum(counts.values())
+    if total == 0:
+        return 0.0
+
+    entropy = 0.0
+    for count in counts.values():
+        probability = count / total
+        entropy -= probability * math.log2(probability)
+
+    return entropy
+
+
+def _has_high_entropy(normalized_name: str) -> bool:
+    """Filter out very short or low-entropy names that are unreliable for fuzzy matching."""
+    token_count = len(normalized_name.split())
+    if len(normalized_name) < _MIN_NAME_LENGTH and token_count < _MIN_TOKEN_COUNT:
+        return False
+
+    return _name_entropy(normalized_name) >= _NAME_ENTROPY_THRESHOLD
+
+
+def _shingles(normalized_name: str) -> set[str]:
+    """Create 3-gram shingles from the normalized name for MinHash calculations."""
+    cleaned = normalized_name.replace(' ', '')
+    if len(cleaned) < 2:
+        return {cleaned} if cleaned else set()
+
+    return {cleaned[i : i + 3] for i in range(len(cleaned) - 2)}
+
+
+def _hash_shingle(shingle: str, seed: int) -> int:
+    """Generate a deterministic 64-bit hash for a shingle given the permutation seed."""
+    digest = blake2b(f'{seed}:{shingle}'.encode(), digest_size=8)
+    return int.from_bytes(digest.digest(), 'big')
+
+
+def _minhash_signature(shingles: Iterable[str]) -> tuple[int, ...]:
+    """Compute the MinHash signature for the shingle set across predefined permutations."""
+    if not shingles:
+        return tuple()
+
+    seeds = range(_MINHASH_PERMUTATIONS)
+    signature: list[int] = []
+    for seed in seeds:
+        min_hash = min(_hash_shingle(shingle, seed) for shingle in shingles)
+        signature.append(min_hash)
+
+    return tuple(signature)
+
+
+def _lsh_bands(signature: Iterable[int]) -> list[tuple[int, ...]]:
+    """Split the MinHash signature into fixed-size bands for locality-sensitive hashing."""
+    signature_list = list(signature)
+    if not signature_list:
+        return []
+
+    bands: list[tuple[int, ...]] = []
+    for start in range(0, len(signature_list), _MINHASH_BAND_SIZE):
+        band = tuple(signature_list[start : start + _MINHASH_BAND_SIZE])
+        if len(band) == _MINHASH_BAND_SIZE:
+            bands.append(band)
+    return bands
+
+
+def _jaccard_similarity(a: set[str], b: set[str]) -> float:
+    """Return the Jaccard similarity between two shingle sets, handling empty edge cases."""
+    if not a and not b:
+        return 1.0
+    if not a or not b:
+        return 0.0
+
+    intersection = len(a.intersection(b))
+    union = len(a.union(b))
+    return intersection / union if union else 0.0
+
+
+@lru_cache(maxsize=512)
+def _cached_shingles(name: str) -> set[str]:
+    """Cache shingle sets per normalized name to avoid recomputation within a worker."""
+    return _shingles(name)
+
+
+@dataclass
+class DedupCandidateIndexes:
+    """Precomputed lookup structures that drive entity deduplication heuristics."""
+
+    existing_nodes: list[EntityNode]
+    nodes_by_uuid: dict[str, EntityNode]
+    normalized_existing: defaultdict[str, list[EntityNode]]
+    shingles_by_candidate: dict[str, set[str]]
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
+
+
+@dataclass
+class DedupResolutionState:
+    """Mutable resolution bookkeeping shared across deterministic and LLM passes."""
+
+    resolved_nodes: list[EntityNode | None]
+    uuid_map: dict[str, str]
+    unresolved_indices: list[int]
+    duplicate_pairs: list[tuple[EntityNode, EntityNode]] = field(default_factory=list)
+
+
+def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
+    """Precompute exact and fuzzy lookup structures once per dedupe run."""
+    normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
+    nodes_by_uuid: dict[str, EntityNode] = {}
+    shingles_by_candidate: dict[str, set[str]] = {}
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
+
+    for candidate in existing_nodes:
+        normalized = _normalize_string_exact(candidate.name)
+        normalized_existing[normalized].append(candidate)
+        nodes_by_uuid[candidate.uuid] = candidate
+
+        shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
+        shingles_by_candidate[candidate.uuid] = shingles
+
+        signature = _minhash_signature(shingles)
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            lsh_buckets[(band_index, band)].append(candidate.uuid)
+
+    return DedupCandidateIndexes(
+        existing_nodes=existing_nodes,
+        nodes_by_uuid=nodes_by_uuid,
+        normalized_existing=normalized_existing,
+        shingles_by_candidate=shingles_by_candidate,
+        lsh_buckets=lsh_buckets,
+    )
+
+
+def _resolve_with_similarity(
+    extracted_nodes: list[EntityNode],
+    indexes: DedupCandidateIndexes,
+    state: DedupResolutionState,
+) -> None:
+    """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
+    for idx, node in enumerate(extracted_nodes):
+        normalized_exact = _normalize_string_exact(node.name)
+        normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
+
+        if not _has_high_entropy(normalized_fuzzy):
+            state.unresolved_indices.append(idx)
+            continue
+
+        existing_matches = indexes.normalized_existing.get(normalized_exact, [])
+        if len(existing_matches) == 1:
+            match = existing_matches[0]
+            state.resolved_nodes[idx] = match
+            state.uuid_map[node.uuid] = match.uuid
+            if match.uuid != node.uuid:
+                state.duplicate_pairs.append((node, match))
+            continue
+        if len(existing_matches) > 1:
+            state.unresolved_indices.append(idx)
+            continue
+
+        shingles = _cached_shingles(normalized_fuzzy)
+        signature = _minhash_signature(shingles)
+        candidate_ids: set[str] = set()
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            candidate_ids.update(indexes.lsh_buckets.get((band_index, band), []))
+
+        best_candidate: EntityNode | None = None
+        best_score = 0.0
+        for candidate_id in candidate_ids:
+            candidate_shingles = indexes.shingles_by_candidate.get(candidate_id, set())
+            score = _jaccard_similarity(shingles, candidate_shingles)
+            if score > best_score:
+                best_score = score
+                best_candidate = indexes.nodes_by_uuid.get(candidate_id)
+
+        if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
+            state.resolved_nodes[idx] = best_candidate
+            state.uuid_map[node.uuid] = best_candidate.uuid
+            if best_candidate.uuid != node.uuid:
+                state.duplicate_pairs.append((node, best_candidate))
+            continue
+
+        state.unresolved_indices.append(idx)
+
+
+__all__ = [
+    'DedupCandidateIndexes',
+    'DedupResolutionState',
+    '_normalize_string_exact',
+    '_normalize_name_for_fuzzy',
+    '_has_high_entropy',
+    '_minhash_signature',
+    '_lsh_bands',
+    '_jaccard_similarity',
+    '_cached_shingles',
+    '_FUZZY_JACCARD_THRESHOLD',
+    '_build_candidate_indexes',
+    '_resolve_with_similarity',
+]
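Note (illustrative, not part of the diff): the fuzzy-matching helpers can be exercised directly; this sketch assumes the private names above stay importable from `graphiti_core.utils.maintenance.dedup_helpers`:

    from graphiti_core.utils.maintenance.dedup_helpers import (
        _cached_shingles,
        _jaccard_similarity,
        _lsh_bands,
        _minhash_signature,
        _normalize_name_for_fuzzy,
    )

    a = _normalize_name_for_fuzzy('Acme Corporation')
    b = _normalize_name_for_fuzzy('ACME   corporation')

    shingles_a, shingles_b = _cached_shingles(a), _cached_shingles(b)

    # The two names normalize to the same string, so their 3-gram shingle sets match exactly ...
    print(_jaccard_similarity(shingles_a, shingles_b))  # 1.0

    # ... and they land in the same LSH bucket for every MinHash band.
    print(_lsh_bands(_minhash_signature(shingles_a)) == _lsh_bands(_minhash_signature(shingles_b)))  # True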
graphiti_core/utils/maintenance/edge_operations.py CHANGED

@@ -41,6 +41,9 @@ from graphiti_core.search.search_config import SearchResults
 from graphiti_core.search.search_config_recipes import EDGE_HYBRID_SEARCH_RRF
 from graphiti_core.search.search_filters import SearchFilters
 from graphiti_core.utils.datetime_utils import ensure_utc, utc_now
+from graphiti_core.utils.maintenance.dedup_helpers import _normalize_string_exact
+
+DEFAULT_EDGE_NAME = 'RELATES_TO'
 
 logger = logging.getLogger(__name__)

@@ -280,8 +283,12 @@ async def resolve_extracted_edges(
     # Build entity hash table
     uuid_entity_map: dict[str, EntityNode] = {entity.uuid: entity for entity in entities}
 
-    # Determine which edge types are relevant for each edge
+    # Determine which edge types are relevant for each edge.
+    # `edge_types_lst` stores the subset of custom edge definitions whose
+    # node signature matches each extracted edge. Anything outside this subset
+    # should only stay on the edge if it is a non-custom (LLM generated) label.
     edge_types_lst: list[dict[str, type[BaseModel]]] = []
+    custom_type_names = set(edge_types or {})
     for extracted_edge in extracted_edges:
         source_node = uuid_entity_map.get(extracted_edge.source_node_uuid)
         target_node = uuid_entity_map.get(extracted_edge.target_node_uuid)

@@ -309,6 +316,20 @@ async def resolve_extracted_edges(
 
         edge_types_lst.append(extracted_edge_types)
 
+    for extracted_edge, extracted_edge_types in zip(extracted_edges, edge_types_lst, strict=True):
+        allowed_type_names = set(extracted_edge_types)
+        is_custom_name = extracted_edge.name in custom_type_names
+        if not allowed_type_names:
+            # No custom types are valid for this node pairing. Keep LLM generated
+            # labels, but flip disallowed custom names back to the default.
+            if is_custom_name and extracted_edge.name != DEFAULT_EDGE_NAME:
+                extracted_edge.name = DEFAULT_EDGE_NAME
+            continue
+        if is_custom_name and extracted_edge.name not in allowed_type_names:
+            # Custom name exists but it is not permitted for this source/target
+            # signature, so fall back to the default edge label.
+            extracted_edge.name = DEFAULT_EDGE_NAME
+
     # resolve edges with related edges in the graph and find invalidation candidates
     results: list[tuple[EntityEdge, list[EntityEdge], list[EntityEdge]]] = list(
         await semaphore_gather(

@@ -320,6 +341,7 @@ async def resolve_extracted_edges(
                 existing_edges,
                 episode,
                 extracted_edge_types,
+                custom_type_names,
                 clients.ensure_ascii,
             )
             for extracted_edge, related_edges, existing_edges, extracted_edge_types in zip(

@@ -391,12 +413,54 @@ async def resolve_extracted_edge(
     related_edges: list[EntityEdge],
     existing_edges: list[EntityEdge],
     episode: EpisodicNode,
-
+    edge_type_candidates: dict[str, type[BaseModel]] | None = None,
+    custom_edge_type_names: set[str] | None = None,
     ensure_ascii: bool = True,
 ) -> tuple[EntityEdge, list[EntityEdge], list[EntityEdge]]:
+    """Resolve an extracted edge against existing graph context.
+
+    Parameters
+    ----------
+    llm_client : LLMClient
+        Client used to invoke the LLM for deduplication and attribute extraction.
+    extracted_edge : EntityEdge
+        Newly extracted edge whose canonical representation is being resolved.
+    related_edges : list[EntityEdge]
+        Candidate edges with identical endpoints used for duplicate detection.
+    existing_edges : list[EntityEdge]
+        Broader set of edges evaluated for contradiction / invalidation.
+    episode : EpisodicNode
+        Episode providing content context when extracting edge attributes.
+    edge_type_candidates : dict[str, type[BaseModel]] | None
+        Custom edge types permitted for the current source/target signature.
+    custom_edge_type_names : set[str] | None
+        Full catalog of registered custom edge names. Used to distinguish
+        between disallowed custom types (which fall back to the default label)
+        and ad-hoc labels emitted by the LLM.
+    ensure_ascii : bool
+        Whether prompt payloads should coerce ASCII output.
+
+    Returns
+    -------
+    tuple[EntityEdge, list[EntityEdge], list[EntityEdge]]
+        The resolved edge, any duplicates, and edges to invalidate.
+    """
     if len(related_edges) == 0 and len(existing_edges) == 0:
         return extracted_edge, [], []
 
+    # Fast path: if the fact text and endpoints already exist verbatim, reuse the matching edge.
+    normalized_fact = _normalize_string_exact(extracted_edge.fact)
+    for edge in related_edges:
+        if (
+            edge.source_node_uuid == extracted_edge.source_node_uuid
+            and edge.target_node_uuid == extracted_edge.target_node_uuid
+            and _normalize_string_exact(edge.fact) == normalized_fact
+        ):
+            resolved = edge
+            if episode is not None and episode.uuid not in resolved.episodes:
+                resolved.episodes.append(episode.uuid)
+            return resolved, [], []
+
     start = time()
 
     # Prepare context for LLM

@@ -415,9 +479,9 @@ async def resolve_extracted_edge(
             'fact_type_name': type_name,
             'fact_type_description': type_model.__doc__,
         }
-        for i, (type_name, type_model) in enumerate(
+        for i, (type_name, type_model) in enumerate(edge_type_candidates.items())
     ]
-    if
+    if edge_type_candidates is not None
     else []
 )

@@ -454,7 +518,16 @@ async def resolve_extracted_edge(
     ]
 
     fact_type: str = response_object.fact_type
-
+    candidate_type_names = set(edge_type_candidates or {})
+    custom_type_names = custom_edge_type_names or set()
+
+    is_default_type = fact_type.upper() == 'DEFAULT'
+    is_custom_type = fact_type in custom_type_names
+    is_allowed_custom_type = fact_type in candidate_type_names
+
+    if is_allowed_custom_type:
+        # The LLM selected a custom type that is allowed for the node pair.
+        # Adopt the custom type and, if needed, extract its structured attributes.
         resolved_edge.name = fact_type
 
         edge_attributes_context = {

@@ -464,7 +537,7 @@ async def resolve_extracted_edge(
             'ensure_ascii': ensure_ascii,
         }
 
-        edge_model =
+        edge_model = edge_type_candidates.get(fact_type) if edge_type_candidates else None
         if edge_model is not None and len(edge_model.model_fields) != 0:
             edge_attributes_response = await llm_client.generate_response(
                 prompt_library.extract_edges.extract_attributes(edge_attributes_context),

@@ -473,6 +546,16 @@ async def resolve_extracted_edge(
         )
 
         resolved_edge.attributes = edge_attributes_response
+    elif not is_default_type and is_custom_type:
+        # The LLM picked a custom type that is not allowed for this signature.
+        # Reset to the default label and drop any structured attributes.
+        resolved_edge.name = DEFAULT_EDGE_NAME
+        resolved_edge.attributes = {}
+    elif not is_default_type:
+        # Non-custom labels are allowed to pass through so long as the LLM does
+        # not return the sentinel DEFAULT value.
+        resolved_edge.name = fact_type
+        resolved_edge.attributes = {}
 
     end = time()
     logger.debug(
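Note (illustrative, not part of the diff): the custom-type gating above boils down to one decision rule. A standalone sketch of that rule (not the package's API), with hypothetical type names:

    DEFAULT_EDGE_NAME = 'RELATES_TO'

    def pick_edge_name(current_name: str, fact_type: str, allowed: set[str], registered: set[str]) -> str:
        # Allowed custom types win; disallowed registered custom types fall back to the
        # default label; ad-hoc LLM labels pass through; the sentinel DEFAULT keeps the
        # edge's current name.
        if fact_type in allowed:
            return fact_type
        if fact_type.upper() == 'DEFAULT':
            return current_name
        if fact_type in registered:
            return DEFAULT_EDGE_NAME
        return fact_type

    assert pick_edge_name('RELATES_TO', 'WORKS_AT', {'WORKS_AT'}, {'WORKS_AT'}) == 'WORKS_AT'
    assert pick_edge_name('RELATES_TO', 'WORKS_AT', set(), {'WORKS_AT'}) == 'RELATES_TO'
    assert pick_edge_name('RELATES_TO', 'MENTIONS', set(), {'WORKS_AT'}) == 'MENTIONS'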
graphiti_core/utils/maintenance/node_operations.py CHANGED

@@ -24,7 +24,12 @@ from graphiti_core.graphiti_types import GraphitiClients
 from graphiti_core.helpers import MAX_REFLEXION_ITERATIONS, semaphore_gather
 from graphiti_core.llm_client import LLMClient
 from graphiti_core.llm_client.config import ModelSize
-from graphiti_core.nodes import
+from graphiti_core.nodes import (
+    EntityNode,
+    EpisodeType,
+    EpisodicNode,
+    create_entity_node_embeddings,
+)
 from graphiti_core.prompts import prompt_library
 from graphiti_core.prompts.dedupe_nodes import NodeDuplicate, NodeResolutions
 from graphiti_core.prompts.extract_nodes import (

@@ -38,7 +43,15 @@ from graphiti_core.search.search_config import SearchResults
 from graphiti_core.search.search_config_recipes import NODE_HYBRID_SEARCH_RRF
 from graphiti_core.search.search_filters import SearchFilters
 from graphiti_core.utils.datetime_utils import utc_now
-from graphiti_core.utils.maintenance.
+from graphiti_core.utils.maintenance.dedup_helpers import (
+    DedupCandidateIndexes,
+    DedupResolutionState,
+    _build_candidate_indexes,
+    _resolve_with_similarity,
+)
+from graphiti_core.utils.maintenance.edge_operations import (
+    filter_existing_duplicate_of_edges,
+)
 
 logger = logging.getLogger(__name__)

@@ -119,11 +132,13 @@ async def extract_nodes(
     )
     elif episode.source == EpisodeType.text:
         llm_response = await llm_client.generate_response(
-            prompt_library.extract_nodes.extract_text(context),
+            prompt_library.extract_nodes.extract_text(context),
+            response_model=ExtractedEntities,
         )
     elif episode.source == EpisodeType.json:
         llm_response = await llm_client.generate_response(
-            prompt_library.extract_nodes.extract_json(context),
+            prompt_library.extract_nodes.extract_json(context),
+            response_model=ExtractedEntities,
        )
 
     response_object = ExtractedEntities(**llm_response)

@@ -181,17 +196,12 @@ async def extract_nodes(
     return extracted_nodes
 
 
-async def
+async def _collect_candidate_nodes(
     clients: GraphitiClients,
     extracted_nodes: list[EntityNode],
-
-
-
-    existing_nodes_override: list[EntityNode] | None = None,
-) -> tuple[list[EntityNode], dict[str, str], list[tuple[EntityNode, EntityNode]]]:
-    llm_client = clients.llm_client
-    driver = clients.driver
-
+    existing_nodes_override: list[EntityNode] | None,
+) -> list[EntityNode]:
+    """Search per extracted name and return unique candidates with overrides honored in order."""
     search_results: list[SearchResults] = await semaphore_gather(
         *[
             search(

@@ -205,33 +215,44 @@ async def resolve_extracted_nodes(
         ]
     )
 
-    candidate_nodes: list[EntityNode] =
-        [node for result in search_results for node in result.nodes]
-        if existing_nodes_override is None
-        else existing_nodes_override
-    )
+    candidate_nodes: list[EntityNode] = [node for result in search_results for node in result.nodes]
 
-
+    if existing_nodes_override is not None:
+        candidate_nodes.extend(existing_nodes_override)
 
-
+    seen_candidate_uuids: set[str] = set()
+    ordered_candidates: list[EntityNode] = []
+    for candidate in candidate_nodes:
+        if candidate.uuid in seen_candidate_uuids:
+            continue
+        seen_candidate_uuids.add(candidate.uuid)
+        ordered_candidates.append(candidate)
+
+    return ordered_candidates
 
-
-
-
-
-
-
-
-
-
-
-
-
+
+async def _resolve_with_llm(
+    llm_client: LLMClient,
+    extracted_nodes: list[EntityNode],
+    indexes: DedupCandidateIndexes,
+    state: DedupResolutionState,
+    ensure_ascii: bool,
+    episode: EpisodicNode | None,
+    previous_episodes: list[EpisodicNode] | None,
+    entity_types: dict[str, type[BaseModel]] | None,
+) -> None:
+    """Escalate unresolved nodes to the dedupe prompt so the LLM can select or reject duplicates.
+
+    The guardrails below defensively ignore malformed or duplicate LLM responses so the
+    ingestion workflow remains deterministic even when the model misbehaves.
+    """
+    if not state.unresolved_indices:
+        return
 
     entity_types_dict: dict[str, type[BaseModel]] = entity_types if entity_types is not None else {}
 
-
+    llm_extracted_nodes = [extracted_nodes[i] for i in state.unresolved_indices]
+
     extracted_nodes_context = [
         {
             'id': i,

@@ -242,17 +263,29 @@ async def resolve_extracted_nodes(
             ).__doc__
             or 'Default Entity Type',
         }
-        for i, node in enumerate(
+        for i, node in enumerate(llm_extracted_nodes)
+    ]
+
+    existing_nodes_context = [
+        {
+            **{
+                'idx': i,
+                'name': candidate.name,
+                'entity_types': candidate.labels,
+            },
+            **candidate.attributes,
+        }
+        for i, candidate in enumerate(indexes.existing_nodes)
     ]
 
     context = {
         'extracted_nodes': extracted_nodes_context,
        'existing_nodes': existing_nodes_context,
         'episode_content': episode.content if episode is not None else '',
-        'previous_episodes':
-
-
-        'ensure_ascii':
+        'previous_episodes': (
+            [ep.content for ep in previous_episodes] if previous_episodes is not None else []
+        ),
+        'ensure_ascii': ensure_ascii,
     }
 
     llm_response = await llm_client.generate_response(

@@ -262,33 +295,105 @@ async def resolve_extracted_nodes(
 
     node_resolutions: list[NodeDuplicate] = NodeResolutions(**llm_response).entity_resolutions
 
-
-
-
+    valid_relative_range = range(len(state.unresolved_indices))
+    processed_relative_ids: set[int] = set()
+
     for resolution in node_resolutions:
-
+        relative_id: int = resolution.id
         duplicate_idx: int = resolution.duplicate_idx
 
-
+        if relative_id not in valid_relative_range:
+            logger.warning(
+                'Skipping invalid LLM dedupe id %s (unresolved indices: %s)',
+                relative_id,
+                state.unresolved_indices,
+            )
+            continue
+
+        if relative_id in processed_relative_ids:
+            logger.warning('Duplicate LLM dedupe id %s received; ignoring.', relative_id)
+            continue
+        processed_relative_ids.add(relative_id)
 
-
-
-            if 0 <= duplicate_idx < len(existing_nodes)
-            else extracted_node
-        )
+        original_index = state.unresolved_indices[relative_id]
+        extracted_node = extracted_nodes[original_index]
 
-
+        resolved_node: EntityNode
+        if duplicate_idx == -1:
+            resolved_node = extracted_node
+        elif 0 <= duplicate_idx < len(indexes.existing_nodes):
+            resolved_node = indexes.existing_nodes[duplicate_idx]
+        else:
+            logger.warning(
+                'Invalid duplicate_idx %s for extracted node %s; treating as no duplicate.',
+                duplicate_idx,
+                extracted_node.uuid,
+            )
+            resolved_node = extracted_node
 
-        resolved_nodes
-        uuid_map[extracted_node.uuid] = resolved_node.uuid
+        state.resolved_nodes[original_index] = resolved_node
+        state.uuid_map[extracted_node.uuid] = resolved_node.uuid
+        if resolved_node.uuid != extracted_node.uuid:
+            state.duplicate_pairs.append((extracted_node, resolved_node))
 
-
+
+async def resolve_extracted_nodes(
+    clients: GraphitiClients,
+    extracted_nodes: list[EntityNode],
+    episode: EpisodicNode | None = None,
+    previous_episodes: list[EpisodicNode] | None = None,
+    entity_types: dict[str, type[BaseModel]] | None = None,
+    existing_nodes_override: list[EntityNode] | None = None,
+) -> tuple[list[EntityNode], dict[str, str], list[tuple[EntityNode, EntityNode]]]:
+    """Search for existing nodes, resolve deterministic matches, then escalate holdouts to the LLM dedupe prompt."""
+    llm_client = clients.llm_client
+    driver = clients.driver
+    existing_nodes = await _collect_candidate_nodes(
+        clients,
+        extracted_nodes,
+        existing_nodes_override,
+    )
+
+    indexes: DedupCandidateIndexes = _build_candidate_indexes(existing_nodes)
+
+    state = DedupResolutionState(
+        resolved_nodes=[None] * len(extracted_nodes),
+        uuid_map={},
+        unresolved_indices=[],
+    )
+
+    _resolve_with_similarity(extracted_nodes, indexes, state)
+
+    await _resolve_with_llm(
+        llm_client,
+        extracted_nodes,
+        indexes,
+        state,
+        clients.ensure_ascii,
+        episode,
+        previous_episodes,
+        entity_types,
+    )
+
+    for idx, node in enumerate(extracted_nodes):
+        if state.resolved_nodes[idx] is None:
+            state.resolved_nodes[idx] = node
+            state.uuid_map[node.uuid] = node.uuid
+
+    logger.debug(
+        'Resolved nodes: %s',
+        [(node.name, node.uuid) for node in state.resolved_nodes if node is not None],
+    )
 
     new_node_duplicates: list[
         tuple[EntityNode, EntityNode]
-    ] = await filter_existing_duplicate_of_edges(driver,
+    ] = await filter_existing_duplicate_of_edges(driver, state.duplicate_pairs)
 
-    return
+    return (
+        [node for node in state.resolved_nodes if node is not None],
+        state.uuid_map,
+        new_node_duplicates,
+    )

@@ -307,9 +412,11 @@ async def extract_attributes_from_nodes(
                 node,
                 episode,
                 previous_episodes,
-
-
-
+                (
+                    entity_types.get(next((item for item in node.labels if item != 'Entity'), ''))
+                    if entity_types is not None
+                    else None
+                ),
                 clients.ensure_ascii,
             )
             for node in nodes

@@ -339,18 +446,18 @@ async def extract_attributes_from_node(
     attributes_context: dict[str, Any] = {
         'node': node_context,
         'episode_content': episode.content if episode is not None else '',
-        'previous_episodes':
-
-
+        'previous_episodes': (
+            [ep.content for ep in previous_episodes] if previous_episodes is not None else []
+        ),
         'ensure_ascii': ensure_ascii,
     }
 
     summary_context: dict[str, Any] = {
         'node': node_context,
         'episode_content': episode.content if episode is not None else '',
-        'previous_episodes':
-
-
+        'previous_episodes': (
+            [ep.content for ep in previous_episodes] if previous_episodes is not None else []
+        ),
         'ensure_ascii': ensure_ascii,
     }
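Note (illustrative, not part of the diff): the call shape of the refactored resolver, assuming a fully configured `GraphitiClients` and real `EpisodicNode` objects are available:

    from graphiti_core.utils.maintenance.node_operations import resolve_extracted_nodes

    async def dedupe_episode(clients, extracted_nodes, episode, previous_episodes):
        resolved_nodes, uuid_map, duplicate_pairs = await resolve_extracted_nodes(
            clients,
            extracted_nodes,
            episode=episode,
            previous_episodes=previous_episodes,
            entity_types=None,
        )
        # uuid_map maps each extracted node's uuid to its canonical uuid; duplicate_pairs
        # holds (extracted, canonical) pairs that do not yet have duplicate-of edges in the graph.
        return resolved_nodes, uuid_map, duplicate_pairs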
{graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.21.0rc7.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: graphiti-core
-Version: 0.21.
+Version: 0.21.0rc7
 Summary: A temporal graph building library
 Project-URL: Homepage, https://help.getzep.com/graphiti/graphiti/overview
 Project-URL: Repository, https://github.com/getzep/graphiti

@@ -20,6 +20,7 @@ Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.49.0; extra == 'anthropic'
 Provides-Extra: dev
 Requires-Dist: anthropic>=0.49.0; extra == 'dev'
+Requires-Dist: boto3>=1.39.16; extra == 'dev'
 Requires-Dist: diskcache-stubs>=5.6.3.6.20240818; extra == 'dev'
 Requires-Dist: falkordb<2.0.0,>=1.1.2; extra == 'dev'
 Requires-Dist: google-genai>=1.8.0; extra == 'dev'

@@ -28,9 +29,11 @@ Requires-Dist: ipykernel>=6.29.5; extra == 'dev'
 Requires-Dist: jupyterlab>=4.2.4; extra == 'dev'
 Requires-Dist: kuzu>=0.11.2; extra == 'dev'
 Requires-Dist: langchain-anthropic>=0.2.4; extra == 'dev'
+Requires-Dist: langchain-aws>=0.2.29; extra == 'dev'
 Requires-Dist: langchain-openai>=0.2.6; extra == 'dev'
 Requires-Dist: langgraph>=0.2.15; extra == 'dev'
 Requires-Dist: langsmith>=0.1.108; extra == 'dev'
+Requires-Dist: opensearch-py>=3.0.0; extra == 'dev'
 Requires-Dist: pyright>=1.1.404; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
 Requires-Dist: pytest-xdist>=3.6.1; extra == 'dev'
{graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.21.0rc7.dist-info}/RECORD CHANGED

@@ -2,7 +2,7 @@ graphiti_core/__init__.py,sha256=e5SWFkRiaUwfprYIeIgVIh7JDedNiloZvd3roU-0aDY,55
 graphiti_core/edges.py,sha256=PhJm_s28cHLEaIqcw66wP16hOq4P4bVQbC_sESHQkXU,20919
 graphiti_core/errors.py,sha256=cH_v9TPgEPeQE6GFOHIg5TvejpUCBddGarMY2Whxbwc,2707
 graphiti_core/graph_queries.py,sha256=ZWMqAo5pwb8PO5ddg4zZ0ArhHWuWV42g3R9ULIxsHOs,8058
-graphiti_core/graphiti.py,sha256=
+graphiti_core/graphiti.py,sha256=5Y3SdcC_Ebhp-oqbbIxb0KGshWU24EQx4YYKvK8Id8g,41935
 graphiti_core/graphiti_types.py,sha256=C_p2XwScQlCzo7ets097TrSLs9ATxPZQ4WCsxDS7QHc,1066
 graphiti_core/helpers.py,sha256=q8kbL9gz8igdlh-oMUS-ylUyeMlXZb-ccf-HQkrES_0,5184
 graphiti_core/nodes.py,sha256=wYLQcVEXvQMxTpTc9LWSoPTzzaoUOm0rl07c9wS1XSY,30323

@@ -27,14 +27,14 @@ graphiti_core/embedder/voyage.py,sha256=oJHAZiNqjdEJOKgoKfGWcxK2-Ewqn5UB3vrBwIwP
 graphiti_core/llm_client/__init__.py,sha256=QgBWUiCeBp6YiA_xqyrDvJ9jIyy1hngH8g7FWahN3nw,776
 graphiti_core/llm_client/anthropic_client.py,sha256=xTFcrgMDK77BwnChBhYj51Jaa2mRNI850oJv2pKZI0A,12892
 graphiti_core/llm_client/azure_openai_client.py,sha256=ekERggAekbb7enes1RJqdRChf_mjaZTFXsnMbxO7azQ,2497
-graphiti_core/llm_client/client.py,sha256=
+graphiti_core/llm_client/client.py,sha256=KUWq7Gq9J4PdP06lLCBEb8OSZOE6luPqaQ3xgtpZwWg,6835
 graphiti_core/llm_client/config.py,sha256=pivp29CDIbDPqgw5NF9Ok2AwcqTV5z5_Q1bgNs1CDGs,2560
 graphiti_core/llm_client/errors.py,sha256=pn6brRiLW60DAUIXJYKBT6MInrS4ueuH1hNLbn_JbQo,1243
-graphiti_core/llm_client/gemini_client.py,sha256=
+graphiti_core/llm_client/gemini_client.py,sha256=AxD7sqsPQdgfcZCBIGN302s1hFYlBN9FOQcDEV0tw08,17725
 graphiti_core/llm_client/groq_client.py,sha256=bYLE_cg1QEhugsJOXh4b1vPbxagKeMWqk48240GCzMs,2922
-graphiti_core/llm_client/openai_base_client.py,sha256=
+graphiti_core/llm_client/openai_base_client.py,sha256=LeEBZ33Y_bIz-YSr6aCbYKMI9r0SNPeZkALXQ0iFsSE,8488
 graphiti_core/llm_client/openai_client.py,sha256=AuaCFQFMJEGzBkFVouccq3XentmWRIKW0RLRBCUMm7Y,3763
-graphiti_core/llm_client/openai_generic_client.py,sha256=
+graphiti_core/llm_client/openai_generic_client.py,sha256=lyOQwzIMVb9pk3WWrU5zsG38J26QGKebxC40-lRYMJg,7007
 graphiti_core/llm_client/utils.py,sha256=zKpxXEbKa369m4W7RDEf-m56kH46V1Mx3RowcWZEWWs,1000
 graphiti_core/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 graphiti_core/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

@@ -44,11 +44,11 @@ graphiti_core/models/nodes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
 graphiti_core/models/nodes/node_db_queries.py,sha256=TCHZKG5bQNarV9C5k4hOFFqc-LwTVQ8Pnd6okVVNKbo,12826
 graphiti_core/prompts/__init__.py,sha256=EA-x9xUki9l8wnu2l8ek_oNf75-do5tq5hVq7Zbv8Kw,101
 graphiti_core/prompts/dedupe_edges.py,sha256=WRXQi7JQZdIfKDICWyU7Wbs5WyD_KBblLBSeKdbLyuk,5914
-graphiti_core/prompts/dedupe_nodes.py,sha256=
+graphiti_core/prompts/dedupe_nodes.py,sha256=H4sIzpi1gBwPedTMhdY175jnLj5JtnEeb_WNITitPLU,9171
 graphiti_core/prompts/eval.py,sha256=ijwxbE87G678imdhfPvRujepQMq_JZ3XHX4vOAcVnVI,5507
 graphiti_core/prompts/extract_edge_dates.py,sha256=3Drs3CmvP0gJN5BidWSxrNvLet3HPoTybU3BUIAoc0Y,4218
 graphiti_core/prompts/extract_edges.py,sha256=mnncxb6lyr3ufKajRAh09czmJawiEM54sSPNy9ukiio,6888
-graphiti_core/prompts/extract_nodes.py,sha256=
+graphiti_core/prompts/extract_nodes.py,sha256=GYX97qlSSrR_3QLc48EGCti8tdC1_OKpEdAR0Y2wfVY,11629
 graphiti_core/prompts/invalidate_edges.py,sha256=yfpcs_pyctnoM77ULPZXEtKW0oHr1MeLsJzC5yrE-o4,3547
 graphiti_core/prompts/lib.py,sha256=DCyHePM4_q-CptTpEXGO_dBv9k7xDtclEaB1dGu7EcI,4092
 graphiti_core/prompts/models.py,sha256=NgxdbPHJpBEcpbXovKyScgpBc73Q-GIW-CBDlBtDjto,894

@@ -64,17 +64,18 @@ graphiti_core/search/search_utils.py,sha256=ak1aBeKNuxS7szydNHwva2ABWSRlQ0S_v8ZO
 graphiti_core/telemetry/__init__.py,sha256=5kALLDlU9bb2v19CdN7qVANsJWyfnL9E60J6FFgzm3o,226
 graphiti_core/telemetry/telemetry.py,sha256=47LrzOVBCcZxsYPsnSxWFiztHoxYKKxPwyRX0hnbDGc,3230
 graphiti_core/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-graphiti_core/utils/bulk_utils.py,sha256=
+graphiti_core/utils/bulk_utils.py,sha256=0rpBaPg1CBQu7djcSS9XWfv9T1unRRVW8_ge_Tf7lF0,20288
 graphiti_core/utils/datetime_utils.py,sha256=J-zYSq7-H-2n9hYOXNIun12kM10vNX9mMATGR_egTmY,1806
 graphiti_core/utils/maintenance/__init__.py,sha256=vW4H1KyapTl-OOz578uZABYcpND4wPx3Vt6aAPaXh78,301
 graphiti_core/utils/maintenance/community_operations.py,sha256=XMiokEemn96GlvjkOvbo9hIX04Fea3eVj408NHG5P4o,11042
-graphiti_core/utils/maintenance/
+graphiti_core/utils/maintenance/dedup_helpers.py,sha256=B7k6KkB6Sii8PZCWNNTvsNiy4BNTNWpoLeGgrPLq6BE,9220
+graphiti_core/utils/maintenance/edge_operations.py,sha256=9bRCI_3loKJX3EAMLpNULWLnhSDCHsCghiqbXPdicPM,24808
 graphiti_core/utils/maintenance/graph_data_operations.py,sha256=42icj3S_ELAJ-NK3jVS_rg_243dmnaZOyUitJj_uJ-M,6085
-graphiti_core/utils/maintenance/node_operations.py,sha256=
+graphiti_core/utils/maintenance/node_operations.py,sha256=TKpXPtnTVxxan8I1xQyVkGn3zyRdb_Q00cgUpLcloig,16860
 graphiti_core/utils/maintenance/temporal_operations.py,sha256=IIaVtShpVkOYe6haxz3a1x3v54-MzaEXG8VsxFUNeoY,3582
 graphiti_core/utils/maintenance/utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 graphiti_core/utils/ontology_utils/entity_types_utils.py,sha256=4eVgxLWY6Q8k9cRJ5pW59IYF--U4nXZsZIGOVb_yHfQ,1285
-graphiti_core-0.21.
-graphiti_core-0.21.
-graphiti_core-0.21.
-graphiti_core-0.21.
+graphiti_core-0.21.0rc7.dist-info/METADATA,sha256=pAEEXoHTF8p8L1ds3kF_6KEjdhyB2iX9DPTuucxMe0o,27084
+graphiti_core-0.21.0rc7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+graphiti_core-0.21.0rc7.dist-info/licenses/LICENSE,sha256=KCUwCyDXuVEgmDWkozHyniRyWjnWUWjkuDHfU6o3JlA,11325
+graphiti_core-0.21.0rc7.dist-info/RECORD,,
{graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.21.0rc7.dist-info}/WHEEL
File without changes

{graphiti_core-0.21.0rc6.dist-info → graphiti_core-0.21.0rc7.dist-info}/licenses/LICENSE
File without changes