graphiti-core 0.17.4__py3-none-any.whl → 0.24.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. graphiti_core/cross_encoder/gemini_reranker_client.py +1 -1
  2. graphiti_core/cross_encoder/openai_reranker_client.py +1 -1
  3. graphiti_core/decorators.py +110 -0
  4. graphiti_core/driver/driver.py +62 -2
  5. graphiti_core/driver/falkordb_driver.py +215 -23
  6. graphiti_core/driver/graph_operations/graph_operations.py +191 -0
  7. graphiti_core/driver/kuzu_driver.py +182 -0
  8. graphiti_core/driver/neo4j_driver.py +61 -8
  9. graphiti_core/driver/neptune_driver.py +305 -0
  10. graphiti_core/driver/search_interface/search_interface.py +89 -0
  11. graphiti_core/edges.py +264 -132
  12. graphiti_core/embedder/azure_openai.py +10 -3
  13. graphiti_core/embedder/client.py +2 -1
  14. graphiti_core/graph_queries.py +114 -101
  15. graphiti_core/graphiti.py +582 -255
  16. graphiti_core/graphiti_types.py +2 -0
  17. graphiti_core/helpers.py +21 -14
  18. graphiti_core/llm_client/anthropic_client.py +142 -52
  19. graphiti_core/llm_client/azure_openai_client.py +57 -19
  20. graphiti_core/llm_client/client.py +83 -21
  21. graphiti_core/llm_client/config.py +1 -1
  22. graphiti_core/llm_client/gemini_client.py +75 -57
  23. graphiti_core/llm_client/openai_base_client.py +94 -50
  24. graphiti_core/llm_client/openai_client.py +28 -8
  25. graphiti_core/llm_client/openai_generic_client.py +91 -56
  26. graphiti_core/models/edges/edge_db_queries.py +259 -35
  27. graphiti_core/models/nodes/node_db_queries.py +311 -32
  28. graphiti_core/nodes.py +388 -164
  29. graphiti_core/prompts/dedupe_edges.py +42 -31
  30. graphiti_core/prompts/dedupe_nodes.py +56 -39
  31. graphiti_core/prompts/eval.py +4 -4
  32. graphiti_core/prompts/extract_edges.py +23 -14
  33. graphiti_core/prompts/extract_nodes.py +73 -32
  34. graphiti_core/prompts/prompt_helpers.py +39 -0
  35. graphiti_core/prompts/snippets.py +29 -0
  36. graphiti_core/prompts/summarize_nodes.py +23 -25
  37. graphiti_core/search/search.py +154 -74
  38. graphiti_core/search/search_config.py +39 -4
  39. graphiti_core/search/search_filters.py +109 -31
  40. graphiti_core/search/search_helpers.py +5 -6
  41. graphiti_core/search/search_utils.py +1360 -473
  42. graphiti_core/tracer.py +193 -0
  43. graphiti_core/utils/bulk_utils.py +216 -90
  44. graphiti_core/utils/datetime_utils.py +13 -0
  45. graphiti_core/utils/maintenance/community_operations.py +62 -38
  46. graphiti_core/utils/maintenance/dedup_helpers.py +262 -0
  47. graphiti_core/utils/maintenance/edge_operations.py +286 -126
  48. graphiti_core/utils/maintenance/graph_data_operations.py +44 -74
  49. graphiti_core/utils/maintenance/node_operations.py +320 -158
  50. graphiti_core/utils/maintenance/temporal_operations.py +11 -3
  51. graphiti_core/utils/ontology_utils/entity_types_utils.py +1 -1
  52. graphiti_core/utils/text_utils.py +53 -0
  53. {graphiti_core-0.17.4.dist-info → graphiti_core-0.24.3.dist-info}/METADATA +221 -87
  54. graphiti_core-0.24.3.dist-info/RECORD +86 -0
  55. {graphiti_core-0.17.4.dist-info → graphiti_core-0.24.3.dist-info}/WHEEL +1 -1
  56. graphiti_core-0.17.4.dist-info/RECORD +0 -77
  57. /graphiti_core/{utils/maintenance/utils.py → migrations/__init__.py} +0 -0
  58. {graphiti_core-0.17.4.dist-info → graphiti_core-0.24.3.dist-info}/licenses/LICENSE +0 -0
graphiti_core/utils/maintenance/community_operations.py
@@ -4,11 +4,12 @@ from collections import defaultdict
 
 from pydantic import BaseModel
 
-from graphiti_core.driver.driver import GraphDriver
+from graphiti_core.driver.driver import GraphDriver, GraphProvider
 from graphiti_core.edges import CommunityEdge
 from graphiti_core.embedder import EmbedderClient
 from graphiti_core.helpers import semaphore_gather
 from graphiti_core.llm_client import LLMClient
+from graphiti_core.models.nodes.node_db_queries import COMMUNITY_NODE_RETURN
 from graphiti_core.nodes import CommunityNode, EntityNode, get_community_node_from_record
 from graphiti_core.prompts import prompt_library
 from graphiti_core.prompts.summarize_nodes import Summary, SummaryDescription
@@ -33,10 +34,11 @@ async def get_community_clusters(
     if group_ids is None:
         group_id_values, _, _ = await driver.execute_query(
             """
-            MATCH (n:Entity WHERE n.group_id IS NOT NULL)
-            RETURN
-                collect(DISTINCT n.group_id) AS group_ids
-            """,
+            MATCH (n:Entity)
+            WHERE n.group_id IS NOT NULL
+            RETURN
+                collect(DISTINCT n.group_id) AS group_ids
+            """
         )
 
         group_ids = group_id_values[0]['group_ids'] if group_id_values else []
@@ -45,14 +47,21 @@ async def get_community_clusters(
         projection: dict[str, list[Neighbor]] = {}
         nodes = await EntityNode.get_by_group_ids(driver, [group_id])
         for node in nodes:
-            records, _, _ = await driver.execute_query(
+            match_query = """
+                MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[e:RELATES_TO]-(m: Entity {group_id: $group_id})
+            """
+            if driver.provider == GraphProvider.KUZU:
+                match_query = """
+                    MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-(m: Entity {group_id: $group_id})
                 """
-                MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[r:RELATES_TO]-(m: Entity {group_id: $group_id})
-                WITH count(r) AS count, m.uuid AS uuid
-                RETURN
-                    uuid,
-                    count
-                """,
+            records, _, _ = await driver.execute_query(
+                match_query
+                + """
+                WITH count(e) AS count, m.uuid AS uuid
+                RETURN
+                    uuid,
+                    count
+                """,
                 uuid=node.uuid,
                 group_id=group_id,
             )
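
A pattern that recurs across this release shows up here for the first time: the Cypher is assembled per provider, because Kuzu materializes `RELATES_TO` edges as an intermediate `RelatesToNode_` node, so the same logical neighborhood takes two hops instead of one. A minimal sketch of the branch, under the assumption that only the MATCH clause differs between backends (the `neighbor_match_clause` helper name is hypothetical):

```python
from graphiti_core.driver.driver import GraphDriver, GraphProvider


def neighbor_match_clause(driver: GraphDriver) -> str:
    # Hypothetical helper illustrating the provider branch used in the hunk.
    # Kuzu stores RELATES_TO facts as an intermediate RelatesToNode_, so the
    # neighborhood pattern needs two hops instead of one.
    if driver.provider == GraphProvider.KUZU:
        return """
            MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-(m:Entity {group_id: $group_id})
        """
    return """
        MATCH (n:Entity {group_id: $group_id, uuid: $uuid})-[e:RELATES_TO]-(m:Entity {group_id: $group_id})
    """
```

Because the Kuzu pattern binds `e` to the intermediate node rather than to the relationship, the shared `WITH count(e) AS count` projection counts the same thing on both backends.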
@@ -124,10 +133,14 @@ def label_propagation(projection: dict[str, list[Neighbor]]) -> list[list[str]]:
 
 async def summarize_pair(llm_client: LLMClient, summary_pair: tuple[str, str]) -> str:
     # Prepare context for LLM
-    context = {'node_summaries': [{'summary': summary} for summary in summary_pair]}
+    context = {
+        'node_summaries': [{'summary': summary} for summary in summary_pair],
+    }
 
     llm_response = await llm_client.generate_response(
-        prompt_library.summarize_nodes.summarize_pair(context), response_model=Summary
+        prompt_library.summarize_nodes.summarize_pair(context),
+        response_model=Summary,
+        prompt_name='summarize_nodes.summarize_pair',
     )
 
     pair_summary = llm_response.get('summary', '')
@@ -136,11 +149,14 @@ async def summarize_pair(llm_client: LLMClient, summary_pair: tuple[str, str]) -
 
 
 async def generate_summary_description(llm_client: LLMClient, summary: str) -> str:
-    context = {'summary': summary}
+    context = {
+        'summary': summary,
+    }
 
     llm_response = await llm_client.generate_response(
         prompt_library.summarize_nodes.summary_description(context),
         response_model=SummaryDescription,
+        prompt_name='summarize_nodes.summary_description',
     )
 
     description = llm_response.get('description', '')
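
Both summarization helpers now tag their calls with a `prompt_name` in addition to the structured `response_model`; given the new `graphiti_core/tracer.py` in the file list, the label presumably identifies the prompt in traces. A sketch of the call shape, assuming an `LLMClient` is already constructed:

```python
from graphiti_core.llm_client import LLMClient
from graphiti_core.prompts import prompt_library
from graphiti_core.prompts.summarize_nodes import Summary


async def summarize_pair_example(llm_client: LLMClient, left: str, right: str) -> str:
    # Same call shape as the hunks above: a pydantic response model the reply
    # is parsed into, plus a prompt_name label for tracing.
    llm_response = await llm_client.generate_response(
        prompt_library.summarize_nodes.summarize_pair(
            {'node_summaries': [{'summary': left}, {'summary': right}]}
        ),
        response_model=Summary,
        prompt_name='summarize_nodes.summarize_pair',
    )
    return llm_response.get('summary', '')
```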
@@ -191,7 +207,9 @@ async def build_community(
 
 
 async def build_communities(
-    driver: GraphDriver, llm_client: LLMClient, group_ids: list[str] | None
+    driver: GraphDriver,
+    llm_client: LLMClient,
+    group_ids: list[str] | None,
 ) -> tuple[list[CommunityNode], list[CommunityEdge]]:
     community_clusters = await get_community_clusters(driver, group_ids)
 
@@ -219,9 +237,9 @@ async def build_communities(
 async def remove_communities(driver: GraphDriver):
     await driver.execute_query(
         """
-    MATCH (c:Community)
-    DETACH DELETE c
-    """,
+        MATCH (c:Community)
+        DETACH DELETE c
+        """
     )
 
 
@@ -231,14 +249,10 @@ async def determine_entity_community(
     # Check if the node is already part of a community
     records, _, _ = await driver.execute_query(
         """
-        MATCH (c:Community)-[:HAS_MEMBER]->(n:Entity {uuid: $entity_uuid})
-        RETURN
-            c.uuid As uuid,
-            c.name AS name,
-            c.group_id AS group_id,
-            c.created_at AS created_at,
-            c.summary AS summary
-        """,
+        MATCH (c:Community)-[:HAS_MEMBER]->(n:Entity {uuid: $entity_uuid})
+        RETURN
+        """
+        + COMMUNITY_NODE_RETURN,
         entity_uuid=entity.uuid,
     )
 
@@ -246,16 +260,19 @@ async def determine_entity_community(
         return get_community_node_from_record(records[0]), False
 
     # If the node has no community, add it to the mode community of surrounding entities
+    match_query = """
+        MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
+    """
+    if driver.provider == GraphProvider.KUZU:
+        match_query = """
+            MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(e:RelatesToNode_)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
+        """
     records, _, _ = await driver.execute_query(
+        match_query
+        + """
+        RETURN
         """
-        MATCH (c:Community)-[:HAS_MEMBER]->(m:Entity)-[:RELATES_TO]-(n:Entity {uuid: $entity_uuid})
-        RETURN
-            c.uuid As uuid,
-            c.name AS name,
-            c.group_id AS group_id,
-            c.created_at AS created_at,
-            c.summary AS summary
-        """,
+        + COMMUNITY_NODE_RETURN,
         entity_uuid=entity.uuid,
     )
 
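Both queries in `determine_entity_community` now splice in the shared `COMMUNITY_NODE_RETURN` fragment from `node_db_queries` instead of hand-writing the projection (the old inline version even carried a `c.uuid As uuid` casing slip). A sketch of the composition; the fragment body below is an assumption based on the fields the inline query used to project, and the real constant may differ:

```python
# Assumed shape of the shared fragment from
# graphiti_core/models/nodes/node_db_queries.py; the real constant may
# project additional fields.
COMMUNITY_NODE_RETURN = """
    c.uuid AS uuid,
    c.name AS name,
    c.group_id AS group_id,
    c.created_at AS created_at,
    c.summary AS summary
"""

query = (
    """
    MATCH (c:Community)-[:HAS_MEMBER]->(n:Entity {uuid: $entity_uuid})
    RETURN
    """
    + COMMUNITY_NODE_RETURN
)
```

Centralizing the projection keeps `get_community_node_from_record` in sync with every query that returns communities.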
@@ -285,12 +302,15 @@ async def determine_entity_community(
 
 
 async def update_community(
-    driver: GraphDriver, llm_client: LLMClient, embedder: EmbedderClient, entity: EntityNode
-):
+    driver: GraphDriver,
+    llm_client: LLMClient,
+    embedder: EmbedderClient,
+    entity: EntityNode,
+) -> tuple[list[CommunityNode], list[CommunityEdge]]:
     community, is_new = await determine_entity_community(driver, entity)
 
     if community is None:
-        return
+        return [], []
 
     new_summary = await summarize_pair(llm_client, (entity.summary, community.summary))
     new_name = await generate_summary_description(llm_client, new_summary)
@@ -298,10 +318,14 @@ async def update_community(
     community.summary = new_summary
     community.name = new_name
 
+    community_edges = []
     if is_new:
         community_edge = (build_community_edges([entity], community, utc_now()))[0]
         await community_edge.save(driver)
+        community_edges.append(community_edge)
 
     await community.generate_name_embedding(embedder)
 
     await community.save(driver)
+
+    return [community], community_edges
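
`update_community` used to return `None`; it now reports the touched community and any membership edge it created, and the early-exit path returns empty lists so callers can unpack unconditionally. A hypothetical call site (`driver`, `llm_client`, `embedder`, and `entity` stand in for already-constructed objects):

```python
from graphiti_core.driver.driver import GraphDriver
from graphiti_core.embedder import EmbedderClient
from graphiti_core.llm_client import LLMClient
from graphiti_core.nodes import EntityNode
from graphiti_core.utils.maintenance.community_operations import update_community


async def refresh_community_for(
    driver: GraphDriver, llm_client: LLMClient, embedder: EmbedderClient, entity: EntityNode
) -> None:
    # Hypothetical call site showing the new return contract.
    communities, community_edges = await update_community(driver, llm_client, embedder, entity)
    if not communities:
        return  # entity had no community and no neighboring community to join
    for edge in community_edges:
        # Non-empty only when the entity was newly linked to a community.
        print(f'new HAS_MEMBER edge: {edge.uuid}')
```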
graphiti_core/utils/maintenance/dedup_helpers.py (new file)
@@ -0,0 +1,262 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import annotations
+
+import math
+import re
+from collections import defaultdict
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from functools import lru_cache
+from hashlib import blake2b
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from graphiti_core.nodes import EntityNode
+
+_NAME_ENTROPY_THRESHOLD = 1.5
+_MIN_NAME_LENGTH = 6
+_MIN_TOKEN_COUNT = 2
+_FUZZY_JACCARD_THRESHOLD = 0.9
+_MINHASH_PERMUTATIONS = 32
+_MINHASH_BAND_SIZE = 4
+
+
+def _normalize_string_exact(name: str) -> str:
+    """Lowercase text and collapse whitespace so equal names map to the same key."""
+    normalized = re.sub(r'[\s]+', ' ', name.lower())
+    return normalized.strip()
+
+
+def _normalize_name_for_fuzzy(name: str) -> str:
+    """Produce a fuzzier form that keeps alphanumerics and apostrophes for n-gram shingles."""
+    normalized = re.sub(r"[^a-z0-9' ]", ' ', _normalize_string_exact(name))
+    normalized = normalized.strip()
+    return re.sub(r'[\s]+', ' ', normalized)
+
+
+def _name_entropy(normalized_name: str) -> float:
+    """Approximate text specificity using Shannon entropy over characters.
+
+    We strip spaces, count how often each character appears, and sum
+    probability * -log2(probability). Short or repetitive names yield low
+    entropy, which signals we should defer resolution to the LLM instead of
+    trusting fuzzy similarity.
+    """
+    if not normalized_name:
+        return 0.0
+
+    counts: dict[str, int] = {}
+    for char in normalized_name.replace(' ', ''):
+        counts[char] = counts.get(char, 0) + 1
+
+    total = sum(counts.values())
+    if total == 0:
+        return 0.0
+
+    entropy = 0.0
+    for count in counts.values():
+        probability = count / total
+        entropy -= probability * math.log2(probability)
+
+    return entropy
+
+
+def _has_high_entropy(normalized_name: str) -> bool:
+    """Filter out very short or low-entropy names that are unreliable for fuzzy matching."""
+    token_count = len(normalized_name.split())
+    if len(normalized_name) < _MIN_NAME_LENGTH and token_count < _MIN_TOKEN_COUNT:
+        return False
+
+    return _name_entropy(normalized_name) >= _NAME_ENTROPY_THRESHOLD
+
+
+def _shingles(normalized_name: str) -> set[str]:
+    """Create 3-gram shingles from the normalized name for MinHash calculations."""
+    cleaned = normalized_name.replace(' ', '')
+    if len(cleaned) < 2:
+        return {cleaned} if cleaned else set()
+
+    return {cleaned[i : i + 3] for i in range(len(cleaned) - 2)}
+
+
+def _hash_shingle(shingle: str, seed: int) -> int:
+    """Generate a deterministic 64-bit hash for a shingle given the permutation seed."""
+    digest = blake2b(f'{seed}:{shingle}'.encode(), digest_size=8)
+    return int.from_bytes(digest.digest(), 'big')
+
+
+def _minhash_signature(shingles: Iterable[str]) -> tuple[int, ...]:
+    """Compute the MinHash signature for the shingle set across predefined permutations."""
+    if not shingles:
+        return tuple()
+
+    seeds = range(_MINHASH_PERMUTATIONS)
+    signature: list[int] = []
+    for seed in seeds:
+        min_hash = min(_hash_shingle(shingle, seed) for shingle in shingles)
+        signature.append(min_hash)
+
+    return tuple(signature)
+
+
+def _lsh_bands(signature: Iterable[int]) -> list[tuple[int, ...]]:
+    """Split the MinHash signature into fixed-size bands for locality-sensitive hashing."""
+    signature_list = list(signature)
+    if not signature_list:
+        return []
+
+    bands: list[tuple[int, ...]] = []
+    for start in range(0, len(signature_list), _MINHASH_BAND_SIZE):
+        band = tuple(signature_list[start : start + _MINHASH_BAND_SIZE])
+        if len(band) == _MINHASH_BAND_SIZE:
+            bands.append(band)
+    return bands
+
+
+def _jaccard_similarity(a: set[str], b: set[str]) -> float:
+    """Return the Jaccard similarity between two shingle sets, handling empty edge cases."""
+    if not a and not b:
+        return 1.0
+    if not a or not b:
+        return 0.0
+
+    intersection = len(a.intersection(b))
+    union = len(a.union(b))
+    return intersection / union if union else 0.0
+
+
+@lru_cache(maxsize=512)
+def _cached_shingles(name: str) -> set[str]:
+    """Cache shingle sets per normalized name to avoid recomputation within a worker."""
+    return _shingles(name)
+
+
+@dataclass
+class DedupCandidateIndexes:
+    """Precomputed lookup structures that drive entity deduplication heuristics."""
+
+    existing_nodes: list[EntityNode]
+    nodes_by_uuid: dict[str, EntityNode]
+    normalized_existing: defaultdict[str, list[EntityNode]]
+    shingles_by_candidate: dict[str, set[str]]
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]]
+
+
+@dataclass
+class DedupResolutionState:
+    """Mutable resolution bookkeeping shared across deterministic and LLM passes."""
+
+    resolved_nodes: list[EntityNode | None]
+    uuid_map: dict[str, str]
+    unresolved_indices: list[int]
+    duplicate_pairs: list[tuple[EntityNode, EntityNode]] = field(default_factory=list)
+
+
+def _build_candidate_indexes(existing_nodes: list[EntityNode]) -> DedupCandidateIndexes:
+    """Precompute exact and fuzzy lookup structures once per dedupe run."""
+    normalized_existing: defaultdict[str, list[EntityNode]] = defaultdict(list)
+    nodes_by_uuid: dict[str, EntityNode] = {}
+    shingles_by_candidate: dict[str, set[str]] = {}
+    lsh_buckets: defaultdict[tuple[int, tuple[int, ...]], list[str]] = defaultdict(list)
+
+    for candidate in existing_nodes:
+        normalized = _normalize_string_exact(candidate.name)
+        normalized_existing[normalized].append(candidate)
+        nodes_by_uuid[candidate.uuid] = candidate
+
+        shingles = _cached_shingles(_normalize_name_for_fuzzy(candidate.name))
+        shingles_by_candidate[candidate.uuid] = shingles
+
+        signature = _minhash_signature(shingles)
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            lsh_buckets[(band_index, band)].append(candidate.uuid)
+
+    return DedupCandidateIndexes(
+        existing_nodes=existing_nodes,
+        nodes_by_uuid=nodes_by_uuid,
+        normalized_existing=normalized_existing,
+        shingles_by_candidate=shingles_by_candidate,
+        lsh_buckets=lsh_buckets,
+    )
+
+
+def _resolve_with_similarity(
+    extracted_nodes: list[EntityNode],
+    indexes: DedupCandidateIndexes,
+    state: DedupResolutionState,
+) -> None:
+    """Attempt deterministic resolution using exact name hits and fuzzy MinHash comparisons."""
+    for idx, node in enumerate(extracted_nodes):
+        normalized_exact = _normalize_string_exact(node.name)
+        normalized_fuzzy = _normalize_name_for_fuzzy(node.name)
+
+        if not _has_high_entropy(normalized_fuzzy):
+            state.unresolved_indices.append(idx)
+            continue
+
+        existing_matches = indexes.normalized_existing.get(normalized_exact, [])
+        if len(existing_matches) == 1:
+            match = existing_matches[0]
+            state.resolved_nodes[idx] = match
+            state.uuid_map[node.uuid] = match.uuid
+            if match.uuid != node.uuid:
+                state.duplicate_pairs.append((node, match))
+            continue
+        if len(existing_matches) > 1:
+            state.unresolved_indices.append(idx)
+            continue
+
+        shingles = _cached_shingles(normalized_fuzzy)
+        signature = _minhash_signature(shingles)
+        candidate_ids: set[str] = set()
+        for band_index, band in enumerate(_lsh_bands(signature)):
+            candidate_ids.update(indexes.lsh_buckets.get((band_index, band), []))
+
+        best_candidate: EntityNode | None = None
+        best_score = 0.0
+        for candidate_id in candidate_ids:
+            candidate_shingles = indexes.shingles_by_candidate.get(candidate_id, set())
+            score = _jaccard_similarity(shingles, candidate_shingles)
+            if score > best_score:
+                best_score = score
+                best_candidate = indexes.nodes_by_uuid.get(candidate_id)
+
+        if best_candidate is not None and best_score >= _FUZZY_JACCARD_THRESHOLD:
+            state.resolved_nodes[idx] = best_candidate
+            state.uuid_map[node.uuid] = best_candidate.uuid
+            if best_candidate.uuid != node.uuid:
+                state.duplicate_pairs.append((node, best_candidate))
+            continue
+
+        state.unresolved_indices.append(idx)
+
+
+__all__ = [
+    'DedupCandidateIndexes',
+    'DedupResolutionState',
+    '_normalize_string_exact',
+    '_normalize_name_for_fuzzy',
+    '_has_high_entropy',
+    '_minhash_signature',
+    '_lsh_bands',
+    '_jaccard_similarity',
+    '_cached_shingles',
+    '_FUZZY_JACCARD_THRESHOLD',
+    '_build_candidate_indexes',
+    '_resolve_with_similarity',
+]
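
Taken together, the helpers implement a two-stage deterministic gate that runs before any LLM dedupe pass: an exact normalized-name lookup first, then MinHash/LSH blocking (32 permutations in 8 bands of 4, so two names share a bucket if any band matches exactly) with a final Jaccard check against the 0.9 threshold. A worked toy example using the module's own entry points; the `EntityNode` values are schematic stand-ins for extraction output, relying on the field defaults in `graphiti_core.nodes`:

```python
from graphiti_core.nodes import EntityNode
from graphiti_core.utils.maintenance.dedup_helpers import (
    DedupResolutionState,
    _build_candidate_indexes,
    _resolve_with_similarity,
)

# Schematic nodes: real ones come from the extraction pipeline, and only
# `name` and `uuid` matter to these helpers.
existing = [EntityNode(name='Alan Turing', group_id='demo', labels=['Entity'])]
extracted = [EntityNode(name='alan turing', group_id='demo', labels=['Entity'])]

indexes = _build_candidate_indexes(existing)
state = DedupResolutionState(
    resolved_nodes=[None] * len(extracted),
    uuid_map={},
    unresolved_indices=[],
)
_resolve_with_similarity(extracted, indexes, state)

# 'alan turing' normalizes to the same exact key as 'Alan Turing' and clears
# the entropy gate (about 2.9 bits, above the 1.5 threshold), so it resolves
# without touching the LLM; anything left in state.unresolved_indices would
# fall through to the LLM pass.
assert state.resolved_nodes[0] is existing[0]
assert state.uuid_map[extracted[0].uuid] == existing[0].uuid
```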