graphiti-core 0.17.4__py3-none-any.whl → 0.25.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- graphiti_core/cross_encoder/gemini_reranker_client.py +1 -1
- graphiti_core/cross_encoder/openai_reranker_client.py +1 -1
- graphiti_core/decorators.py +110 -0
- graphiti_core/driver/driver.py +62 -2
- graphiti_core/driver/falkordb_driver.py +215 -23
- graphiti_core/driver/graph_operations/graph_operations.py +191 -0
- graphiti_core/driver/kuzu_driver.py +182 -0
- graphiti_core/driver/neo4j_driver.py +70 -8
- graphiti_core/driver/neptune_driver.py +305 -0
- graphiti_core/driver/search_interface/search_interface.py +89 -0
- graphiti_core/edges.py +264 -132
- graphiti_core/embedder/azure_openai.py +10 -3
- graphiti_core/embedder/client.py +2 -1
- graphiti_core/graph_queries.py +114 -101
- graphiti_core/graphiti.py +635 -260
- graphiti_core/graphiti_types.py +2 -0
- graphiti_core/helpers.py +37 -15
- graphiti_core/llm_client/anthropic_client.py +142 -52
- graphiti_core/llm_client/azure_openai_client.py +57 -19
- graphiti_core/llm_client/client.py +83 -21
- graphiti_core/llm_client/config.py +1 -1
- graphiti_core/llm_client/gemini_client.py +75 -57
- graphiti_core/llm_client/openai_base_client.py +92 -48
- graphiti_core/llm_client/openai_client.py +39 -9
- graphiti_core/llm_client/openai_generic_client.py +91 -56
- graphiti_core/models/edges/edge_db_queries.py +259 -35
- graphiti_core/models/nodes/node_db_queries.py +311 -32
- graphiti_core/nodes.py +388 -164
- graphiti_core/prompts/dedupe_edges.py +42 -31
- graphiti_core/prompts/dedupe_nodes.py +56 -39
- graphiti_core/prompts/eval.py +4 -4
- graphiti_core/prompts/extract_edges.py +24 -15
- graphiti_core/prompts/extract_nodes.py +76 -35
- graphiti_core/prompts/prompt_helpers.py +39 -0
- graphiti_core/prompts/snippets.py +29 -0
- graphiti_core/prompts/summarize_nodes.py +23 -25
- graphiti_core/search/search.py +154 -74
- graphiti_core/search/search_config.py +39 -4
- graphiti_core/search/search_filters.py +110 -31
- graphiti_core/search/search_helpers.py +5 -6
- graphiti_core/search/search_utils.py +1360 -473
- graphiti_core/tracer.py +193 -0
- graphiti_core/utils/bulk_utils.py +216 -90
- graphiti_core/utils/content_chunking.py +702 -0
- graphiti_core/utils/datetime_utils.py +13 -0
- graphiti_core/utils/maintenance/community_operations.py +62 -38
- graphiti_core/utils/maintenance/dedup_helpers.py +262 -0
- graphiti_core/utils/maintenance/edge_operations.py +306 -156
- graphiti_core/utils/maintenance/graph_data_operations.py +44 -74
- graphiti_core/utils/maintenance/node_operations.py +466 -206
- graphiti_core/utils/maintenance/temporal_operations.py +11 -3
- graphiti_core/utils/ontology_utils/entity_types_utils.py +1 -1
- graphiti_core/utils/text_utils.py +53 -0
- {graphiti_core-0.17.4.dist-info → graphiti_core-0.25.3.dist-info}/METADATA +221 -87
- graphiti_core-0.25.3.dist-info/RECORD +87 -0
- {graphiti_core-0.17.4.dist-info → graphiti_core-0.25.3.dist-info}/WHEEL +1 -1
- graphiti_core-0.17.4.dist-info/RECORD +0 -77
- /graphiti_core/{utils/maintenance/utils.py → migrations/__init__.py} +0 -0
- {graphiti_core-0.17.4.dist-info → graphiti_core-0.25.3.dist-info}/licenses/LICENSE +0 -0
graphiti_core/utils/content_chunking.py
@@ -0,0 +1,702 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import json
+import logging
+import re
+
+from graphiti_core.helpers import (
+    CHUNK_DENSITY_THRESHOLD,
+    CHUNK_MIN_TOKENS,
+    CHUNK_OVERLAP_TOKENS,
+    CHUNK_TOKEN_SIZE,
+)
+from graphiti_core.nodes import EpisodeType
+
+logger = logging.getLogger(__name__)
+
+# Approximate characters per token (conservative estimate)
+CHARS_PER_TOKEN = 4
+
+
+def estimate_tokens(text: str) -> int:
+    """Estimate token count using character-based heuristic.
+
+    Uses ~4 characters per token as a conservative estimate.
+    This is faster than actual tokenization and works across all LLM providers.
+
+    Args:
+        text: The text to estimate tokens for
+
+    Returns:
+        Estimated token count
+    """
+    return len(text) // CHARS_PER_TOKEN
+
+
+def _tokens_to_chars(tokens: int) -> int:
+    """Convert token count to approximate character count."""
+    return tokens * CHARS_PER_TOKEN
+
+
+def should_chunk(content: str, episode_type: EpisodeType) -> bool:
+    """Determine whether content should be chunked based on size and entity density.
+
+    Only chunks content that is both:
+    1. Large enough to potentially cause LLM issues (>= CHUNK_MIN_TOKENS)
+    2. High entity density (many entities per token)
+
+    Short content processes fine regardless of density. This targets the specific
+    failure case of large entity-dense inputs while preserving context for
+    prose/narrative content and avoiding unnecessary chunking of small inputs.
+
+    Args:
+        content: The content to evaluate
+        episode_type: Type of episode (json, message, text)
+
+    Returns:
+        True if content is large and has high entity density
+    """
+    tokens = estimate_tokens(content)
+
+    # Short content always processes fine - no need to chunk
+    if tokens < CHUNK_MIN_TOKENS:
+        return False
+
+    return _estimate_high_density(content, episode_type, tokens)
+
+
+def _estimate_high_density(content: str, episode_type: EpisodeType, tokens: int) -> bool:
+    """Estimate whether content has high entity density.
+
+    High-density content (many entities per token) benefits from chunking.
+    Low-density content (prose, narratives) loses context when chunked.
+
+    Args:
+        content: The content to analyze
+        episode_type: Type of episode
+        tokens: Pre-computed token count
+
+    Returns:
+        True if content appears to have high entity density
+    """
+    if episode_type == EpisodeType.json:
+        return _json_likely_dense(content, tokens)
+    else:
+        return _text_likely_dense(content, tokens)
+
+
+def _json_likely_dense(content: str, tokens: int) -> bool:
+    """Estimate entity density for JSON content.
+
+    JSON is considered dense if it has many array elements or object keys,
+    as each typically represents a distinct entity or data point.
+
+    Heuristics:
+    - Array: Count elements, estimate entities per 1000 tokens
+    - Object: Count top-level keys
+
+    Args:
+        content: JSON string content
+        tokens: Token count
+
+    Returns:
+        True if JSON appears to have high entity density
+    """
+    try:
+        data = json.loads(content)
+    except json.JSONDecodeError:
+        # Invalid JSON, fall back to text heuristics
+        return _text_likely_dense(content, tokens)
+
+    if isinstance(data, list):
+        # For arrays, each element likely contains entities
+        element_count = len(data)
+        # Estimate density: elements per 1000 tokens
+        density = (element_count / tokens) * 1000 if tokens > 0 else 0
+        return density > CHUNK_DENSITY_THRESHOLD * 1000  # Scale threshold
+    elif isinstance(data, dict):
+        # For objects, count keys recursively (shallow)
+        key_count = _count_json_keys(data, max_depth=2)
+        density = (key_count / tokens) * 1000 if tokens > 0 else 0
+        return density > CHUNK_DENSITY_THRESHOLD * 1000
+    else:
+        # Scalar value, no need to chunk
+        return False
+
+
+def _count_json_keys(data: dict, max_depth: int = 2, current_depth: int = 0) -> int:
+    """Count keys in a JSON object up to a certain depth.
+
+    Args:
+        data: Dictionary to count keys in
+        max_depth: Maximum depth to traverse
+        current_depth: Current recursion depth
+
+    Returns:
+        Count of keys
+    """
+    if current_depth >= max_depth:
+        return 0
+
+    count = len(data)
+    for value in data.values():
+        if isinstance(value, dict):
+            count += _count_json_keys(value, max_depth, current_depth + 1)
+        elif isinstance(value, list):
+            for item in value:
+                if isinstance(item, dict):
+                    count += _count_json_keys(item, max_depth, current_depth + 1)
+    return count
+
+
+def _text_likely_dense(content: str, tokens: int) -> bool:
+    """Estimate entity density for text content.
+
+    Uses capitalized words as a proxy for named entities (people, places,
+    organizations, products). High ratio of capitalized words suggests
+    high entity density.
+
+    Args:
+        content: Text content
+        tokens: Token count
+
+    Returns:
+        True if text appears to have high entity density
+    """
+    if tokens == 0:
+        return False
+
+    # Split into words
+    words = content.split()
+    if not words:
+        return False
+
+    # Count capitalized words (excluding sentence starters)
+    # A word is "capitalized" if it starts with uppercase and isn't all caps
+    capitalized_count = 0
+    for i, word in enumerate(words):
+        # Skip if it's likely a sentence starter (after . ! ? or first word)
+        if i == 0:
+            continue
+        if i > 0 and words[i - 1].rstrip()[-1:] in '.!?':
+            continue
+
+        # Check if capitalized (first char upper, not all caps)
+        cleaned = word.strip('.,!?;:\'"()[]{}')
+        if cleaned and cleaned[0].isupper() and not cleaned.isupper():
+            capitalized_count += 1
+
+    # Calculate density: capitalized words per 1000 tokens
+    density = (capitalized_count / tokens) * 1000 if tokens > 0 else 0
+
+    # Text density threshold is typically lower than JSON
+    # A well-written article might have 5-10% named entities
+    return density > CHUNK_DENSITY_THRESHOLD * 500  # Half the JSON threshold
+
+
+def chunk_json_content(
+    content: str,
+    chunk_size_tokens: int | None = None,
+    overlap_tokens: int | None = None,
+) -> list[str]:
+    """Split JSON content into chunks while preserving structure.
+
+    For arrays: splits at element boundaries, keeping complete objects.
+    For objects: splits at top-level key boundaries.
+
+    Args:
+        content: JSON string to chunk
+        chunk_size_tokens: Target size per chunk in tokens (default from env)
+        overlap_tokens: Overlap between chunks in tokens (default from env)
+
+    Returns:
+        List of JSON string chunks
+    """
+    chunk_size_tokens = chunk_size_tokens or CHUNK_TOKEN_SIZE
+    overlap_tokens = overlap_tokens or CHUNK_OVERLAP_TOKENS
+
+    chunk_size_chars = _tokens_to_chars(chunk_size_tokens)
+    overlap_chars = _tokens_to_chars(overlap_tokens)
+
+    try:
+        data = json.loads(content)
+    except json.JSONDecodeError:
+        logger.warning('Failed to parse JSON, falling back to text chunking')
+        return chunk_text_content(content, chunk_size_tokens, overlap_tokens)
+
+    if isinstance(data, list):
+        return _chunk_json_array(data, chunk_size_chars, overlap_chars)
+    elif isinstance(data, dict):
+        return _chunk_json_object(data, chunk_size_chars, overlap_chars)
+    else:
+        # Scalar value, return as-is
+        return [content]
+
+
+def _chunk_json_array(
+    data: list,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Chunk a JSON array by splitting at element boundaries."""
+    if not data:
+        return ['[]']
+
+    chunks: list[str] = []
+    current_elements: list = []
+    current_size = 2  # Account for '[]'
+
+    for element in data:
+        element_json = json.dumps(element)
+        element_size = len(element_json) + 2  # Account for comma and space
+
+        # Check if adding this element would exceed chunk size
+        if current_elements and current_size + element_size > chunk_size_chars:
+            # Save current chunk
+            chunks.append(json.dumps(current_elements))
+
+            # Start new chunk with overlap (include last few elements)
+            overlap_elements = _get_overlap_elements(current_elements, overlap_chars)
+            current_elements = overlap_elements
+            current_size = len(json.dumps(current_elements)) if current_elements else 2
+
+        current_elements.append(element)
+        current_size += element_size
+
+    # Don't forget the last chunk
+    if current_elements:
+        chunks.append(json.dumps(current_elements))
+
+    return chunks if chunks else ['[]']
+
+
+def _get_overlap_elements(elements: list, overlap_chars: int) -> list:
+    """Get elements from the end of a list that fit within overlap_chars."""
+    if not elements:
+        return []
+
+    overlap_elements: list = []
+    current_size = 2  # Account for '[]'
+
+    for element in reversed(elements):
+        element_json = json.dumps(element)
+        element_size = len(element_json) + 2
+
+        if current_size + element_size > overlap_chars:
+            break
+
+        overlap_elements.insert(0, element)
+        current_size += element_size
+
+    return overlap_elements
+
+
+def _chunk_json_object(
+    data: dict,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Chunk a JSON object by splitting at top-level key boundaries."""
+    if not data:
+        return ['{}']
+
+    chunks: list[str] = []
+    current_keys: list[str] = []
+    current_dict: dict = {}
+    current_size = 2  # Account for '{}'
+
+    for key, value in data.items():
+        entry_json = json.dumps({key: value})
+        entry_size = len(entry_json)
+
+        # Check if adding this entry would exceed chunk size
+        if current_dict and current_size + entry_size > chunk_size_chars:
+            # Save current chunk
+            chunks.append(json.dumps(current_dict))
+
+            # Start new chunk with overlap (include last few keys)
+            overlap_dict = _get_overlap_dict(current_dict, current_keys, overlap_chars)
+            current_dict = overlap_dict
+            current_keys = list(overlap_dict.keys())
+            current_size = len(json.dumps(current_dict)) if current_dict else 2
+
+        current_dict[key] = value
+        current_keys.append(key)
+        current_size += entry_size
+
+    # Don't forget the last chunk
+    if current_dict:
+        chunks.append(json.dumps(current_dict))
+
+    return chunks if chunks else ['{}']
+
+
+def _get_overlap_dict(data: dict, keys: list[str], overlap_chars: int) -> dict:
+    """Get key-value pairs from the end of a dict that fit within overlap_chars."""
+    if not data or not keys:
+        return {}
+
+    overlap_dict: dict = {}
+    current_size = 2  # Account for '{}'
+
+    for key in reversed(keys):
+        if key not in data:
+            continue
+        entry_json = json.dumps({key: data[key]})
+        entry_size = len(entry_json)
+
+        if current_size + entry_size > overlap_chars:
+            break
+
+        overlap_dict[key] = data[key]
+        current_size += entry_size
+
+    # Reverse to maintain original order
+    return dict(reversed(list(overlap_dict.items())))
+
+
+def chunk_text_content(
+    content: str,
+    chunk_size_tokens: int | None = None,
+    overlap_tokens: int | None = None,
+) -> list[str]:
+    """Split text content at natural boundaries (paragraphs, sentences).
+
+    Includes overlap to capture entities at chunk boundaries.
+
+    Args:
+        content: Text to chunk
+        chunk_size_tokens: Target size per chunk in tokens (default from env)
+        overlap_tokens: Overlap between chunks in tokens (default from env)
+
+    Returns:
+        List of text chunks
+    """
+    chunk_size_tokens = chunk_size_tokens or CHUNK_TOKEN_SIZE
+    overlap_tokens = overlap_tokens or CHUNK_OVERLAP_TOKENS
+
+    chunk_size_chars = _tokens_to_chars(chunk_size_tokens)
+    overlap_chars = _tokens_to_chars(overlap_tokens)
+
+    if len(content) <= chunk_size_chars:
+        return [content]
+
+    # Split into paragraphs first
+    paragraphs = re.split(r'\n\s*\n', content)
+
+    chunks: list[str] = []
+    current_chunk: list[str] = []
+    current_size = 0
+
+    for paragraph in paragraphs:
+        paragraph = paragraph.strip()
+        if not paragraph:
+            continue
+
+        para_size = len(paragraph)
+
+        # If a single paragraph is too large, split it by sentences
+        if para_size > chunk_size_chars:
+            # First, save current chunk if any
+            if current_chunk:
+                chunks.append('\n\n'.join(current_chunk))
+                current_chunk = []
+                current_size = 0
+
+            # Split large paragraph by sentences
+            sentence_chunks = _chunk_by_sentences(paragraph, chunk_size_chars, overlap_chars)
+            chunks.extend(sentence_chunks)
+            continue
+
+        # Check if adding this paragraph would exceed chunk size
+        if current_chunk and current_size + para_size + 2 > chunk_size_chars:
+            # Save current chunk
+            chunks.append('\n\n'.join(current_chunk))
+
+            # Start new chunk with overlap
+            overlap_text = _get_overlap_text('\n\n'.join(current_chunk), overlap_chars)
+            if overlap_text:
+                current_chunk = [overlap_text]
+                current_size = len(overlap_text)
+            else:
+                current_chunk = []
+                current_size = 0
+
+        current_chunk.append(paragraph)
+        current_size += para_size + 2  # Account for '\n\n'
+
+    # Don't forget the last chunk
+    if current_chunk:
+        chunks.append('\n\n'.join(current_chunk))
+
+    return chunks if chunks else [content]
+
+
+def _chunk_by_sentences(
+    text: str,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Split text by sentence boundaries."""
+    # Split on sentence-ending punctuation followed by whitespace
+    sentence_pattern = r'(?<=[.!?])\s+'
+    sentences = re.split(sentence_pattern, text)
+
+    chunks: list[str] = []
+    current_chunk: list[str] = []
+    current_size = 0
+
+    for sentence in sentences:
+        sentence = sentence.strip()
+        if not sentence:
+            continue
+
+        sent_size = len(sentence)
+
+        # If a single sentence is too large, split it by fixed size
+        if sent_size > chunk_size_chars:
+            if current_chunk:
+                chunks.append(' '.join(current_chunk))
+                current_chunk = []
+                current_size = 0
+
+            # Split by fixed size as last resort
+            fixed_chunks = _chunk_by_size(sentence, chunk_size_chars, overlap_chars)
+            chunks.extend(fixed_chunks)
+            continue
+
+        # Check if adding this sentence would exceed chunk size
+        if current_chunk and current_size + sent_size + 1 > chunk_size_chars:
+            chunks.append(' '.join(current_chunk))
+
+            # Start new chunk with overlap
+            overlap_text = _get_overlap_text(' '.join(current_chunk), overlap_chars)
+            if overlap_text:
+                current_chunk = [overlap_text]
+                current_size = len(overlap_text)
+            else:
+                current_chunk = []
+                current_size = 0
+
+        current_chunk.append(sentence)
+        current_size += sent_size + 1
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    return chunks
+
+
+def _chunk_by_size(
+    text: str,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Split text by fixed character size (last resort)."""
+    chunks: list[str] = []
+    start = 0
+
+    while start < len(text):
+        end = min(start + chunk_size_chars, len(text))
+
+        # Try to break at word boundary
+        if end < len(text):
+            space_idx = text.rfind(' ', start, end)
+            if space_idx > start:
+                end = space_idx
+
+        chunks.append(text[start:end].strip())
+
+        # Move start forward, ensuring progress even if overlap >= chunk_size
+        # Always advance by at least (chunk_size - overlap) or 1 char minimum
+        min_progress = max(1, chunk_size_chars - overlap_chars)
+        start = max(start + min_progress, end - overlap_chars)
+
+    return chunks
+
+
+def _get_overlap_text(text: str, overlap_chars: int) -> str:
+    """Get the last overlap_chars characters of text, breaking at word boundary."""
+    if len(text) <= overlap_chars:
+        return text
+
+    overlap_start = len(text) - overlap_chars
+    # Find the next word boundary after overlap_start
+    space_idx = text.find(' ', overlap_start)
+    if space_idx != -1:
+        return text[space_idx + 1 :]
+    return text[overlap_start:]
+
+
+def chunk_message_content(
+    content: str,
+    chunk_size_tokens: int | None = None,
+    overlap_tokens: int | None = None,
+) -> list[str]:
+    """Split conversation content preserving message boundaries.
+
+    Never splits mid-message. Messages are identified by patterns like:
+    - "Speaker: message"
+    - JSON message arrays
+    - Newline-separated messages
+
+    Args:
+        content: Conversation content to chunk
+        chunk_size_tokens: Target size per chunk in tokens (default from env)
+        overlap_tokens: Overlap between chunks in tokens (default from env)
+
+    Returns:
+        List of conversation chunks
+    """
+    chunk_size_tokens = chunk_size_tokens or CHUNK_TOKEN_SIZE
+    overlap_tokens = overlap_tokens or CHUNK_OVERLAP_TOKENS
+
+    chunk_size_chars = _tokens_to_chars(chunk_size_tokens)
+    overlap_chars = _tokens_to_chars(overlap_tokens)
+
+    if len(content) <= chunk_size_chars:
+        return [content]
+
+    # Try to detect message format
+    # Check if it's JSON (array of message objects)
+    try:
+        data = json.loads(content)
+        if isinstance(data, list):
+            return _chunk_message_array(data, chunk_size_chars, overlap_chars)
+    except json.JSONDecodeError:
+        pass
+
+    # Try speaker pattern (e.g., "Alice: Hello")
+    speaker_pattern = r'^([A-Za-z_][A-Za-z0-9_\s]*):(.+?)(?=^[A-Za-z_][A-Za-z0-9_\s]*:|$)'
+    if re.search(speaker_pattern, content, re.MULTILINE | re.DOTALL):
+        return _chunk_speaker_messages(content, chunk_size_chars, overlap_chars)
+
+    # Fallback to line-based chunking
+    return _chunk_by_lines(content, chunk_size_chars, overlap_chars)
+
+
+def _chunk_message_array(
+    messages: list,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Chunk a JSON array of message objects."""
+    # Delegate to JSON array chunking
+    chunks = _chunk_json_array(messages, chunk_size_chars, overlap_chars)
+    return chunks
+
+
+def _chunk_speaker_messages(
+    content: str,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Chunk messages in 'Speaker: message' format."""
+    # Split on speaker patterns
+    pattern = r'(?=^[A-Za-z_][A-Za-z0-9_\s]*:)'
+    messages = re.split(pattern, content, flags=re.MULTILINE)
+    messages = [m.strip() for m in messages if m.strip()]
+
+    if not messages:
+        return [content]
+
+    chunks: list[str] = []
+    current_messages: list[str] = []
+    current_size = 0
+
+    for message in messages:
+        msg_size = len(message)
+
+        # If a single message is too large, include it as its own chunk
+        if msg_size > chunk_size_chars:
+            if current_messages:
+                chunks.append('\n'.join(current_messages))
+                current_messages = []
+                current_size = 0
+            chunks.append(message)
+            continue
+
+        if current_messages and current_size + msg_size + 1 > chunk_size_chars:
+            chunks.append('\n'.join(current_messages))
+
+            # Get overlap (last message(s) that fit)
+            overlap_messages = _get_overlap_messages(current_messages, overlap_chars)
+            current_messages = overlap_messages
+            current_size = sum(len(m) for m in current_messages) + len(current_messages) - 1
+
+        current_messages.append(message)
+        current_size += msg_size + 1
+
+    if current_messages:
+        chunks.append('\n'.join(current_messages))
+
+    return chunks if chunks else [content]
+
+
+def _get_overlap_messages(messages: list[str], overlap_chars: int) -> list[str]:
+    """Get messages from the end that fit within overlap_chars."""
+    if not messages:
+        return []
+
+    overlap: list[str] = []
+    current_size = 0
+
+    for msg in reversed(messages):
+        msg_size = len(msg) + 1
+        if current_size + msg_size > overlap_chars:
+            break
+        overlap.insert(0, msg)
+        current_size += msg_size
+
+    return overlap
+
+
+def _chunk_by_lines(
+    content: str,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Chunk content by line boundaries."""
+    lines = content.split('\n')
+
+    chunks: list[str] = []
+    current_lines: list[str] = []
+    current_size = 0
+
+    for line in lines:
+        line_size = len(line) + 1
+
+        if current_lines and current_size + line_size > chunk_size_chars:
+            chunks.append('\n'.join(current_lines))
+
+            # Get overlap lines
+            overlap_text = '\n'.join(current_lines)
+            overlap = _get_overlap_text(overlap_text, overlap_chars)
+            if overlap:
+                current_lines = overlap.split('\n')
+                current_size = len(overlap)
+            else:
+                current_lines = []
+                current_size = 0
+
+        current_lines.append(line)
+        current_size += line_size
+
+    if current_lines:
+        chunks.append('\n'.join(current_lines))
+
+    return chunks if chunks else [content]
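
For orientation, a minimal usage sketch (not part of the diff) of the public helpers added in graphiti_core/utils/content_chunking.py. It assumes graphiti-core 0.25.3 is installed and that EpisodeType exposes the text and json members used by the module above; the sample data and printed labels are invented for illustration.

# Hypothetical usage sketch for the new chunking helpers; not part of the released diff.
import json

from graphiti_core.nodes import EpisodeType
from graphiti_core.utils.content_chunking import (
    chunk_json_content,
    chunk_text_content,
    estimate_tokens,
    should_chunk,
)

# A large, entity-dense narrative (invented sample data).
article = 'Alice met Bob at Acme Corp in Paris to discuss the Zephyr project. ' * 500

print('estimated tokens:', estimate_tokens(article))  # roughly len(article) // 4
if should_chunk(article, EpisodeType.text):  # only large AND dense content is chunked
    text_chunks = chunk_text_content(article)  # paragraph/sentence-aware splits with overlap
    print('text chunks:', len(text_chunks))

# A large JSON array; chunking keeps array elements intact.
records = json.dumps([{'name': f'Device {i}', 'site': 'Berlin'} for i in range(2000)])
if should_chunk(records, EpisodeType.json):
    json_chunks = chunk_json_content(records)  # splits at element boundaries
    print('json chunks:', len(json_chunks))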
graphiti_core/utils/datetime_utils.py
@@ -40,3 +40,16 @@ def ensure_utc(dt: datetime | None) -> datetime | None:
         return dt.astimezone(timezone.utc)
 
     return dt
+
+
+def convert_datetimes_to_strings(obj):
+    if isinstance(obj, dict):
+        return {k: convert_datetimes_to_strings(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_datetimes_to_strings(item) for item in obj]
+    elif isinstance(obj, tuple):
+        return tuple(convert_datetimes_to_strings(item) for item in obj)
+    elif isinstance(obj, datetime):
+        return obj.isoformat()
+    else:
+        return obj
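
Likewise, a small hedged sketch of the convert_datetimes_to_strings helper added above; the payload is invented, and the only assumption is that graphiti-core 0.25.3 is installed.

# Illustrative only: recursively serializes datetimes inside nested containers.
from datetime import datetime, timezone

from graphiti_core.utils.datetime_utils import convert_datetimes_to_strings

payload = {
    'created_at': datetime(2024, 1, 2, 3, 4, 5, tzinfo=timezone.utc),
    'edges': [{'valid_at': datetime(2024, 6, 1, tzinfo=timezone.utc), 'weight': 0.7}],
    'window': (datetime(2024, 1, 1, tzinfo=timezone.utc), None),
}

print(convert_datetimes_to_strings(payload))
# Each datetime becomes an ISO-8601 string (e.g. '2024-01-02T03:04:05+00:00');
# dict/list/tuple structure and non-datetime values pass through unchanged.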