graphiti-core 0.17.4__py3-none-any.whl → 0.25.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.
Files changed (59)
  1. graphiti_core/cross_encoder/gemini_reranker_client.py +1 -1
  2. graphiti_core/cross_encoder/openai_reranker_client.py +1 -1
  3. graphiti_core/decorators.py +110 -0
  4. graphiti_core/driver/driver.py +62 -2
  5. graphiti_core/driver/falkordb_driver.py +215 -23
  6. graphiti_core/driver/graph_operations/graph_operations.py +191 -0
  7. graphiti_core/driver/kuzu_driver.py +182 -0
  8. graphiti_core/driver/neo4j_driver.py +70 -8
  9. graphiti_core/driver/neptune_driver.py +305 -0
  10. graphiti_core/driver/search_interface/search_interface.py +89 -0
  11. graphiti_core/edges.py +264 -132
  12. graphiti_core/embedder/azure_openai.py +10 -3
  13. graphiti_core/embedder/client.py +2 -1
  14. graphiti_core/graph_queries.py +114 -101
  15. graphiti_core/graphiti.py +635 -260
  16. graphiti_core/graphiti_types.py +2 -0
  17. graphiti_core/helpers.py +37 -15
  18. graphiti_core/llm_client/anthropic_client.py +142 -52
  19. graphiti_core/llm_client/azure_openai_client.py +57 -19
  20. graphiti_core/llm_client/client.py +83 -21
  21. graphiti_core/llm_client/config.py +1 -1
  22. graphiti_core/llm_client/gemini_client.py +75 -57
  23. graphiti_core/llm_client/openai_base_client.py +92 -48
  24. graphiti_core/llm_client/openai_client.py +39 -9
  25. graphiti_core/llm_client/openai_generic_client.py +91 -56
  26. graphiti_core/models/edges/edge_db_queries.py +259 -35
  27. graphiti_core/models/nodes/node_db_queries.py +311 -32
  28. graphiti_core/nodes.py +388 -164
  29. graphiti_core/prompts/dedupe_edges.py +42 -31
  30. graphiti_core/prompts/dedupe_nodes.py +56 -39
  31. graphiti_core/prompts/eval.py +4 -4
  32. graphiti_core/prompts/extract_edges.py +24 -15
  33. graphiti_core/prompts/extract_nodes.py +76 -35
  34. graphiti_core/prompts/prompt_helpers.py +39 -0
  35. graphiti_core/prompts/snippets.py +29 -0
  36. graphiti_core/prompts/summarize_nodes.py +23 -25
  37. graphiti_core/search/search.py +154 -74
  38. graphiti_core/search/search_config.py +39 -4
  39. graphiti_core/search/search_filters.py +110 -31
  40. graphiti_core/search/search_helpers.py +5 -6
  41. graphiti_core/search/search_utils.py +1360 -473
  42. graphiti_core/tracer.py +193 -0
  43. graphiti_core/utils/bulk_utils.py +216 -90
  44. graphiti_core/utils/content_chunking.py +702 -0
  45. graphiti_core/utils/datetime_utils.py +13 -0
  46. graphiti_core/utils/maintenance/community_operations.py +62 -38
  47. graphiti_core/utils/maintenance/dedup_helpers.py +262 -0
  48. graphiti_core/utils/maintenance/edge_operations.py +306 -156
  49. graphiti_core/utils/maintenance/graph_data_operations.py +44 -74
  50. graphiti_core/utils/maintenance/node_operations.py +466 -206
  51. graphiti_core/utils/maintenance/temporal_operations.py +11 -3
  52. graphiti_core/utils/ontology_utils/entity_types_utils.py +1 -1
  53. graphiti_core/utils/text_utils.py +53 -0
  54. {graphiti_core-0.17.4.dist-info → graphiti_core-0.25.3.dist-info}/METADATA +221 -87
  55. graphiti_core-0.25.3.dist-info/RECORD +87 -0
  56. {graphiti_core-0.17.4.dist-info → graphiti_core-0.25.3.dist-info}/WHEEL +1 -1
  57. graphiti_core-0.17.4.dist-info/RECORD +0 -77
  58. /graphiti_core/{utils/maintenance/utils.py → migrations/__init__.py} +0 -0
  59. {graphiti_core-0.17.4.dist-info → graphiti_core-0.25.3.dist-info}/licenses/LICENSE +0 -0
graphiti_core/utils/content_chunking.py (new file)
@@ -0,0 +1,702 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import json
+import logging
+import re
+
+from graphiti_core.helpers import (
+    CHUNK_DENSITY_THRESHOLD,
+    CHUNK_MIN_TOKENS,
+    CHUNK_OVERLAP_TOKENS,
+    CHUNK_TOKEN_SIZE,
+)
+from graphiti_core.nodes import EpisodeType
+
+logger = logging.getLogger(__name__)
+
+# Approximate characters per token (conservative estimate)
+CHARS_PER_TOKEN = 4
+
+
+def estimate_tokens(text: str) -> int:
+    """Estimate token count using character-based heuristic.
+
+    Uses ~4 characters per token as a conservative estimate.
+    This is faster than actual tokenization and works across all LLM providers.
+
+    Args:
+        text: The text to estimate tokens for
+
+    Returns:
+        Estimated token count
+    """
+    return len(text) // CHARS_PER_TOKEN
+
+
+def _tokens_to_chars(tokens: int) -> int:
+    """Convert token count to approximate character count."""
+    return tokens * CHARS_PER_TOKEN
+
+
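A quick sanity check on the heuristic (editorial sketch, not part of the package; the import path is the module added in this diff):

    from graphiti_core.utils.content_chunking import estimate_tokens

    text = 'x' * 2000                      # 2,000 characters
    assert estimate_tokens(text) == 500    # 2000 // 4 chars-per-token
    # _tokens_to_chars(500) maps back to 2,000 characters, so the two helpers
    # round-trip on multiples of CHARS_PER_TOKEN.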
+def should_chunk(content: str, episode_type: EpisodeType) -> bool:
+    """Determine whether content should be chunked based on size and entity density.
+
+    Only chunks content that is both:
+    1. Large enough to potentially cause LLM issues (>= CHUNK_MIN_TOKENS)
+    2. High entity density (many entities per token)
+
+    Short content processes fine regardless of density. This targets the specific
+    failure case of large entity-dense inputs while preserving context for
+    prose/narrative content and avoiding unnecessary chunking of small inputs.
+
+    Args:
+        content: The content to evaluate
+        episode_type: Type of episode (json, message, text)
+
+    Returns:
+        True if content is large and has high entity density
+    """
+    tokens = estimate_tokens(content)
+
+    # Short content always processes fine - no need to chunk
+    if tokens < CHUNK_MIN_TOKENS:
+        return False
+
+    return _estimate_high_density(content, episode_type, tokens)
+
+
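How a caller might gate chunking on this check is sketched below; the split_episode helper and its dispatch are illustrative assumptions, not code from this release:

    from graphiti_core.nodes import EpisodeType
    from graphiti_core.utils.content_chunking import (
        chunk_json_content,
        chunk_message_content,
        chunk_text_content,
        should_chunk,
    )

    def split_episode(content: str, episode_type: EpisodeType) -> list[str]:
        # Hypothetical wrapper: only large, entity-dense episodes get chunked.
        if not should_chunk(content, episode_type):
            return [content]
        if episode_type == EpisodeType.json:
            return chunk_json_content(content)
        if episode_type == EpisodeType.message:
            return chunk_message_content(content)
        return chunk_text_content(content)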
+def _estimate_high_density(content: str, episode_type: EpisodeType, tokens: int) -> bool:
+    """Estimate whether content has high entity density.
+
+    High-density content (many entities per token) benefits from chunking.
+    Low-density content (prose, narratives) loses context when chunked.
+
+    Args:
+        content: The content to analyze
+        episode_type: Type of episode
+        tokens: Pre-computed token count
+
+    Returns:
+        True if content appears to have high entity density
+    """
+    if episode_type == EpisodeType.json:
+        return _json_likely_dense(content, tokens)
+    else:
+        return _text_likely_dense(content, tokens)
+
+
+def _json_likely_dense(content: str, tokens: int) -> bool:
+    """Estimate entity density for JSON content.
+
+    JSON is considered dense if it has many array elements or object keys,
+    as each typically represents a distinct entity or data point.
+
+    Heuristics:
+    - Array: Count elements, estimate entities per 1000 tokens
+    - Object: Count top-level keys
+
+    Args:
+        content: JSON string content
+        tokens: Token count
+
+    Returns:
+        True if JSON appears to have high entity density
+    """
+    try:
+        data = json.loads(content)
+    except json.JSONDecodeError:
+        # Invalid JSON, fall back to text heuristics
+        return _text_likely_dense(content, tokens)
+
+    if isinstance(data, list):
+        # For arrays, each element likely contains entities
+        element_count = len(data)
+        # Estimate density: elements per 1000 tokens
+        density = (element_count / tokens) * 1000 if tokens > 0 else 0
+        return density > CHUNK_DENSITY_THRESHOLD * 1000  # Scale threshold
+    elif isinstance(data, dict):
+        # For objects, count keys recursively (shallow)
+        key_count = _count_json_keys(data, max_depth=2)
+        density = (key_count / tokens) * 1000 if tokens > 0 else 0
+        return density > CHUNK_DENSITY_THRESHOLD * 1000
+    else:
+        # Scalar value, no need to chunk
+        return False
+
+
+def _count_json_keys(data: dict, max_depth: int = 2, current_depth: int = 0) -> int:
+    """Count keys in a JSON object up to a certain depth.
+
+    Args:
+        data: Dictionary to count keys in
+        max_depth: Maximum depth to traverse
+        current_depth: Current recursion depth
+
+    Returns:
+        Count of keys
+    """
+    if current_depth >= max_depth:
+        return 0
+
+    count = len(data)
+    for value in data.values():
+        if isinstance(value, dict):
+            count += _count_json_keys(value, max_depth, current_depth + 1)
+        elif isinstance(value, list):
+            for item in value:
+                if isinstance(item, dict):
+                    count += _count_json_keys(item, max_depth, current_depth + 1)
+    return count
+
+
+def _text_likely_dense(content: str, tokens: int) -> bool:
+    """Estimate entity density for text content.
+
+    Uses capitalized words as a proxy for named entities (people, places,
+    organizations, products). High ratio of capitalized words suggests
+    high entity density.
+
+    Args:
+        content: Text content
+        tokens: Token count
+
+    Returns:
+        True if text appears to have high entity density
+    """
+    if tokens == 0:
+        return False
+
+    # Split into words
+    words = content.split()
+    if not words:
+        return False
+
+    # Count capitalized words (excluding sentence starters)
+    # A word is "capitalized" if it starts with uppercase and isn't all caps
+    capitalized_count = 0
+    for i, word in enumerate(words):
+        # Skip if it's likely a sentence starter (after . ! ? or first word)
+        if i == 0:
+            continue
+        if i > 0 and words[i - 1].rstrip()[-1:] in '.!?':
+            continue
+
+        # Check if capitalized (first char upper, not all caps)
+        cleaned = word.strip('.,!?;:\'"()[]{}')
+        if cleaned and cleaned[0].isupper() and not cleaned.isupper():
+            capitalized_count += 1
+
+    # Calculate density: capitalized words per 1000 tokens
+    density = (capitalized_count / tokens) * 1000 if tokens > 0 else 0
+
+    # Text density threshold is typically lower than JSON
+    # A well-written article might have 5-10% named entities
+    return density > CHUNK_DENSITY_THRESHOLD * 500  # Half the JSON threshold
+
+
+def chunk_json_content(
+    content: str,
+    chunk_size_tokens: int | None = None,
+    overlap_tokens: int | None = None,
+) -> list[str]:
+    """Split JSON content into chunks while preserving structure.
+
+    For arrays: splits at element boundaries, keeping complete objects.
+    For objects: splits at top-level key boundaries.
+
+    Args:
+        content: JSON string to chunk
+        chunk_size_tokens: Target size per chunk in tokens (default from env)
+        overlap_tokens: Overlap between chunks in tokens (default from env)
+
+    Returns:
+        List of JSON string chunks
+    """
+    chunk_size_tokens = chunk_size_tokens or CHUNK_TOKEN_SIZE
+    overlap_tokens = overlap_tokens or CHUNK_OVERLAP_TOKENS
+
+    chunk_size_chars = _tokens_to_chars(chunk_size_tokens)
+    overlap_chars = _tokens_to_chars(overlap_tokens)
+
+    try:
+        data = json.loads(content)
+    except json.JSONDecodeError:
+        logger.warning('Failed to parse JSON, falling back to text chunking')
+        return chunk_text_content(content, chunk_size_tokens, overlap_tokens)
+
+    if isinstance(data, list):
+        return _chunk_json_array(data, chunk_size_chars, overlap_chars)
+    elif isinstance(data, dict):
+        return _chunk_json_object(data, chunk_size_chars, overlap_chars)
+    else:
+        # Scalar value, return as-is
+        return [content]
+
+
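A usage sketch for the JSON path (editorial example; record shapes and sizes are invented). Because splitting happens at element boundaries, every chunk is itself valid JSON:

    import json

    from graphiti_core.utils.content_chunking import chunk_json_content

    records = [{'id': i, 'name': f'item-{i}'} for i in range(5000)]
    chunks = chunk_json_content(json.dumps(records), chunk_size_tokens=1000)

    # Each chunk parses back into a list of complete records.
    assert all(isinstance(json.loads(chunk), list) for chunk in chunks)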
+def _chunk_json_array(
+    data: list,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Chunk a JSON array by splitting at element boundaries."""
+    if not data:
+        return ['[]']
+
+    chunks: list[str] = []
+    current_elements: list = []
+    current_size = 2  # Account for '[]'
+
+    for element in data:
+        element_json = json.dumps(element)
+        element_size = len(element_json) + 2  # Account for comma and space
+
+        # Check if adding this element would exceed chunk size
+        if current_elements and current_size + element_size > chunk_size_chars:
+            # Save current chunk
+            chunks.append(json.dumps(current_elements))
+
+            # Start new chunk with overlap (include last few elements)
+            overlap_elements = _get_overlap_elements(current_elements, overlap_chars)
+            current_elements = overlap_elements
+            current_size = len(json.dumps(current_elements)) if current_elements else 2
+
+        current_elements.append(element)
+        current_size += element_size
+
+    # Don't forget the last chunk
+    if current_elements:
+        chunks.append(json.dumps(current_elements))
+
+    return chunks if chunks else ['[]']
+
+
+def _get_overlap_elements(elements: list, overlap_chars: int) -> list:
+    """Get elements from the end of a list that fit within overlap_chars."""
+    if not elements:
+        return []
+
+    overlap_elements: list = []
+    current_size = 2  # Account for '[]'
+
+    for element in reversed(elements):
+        element_json = json.dumps(element)
+        element_size = len(element_json) + 2
+
+        if current_size + element_size > overlap_chars:
+            break
+
+        overlap_elements.insert(0, element)
+        current_size += element_size
+
+    return overlap_elements
+
+
+def _chunk_json_object(
+    data: dict,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Chunk a JSON object by splitting at top-level key boundaries."""
+    if not data:
+        return ['{}']
+
+    chunks: list[str] = []
+    current_keys: list[str] = []
+    current_dict: dict = {}
+    current_size = 2  # Account for '{}'
+
+    for key, value in data.items():
+        entry_json = json.dumps({key: value})
+        entry_size = len(entry_json)
+
+        # Check if adding this entry would exceed chunk size
+        if current_dict and current_size + entry_size > chunk_size_chars:
+            # Save current chunk
+            chunks.append(json.dumps(current_dict))
+
+            # Start new chunk with overlap (include last few keys)
+            overlap_dict = _get_overlap_dict(current_dict, current_keys, overlap_chars)
+            current_dict = overlap_dict
+            current_keys = list(overlap_dict.keys())
+            current_size = len(json.dumps(current_dict)) if current_dict else 2
+
+        current_dict[key] = value
+        current_keys.append(key)
+        current_size += entry_size
+
+    # Don't forget the last chunk
+    if current_dict:
+        chunks.append(json.dumps(current_dict))
+
+    return chunks if chunks else ['{}']
+
+
+def _get_overlap_dict(data: dict, keys: list[str], overlap_chars: int) -> dict:
+    """Get key-value pairs from the end of a dict that fit within overlap_chars."""
+    if not data or not keys:
+        return {}
+
+    overlap_dict: dict = {}
+    current_size = 2  # Account for '{}'
+
+    for key in reversed(keys):
+        if key not in data:
+            continue
+        entry_json = json.dumps({key: data[key]})
+        entry_size = len(entry_json)
+
+        if current_size + entry_size > overlap_chars:
+            break
+
+        overlap_dict[key] = data[key]
+        current_size += entry_size
+
+    # Reverse to maintain original order
+    return dict(reversed(list(overlap_dict.items())))
+
+
+def chunk_text_content(
+    content: str,
+    chunk_size_tokens: int | None = None,
+    overlap_tokens: int | None = None,
+) -> list[str]:
+    """Split text content at natural boundaries (paragraphs, sentences).
+
+    Includes overlap to capture entities at chunk boundaries.
+
+    Args:
+        content: Text to chunk
+        chunk_size_tokens: Target size per chunk in tokens (default from env)
+        overlap_tokens: Overlap between chunks in tokens (default from env)
+
+    Returns:
+        List of text chunks
+    """
+    chunk_size_tokens = chunk_size_tokens or CHUNK_TOKEN_SIZE
+    overlap_tokens = overlap_tokens or CHUNK_OVERLAP_TOKENS
+
+    chunk_size_chars = _tokens_to_chars(chunk_size_tokens)
+    overlap_chars = _tokens_to_chars(overlap_tokens)
+
+    if len(content) <= chunk_size_chars:
+        return [content]
+
+    # Split into paragraphs first
+    paragraphs = re.split(r'\n\s*\n', content)
+
+    chunks: list[str] = []
+    current_chunk: list[str] = []
+    current_size = 0
+
+    for paragraph in paragraphs:
+        paragraph = paragraph.strip()
+        if not paragraph:
+            continue
+
+        para_size = len(paragraph)
+
+        # If a single paragraph is too large, split it by sentences
+        if para_size > chunk_size_chars:
+            # First, save current chunk if any
+            if current_chunk:
+                chunks.append('\n\n'.join(current_chunk))
+                current_chunk = []
+                current_size = 0
+
+            # Split large paragraph by sentences
+            sentence_chunks = _chunk_by_sentences(paragraph, chunk_size_chars, overlap_chars)
+            chunks.extend(sentence_chunks)
+            continue
+
+        # Check if adding this paragraph would exceed chunk size
+        if current_chunk and current_size + para_size + 2 > chunk_size_chars:
+            # Save current chunk
+            chunks.append('\n\n'.join(current_chunk))
+
+            # Start new chunk with overlap
+            overlap_text = _get_overlap_text('\n\n'.join(current_chunk), overlap_chars)
+            if overlap_text:
+                current_chunk = [overlap_text]
+                current_size = len(overlap_text)
+            else:
+                current_chunk = []
+                current_size = 0
+
+        current_chunk.append(paragraph)
+        current_size += para_size + 2  # Account for '\n\n'
+
+    # Don't forget the last chunk
+    if current_chunk:
+        chunks.append('\n\n'.join(current_chunk))
+
+    return chunks if chunks else [content]
+
+
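A sketch for the plain-text path (again an editorial example with made-up sizes); because of the overlap, consecutive chunks may share a trailing sentence or two:

    from graphiti_core.utils.content_chunking import chunk_text_content

    # 300 short paragraphs, far larger than a single chunk.
    document = '\n\n'.join(f'Paragraph {i} talks about Acme Corp.' for i in range(300))
    chunks = chunk_text_content(document, chunk_size_tokens=500, overlap_tokens=50)

    assert len(chunks) > 1
    # Each chunk stays within chunk_size_tokens * CHARS_PER_TOKEN characters.
    assert all(len(chunk) <= 500 * 4 for chunk in chunks)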
+def _chunk_by_sentences(
+    text: str,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Split text by sentence boundaries."""
+    # Split on sentence-ending punctuation followed by whitespace
+    sentence_pattern = r'(?<=[.!?])\s+'
+    sentences = re.split(sentence_pattern, text)
+
+    chunks: list[str] = []
+    current_chunk: list[str] = []
+    current_size = 0
+
+    for sentence in sentences:
+        sentence = sentence.strip()
+        if not sentence:
+            continue
+
+        sent_size = len(sentence)
+
+        # If a single sentence is too large, split it by fixed size
+        if sent_size > chunk_size_chars:
+            if current_chunk:
+                chunks.append(' '.join(current_chunk))
+                current_chunk = []
+                current_size = 0
+
+            # Split by fixed size as last resort
+            fixed_chunks = _chunk_by_size(sentence, chunk_size_chars, overlap_chars)
+            chunks.extend(fixed_chunks)
+            continue
+
+        # Check if adding this sentence would exceed chunk size
+        if current_chunk and current_size + sent_size + 1 > chunk_size_chars:
+            chunks.append(' '.join(current_chunk))
+
+            # Start new chunk with overlap
+            overlap_text = _get_overlap_text(' '.join(current_chunk), overlap_chars)
+            if overlap_text:
+                current_chunk = [overlap_text]
+                current_size = len(overlap_text)
+            else:
+                current_chunk = []
+                current_size = 0
+
+        current_chunk.append(sentence)
+        current_size += sent_size + 1
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    return chunks
+
+
+def _chunk_by_size(
+    text: str,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Split text by fixed character size (last resort)."""
+    chunks: list[str] = []
+    start = 0
+
+    while start < len(text):
+        end = min(start + chunk_size_chars, len(text))
+
+        # Try to break at word boundary
+        if end < len(text):
+            space_idx = text.rfind(' ', start, end)
+            if space_idx > start:
+                end = space_idx
+
+        chunks.append(text[start:end].strip())
+
+        # Move start forward, ensuring progress even if overlap >= chunk_size
+        # Always advance by at least (chunk_size - overlap) or 1 char minimum
+        min_progress = max(1, chunk_size_chars - overlap_chars)
+        start = max(start + min_progress, end - overlap_chars)
+
+    return chunks
+
+
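Worked numbers for the progress guard above (illustrative arithmetic only, not package code):

    chunk_size_chars, overlap_chars = 2000, 200
    min_progress = max(1, chunk_size_chars - overlap_chars)   # 1800
    # start jumps to max(start + 1800, end - 200): roughly 200 chars of overlap per chunk.

    chunk_size_chars, overlap_chars = 2000, 2500               # misconfigured overlap
    min_progress = max(1, chunk_size_chars - overlap_chars)    # 1, so the loop still terminates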
+def _get_overlap_text(text: str, overlap_chars: int) -> str:
+    """Get the last overlap_chars characters of text, breaking at word boundary."""
+    if len(text) <= overlap_chars:
+        return text
+
+    overlap_start = len(text) - overlap_chars
+    # Find the next word boundary after overlap_start
+    space_idx = text.find(' ', overlap_start)
+    if space_idx != -1:
+        return text[space_idx + 1 :]
+    return text[overlap_start:]
+
+
+def chunk_message_content(
+    content: str,
+    chunk_size_tokens: int | None = None,
+    overlap_tokens: int | None = None,
+) -> list[str]:
+    """Split conversation content preserving message boundaries.
+
+    Never splits mid-message. Messages are identified by patterns like:
+    - "Speaker: message"
+    - JSON message arrays
+    - Newline-separated messages
+
+    Args:
+        content: Conversation content to chunk
+        chunk_size_tokens: Target size per chunk in tokens (default from env)
+        overlap_tokens: Overlap between chunks in tokens (default from env)
+
+    Returns:
+        List of conversation chunks
+    """
+    chunk_size_tokens = chunk_size_tokens or CHUNK_TOKEN_SIZE
+    overlap_tokens = overlap_tokens or CHUNK_OVERLAP_TOKENS
+
+    chunk_size_chars = _tokens_to_chars(chunk_size_tokens)
+    overlap_chars = _tokens_to_chars(overlap_tokens)
+
+    if len(content) <= chunk_size_chars:
+        return [content]
+
+    # Try to detect message format
+    # Check if it's JSON (array of message objects)
+    try:
+        data = json.loads(content)
+        if isinstance(data, list):
+            return _chunk_message_array(data, chunk_size_chars, overlap_chars)
+    except json.JSONDecodeError:
+        pass
+
+    # Try speaker pattern (e.g., "Alice: Hello")
+    speaker_pattern = r'^([A-Za-z_][A-Za-z0-9_\s]*):(.+?)(?=^[A-Za-z_][A-Za-z0-9_\s]*:|$)'
+    if re.search(speaker_pattern, content, re.MULTILINE | re.DOTALL):
+        return _chunk_speaker_messages(content, chunk_size_chars, overlap_chars)
+
+    # Fallback to line-based chunking
+    return _chunk_by_lines(content, chunk_size_chars, overlap_chars)
+
+
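A sketch of the conversation path using the "Speaker: message" format described in the docstring (speakers and sizes are invented for illustration):

    from graphiti_core.utils.content_chunking import chunk_message_content

    transcript = '\n'.join(
        f'{"Alice" if i % 2 == 0 else "Bob"}: message number {i} in a long support chat.'
        for i in range(400)
    )
    chunks = chunk_message_content(transcript, chunk_size_tokens=500, overlap_tokens=50)

    # No chunk starts mid-message; every chunk opens with a speaker label.
    assert all(chunk.splitlines()[0].startswith(('Alice:', 'Bob:')) for chunk in chunks)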
+def _chunk_message_array(
+    messages: list,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Chunk a JSON array of message objects."""
+    # Delegate to JSON array chunking
+    chunks = _chunk_json_array(messages, chunk_size_chars, overlap_chars)
+    return chunks
+
+
+def _chunk_speaker_messages(
+    content: str,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Chunk messages in 'Speaker: message' format."""
+    # Split on speaker patterns
+    pattern = r'(?=^[A-Za-z_][A-Za-z0-9_\s]*:)'
+    messages = re.split(pattern, content, flags=re.MULTILINE)
+    messages = [m.strip() for m in messages if m.strip()]
+
+    if not messages:
+        return [content]
+
+    chunks: list[str] = []
+    current_messages: list[str] = []
+    current_size = 0
+
+    for message in messages:
+        msg_size = len(message)
+
+        # If a single message is too large, include it as its own chunk
+        if msg_size > chunk_size_chars:
+            if current_messages:
+                chunks.append('\n'.join(current_messages))
+                current_messages = []
+                current_size = 0
+            chunks.append(message)
+            continue
+
+        if current_messages and current_size + msg_size + 1 > chunk_size_chars:
+            chunks.append('\n'.join(current_messages))
+
+            # Get overlap (last message(s) that fit)
+            overlap_messages = _get_overlap_messages(current_messages, overlap_chars)
+            current_messages = overlap_messages
+            current_size = sum(len(m) for m in current_messages) + len(current_messages) - 1
+
+        current_messages.append(message)
+        current_size += msg_size + 1
+
+    if current_messages:
+        chunks.append('\n'.join(current_messages))
+
+    return chunks if chunks else [content]
+
+
+def _get_overlap_messages(messages: list[str], overlap_chars: int) -> list[str]:
+    """Get messages from the end that fit within overlap_chars."""
+    if not messages:
+        return []
+
+    overlap: list[str] = []
+    current_size = 0
+
+    for msg in reversed(messages):
+        msg_size = len(msg) + 1
+        if current_size + msg_size > overlap_chars:
+            break
+        overlap.insert(0, msg)
+        current_size += msg_size
+
+    return overlap
+
+
+def _chunk_by_lines(
+    content: str,
+    chunk_size_chars: int,
+    overlap_chars: int,
+) -> list[str]:
+    """Chunk content by line boundaries."""
+    lines = content.split('\n')
+
+    chunks: list[str] = []
+    current_lines: list[str] = []
+    current_size = 0
+
+    for line in lines:
+        line_size = len(line) + 1
+
+        if current_lines and current_size + line_size > chunk_size_chars:
+            chunks.append('\n'.join(current_lines))
+
+            # Get overlap lines
+            overlap_text = '\n'.join(current_lines)
+            overlap = _get_overlap_text(overlap_text, overlap_chars)
+            if overlap:
+                current_lines = overlap.split('\n')
+                current_size = len(overlap)
+            else:
+                current_lines = []
+                current_size = 0
+
+        current_lines.append(line)
+        current_size += line_size
+
+    if current_lines:
+        chunks.append('\n'.join(current_lines))
+
+    return chunks if chunks else [content]
graphiti_core/utils/datetime_utils.py
@@ -40,3 +40,16 @@ def ensure_utc(dt: datetime | None) -> datetime | None:
         return dt.astimezone(timezone.utc)
 
     return dt
+
+
+def convert_datetimes_to_strings(obj):
+    if isinstance(obj, dict):
+        return {k: convert_datetimes_to_strings(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_datetimes_to_strings(item) for item in obj]
+    elif isinstance(obj, tuple):
+        return tuple(convert_datetimes_to_strings(item) for item in obj)
+    elif isinstance(obj, datetime):
+        return obj.isoformat()
+    else:
+        return obj
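The new convert_datetimes_to_strings helper recursively replaces datetime values with ISO-8601 strings while leaving everything else untouched; a minimal sketch of that behavior (editorial example, values invented):

    from datetime import datetime, timezone

    from graphiti_core.utils.datetime_utils import convert_datetimes_to_strings

    payload = {
        'created_at': datetime(2024, 1, 1, tzinfo=timezone.utc),
        'tags': ['a', 'b'],
        'nested': {'seen_at': datetime(2024, 6, 1, 12, 30, tzinfo=timezone.utc)},
    }
    serializable = convert_datetimes_to_strings(payload)

    assert serializable['created_at'] == '2024-01-01T00:00:00+00:00'
    assert serializable['nested']['seen_at'] == '2024-06-01T12:30:00+00:00'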