kailash 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. kailash/__init__.py +33 -1
  2. kailash/access_control/__init__.py +129 -0
  3. kailash/access_control/managers.py +461 -0
  4. kailash/access_control/rule_evaluators.py +467 -0
  5. kailash/access_control_abac.py +825 -0
  6. kailash/config/__init__.py +27 -0
  7. kailash/config/database_config.py +359 -0
  8. kailash/database/__init__.py +28 -0
  9. kailash/database/execution_pipeline.py +499 -0
  10. kailash/middleware/__init__.py +306 -0
  11. kailash/middleware/auth/__init__.py +33 -0
  12. kailash/middleware/auth/access_control.py +436 -0
  13. kailash/middleware/auth/auth_manager.py +422 -0
  14. kailash/middleware/auth/jwt_auth.py +477 -0
  15. kailash/middleware/auth/kailash_jwt_auth.py +616 -0
  16. kailash/middleware/communication/__init__.py +37 -0
  17. kailash/middleware/communication/ai_chat.py +989 -0
  18. kailash/middleware/communication/api_gateway.py +802 -0
  19. kailash/middleware/communication/events.py +470 -0
  20. kailash/middleware/communication/realtime.py +710 -0
  21. kailash/middleware/core/__init__.py +21 -0
  22. kailash/middleware/core/agent_ui.py +890 -0
  23. kailash/middleware/core/schema.py +643 -0
  24. kailash/middleware/core/workflows.py +396 -0
  25. kailash/middleware/database/__init__.py +63 -0
  26. kailash/middleware/database/base.py +113 -0
  27. kailash/middleware/database/base_models.py +525 -0
  28. kailash/middleware/database/enums.py +106 -0
  29. kailash/middleware/database/migrations.py +12 -0
  30. kailash/{api/database.py → middleware/database/models.py} +183 -291
  31. kailash/middleware/database/repositories.py +685 -0
  32. kailash/middleware/database/session_manager.py +19 -0
  33. kailash/middleware/mcp/__init__.py +38 -0
  34. kailash/middleware/mcp/client_integration.py +585 -0
  35. kailash/middleware/mcp/enhanced_server.py +576 -0
  36. kailash/nodes/__init__.py +25 -3
  37. kailash/nodes/admin/__init__.py +35 -0
  38. kailash/nodes/admin/audit_log.py +794 -0
  39. kailash/nodes/admin/permission_check.py +864 -0
  40. kailash/nodes/admin/role_management.py +823 -0
  41. kailash/nodes/admin/security_event.py +1519 -0
  42. kailash/nodes/admin/user_management.py +944 -0
  43. kailash/nodes/ai/a2a.py +24 -7
  44. kailash/nodes/ai/ai_providers.py +1 -0
  45. kailash/nodes/ai/embedding_generator.py +11 -11
  46. kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
  47. kailash/nodes/ai/llm_agent.py +407 -2
  48. kailash/nodes/ai/self_organizing.py +85 -10
  49. kailash/nodes/api/auth.py +287 -6
  50. kailash/nodes/api/rest.py +151 -0
  51. kailash/nodes/auth/__init__.py +17 -0
  52. kailash/nodes/auth/directory_integration.py +1228 -0
  53. kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
  54. kailash/nodes/auth/mfa.py +2338 -0
  55. kailash/nodes/auth/risk_assessment.py +872 -0
  56. kailash/nodes/auth/session_management.py +1093 -0
  57. kailash/nodes/auth/sso.py +1040 -0
  58. kailash/nodes/base.py +344 -13
  59. kailash/nodes/base_cycle_aware.py +4 -2
  60. kailash/nodes/base_with_acl.py +1 -1
  61. kailash/nodes/code/python.py +283 -10
  62. kailash/nodes/compliance/__init__.py +9 -0
  63. kailash/nodes/compliance/data_retention.py +1888 -0
  64. kailash/nodes/compliance/gdpr.py +2004 -0
  65. kailash/nodes/data/__init__.py +22 -2
  66. kailash/nodes/data/async_connection.py +469 -0
  67. kailash/nodes/data/async_sql.py +757 -0
  68. kailash/nodes/data/async_vector.py +598 -0
  69. kailash/nodes/data/readers.py +767 -0
  70. kailash/nodes/data/retrieval.py +360 -1
  71. kailash/nodes/data/sharepoint_graph.py +397 -21
  72. kailash/nodes/data/sql.py +94 -5
  73. kailash/nodes/data/streaming.py +68 -8
  74. kailash/nodes/data/vector_db.py +54 -4
  75. kailash/nodes/enterprise/__init__.py +13 -0
  76. kailash/nodes/enterprise/batch_processor.py +741 -0
  77. kailash/nodes/enterprise/data_lineage.py +497 -0
  78. kailash/nodes/logic/convergence.py +31 -9
  79. kailash/nodes/logic/operations.py +14 -3
  80. kailash/nodes/mixins/__init__.py +8 -0
  81. kailash/nodes/mixins/event_emitter.py +201 -0
  82. kailash/nodes/mixins/mcp.py +9 -4
  83. kailash/nodes/mixins/security.py +165 -0
  84. kailash/nodes/monitoring/__init__.py +7 -0
  85. kailash/nodes/monitoring/performance_benchmark.py +2497 -0
  86. kailash/nodes/rag/__init__.py +284 -0
  87. kailash/nodes/rag/advanced.py +1615 -0
  88. kailash/nodes/rag/agentic.py +773 -0
  89. kailash/nodes/rag/conversational.py +999 -0
  90. kailash/nodes/rag/evaluation.py +875 -0
  91. kailash/nodes/rag/federated.py +1188 -0
  92. kailash/nodes/rag/graph.py +721 -0
  93. kailash/nodes/rag/multimodal.py +671 -0
  94. kailash/nodes/rag/optimized.py +933 -0
  95. kailash/nodes/rag/privacy.py +1059 -0
  96. kailash/nodes/rag/query_processing.py +1335 -0
  97. kailash/nodes/rag/realtime.py +764 -0
  98. kailash/nodes/rag/registry.py +547 -0
  99. kailash/nodes/rag/router.py +837 -0
  100. kailash/nodes/rag/similarity.py +1854 -0
  101. kailash/nodes/rag/strategies.py +566 -0
  102. kailash/nodes/rag/workflows.py +575 -0
  103. kailash/nodes/security/__init__.py +19 -0
  104. kailash/nodes/security/abac_evaluator.py +1411 -0
  105. kailash/nodes/security/audit_log.py +91 -0
  106. kailash/nodes/security/behavior_analysis.py +1893 -0
  107. kailash/nodes/security/credential_manager.py +401 -0
  108. kailash/nodes/security/rotating_credentials.py +760 -0
  109. kailash/nodes/security/security_event.py +132 -0
  110. kailash/nodes/security/threat_detection.py +1103 -0
  111. kailash/nodes/testing/__init__.py +9 -0
  112. kailash/nodes/testing/credential_testing.py +499 -0
  113. kailash/nodes/transform/__init__.py +10 -2
  114. kailash/nodes/transform/chunkers.py +592 -1
  115. kailash/nodes/transform/processors.py +484 -14
  116. kailash/nodes/validation.py +321 -0
  117. kailash/runtime/access_controlled.py +1 -1
  118. kailash/runtime/async_local.py +41 -7
  119. kailash/runtime/docker.py +1 -1
  120. kailash/runtime/local.py +474 -55
  121. kailash/runtime/parallel.py +1 -1
  122. kailash/runtime/parallel_cyclic.py +1 -1
  123. kailash/runtime/testing.py +210 -2
  124. kailash/utils/migrations/__init__.py +25 -0
  125. kailash/utils/migrations/generator.py +433 -0
  126. kailash/utils/migrations/models.py +231 -0
  127. kailash/utils/migrations/runner.py +489 -0
  128. kailash/utils/secure_logging.py +342 -0
  129. kailash/workflow/__init__.py +16 -0
  130. kailash/workflow/cyclic_runner.py +3 -4
  131. kailash/workflow/graph.py +70 -2
  132. kailash/workflow/resilience.py +249 -0
  133. kailash/workflow/templates.py +726 -0
  134. {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/METADATA +253 -20
  135. kailash-0.4.0.dist-info/RECORD +223 -0
  136. kailash/api/__init__.py +0 -17
  137. kailash/api/__main__.py +0 -6
  138. kailash/api/studio_secure.py +0 -893
  139. kailash/mcp/__main__.py +0 -13
  140. kailash/mcp/server_new.py +0 -336
  141. kailash/mcp/servers/__init__.py +0 -12
  142. kailash-0.3.2.dist-info/RECORD +0 -136
  143. {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/WHEEL +0 -0
  144. {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/entry_points.txt +0 -0
  145. {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/licenses/LICENSE +0 -0
  146. {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,9 @@
 """Document chunking nodes for splitting text into manageable pieces."""
 
-from typing import Any
+import re
+from typing import Any, Optional
+
+import numpy as np
 
 from kailash.nodes.base import Node, NodeParameter, register_node
 
@@ -76,3 +79,591 @@ class HierarchicalChunkerNode(Node):
             all_chunks.append(chunk_data)
 
         return {"chunks": all_chunks}
+
+
+@register_node()
+class SemanticChunkerNode(Node):
+    """
+    Semantic chunking that splits text based on semantic similarity
+    to create meaningful, coherent chunks.
+
+    This node uses embeddings to find natural semantic boundaries in text,
+    creating chunks that maintain topical coherence. It's superior to
+    simple character/token-based splitting for maintaining context.
+    """
+
+    def __init__(self, name: str = "semantic_chunker", **kwargs):
+        # Set attributes before calling super().__init__() as Kailash validates during init
+        self.chunk_size = kwargs.get("chunk_size", 2000)
+        self.chunk_overlap = kwargs.get("chunk_overlap", 200)
+        self.similarity_threshold = kwargs.get("similarity_threshold", 0.75)
+        self.window_size = kwargs.get("window_size", 3)  # Sentences to consider
+        self.min_chunk_size = kwargs.get("min_chunk_size", 100)
+        self.preserve_sentences = kwargs.get("preserve_sentences", True)
+
+        super().__init__(name=name)
+
+    def get_parameters(self) -> dict[str, NodeParameter]:
+        return {
+            "text": NodeParameter(
+                name="text",
+                type=str,
+                required=True,
+                description="Text to chunk semantically",
+            ),
+            "embeddings": NodeParameter(
+                name="embeddings",
+                type=list,
+                required=False,
+                description="Pre-computed sentence embeddings (optional)",
+            ),
+            "chunk_size": NodeParameter(
+                name="chunk_size",
+                type=int,
+                required=False,
+                default=self.chunk_size,
+                description="Target size for each chunk in characters",
+            ),
+            "chunk_overlap": NodeParameter(
+                name="chunk_overlap",
+                type=int,
+                required=False,
+                default=self.chunk_overlap,
+                description="Number of characters to overlap between chunks",
+            ),
+            "similarity_threshold": NodeParameter(
+                name="similarity_threshold",
+                type=float,
+                required=False,
+                default=self.similarity_threshold,
+                description="Similarity threshold for semantic boundaries (0.0-1.0)",
+            ),
+            "window_size": NodeParameter(
+                name="window_size",
+                type=int,
+                required=False,
+                default=self.window_size,
+                description="Number of sentences to consider for similarity",
+            ),
+            "metadata": NodeParameter(
+                name="metadata",
+                type=dict,
+                required=False,
+                default={},
+                description="Additional metadata to include with chunks",
+            ),
+        }
+
+    def run(self, **kwargs) -> dict[str, Any]:
+        text = kwargs.get("text", "")
+        embeddings = kwargs.get("embeddings")
+        chunk_size = kwargs.get("chunk_size", self.chunk_size)
+        chunk_overlap = kwargs.get("chunk_overlap", self.chunk_overlap)
+        similarity_threshold = kwargs.get(
+            "similarity_threshold", self.similarity_threshold
+        )
+        window_size = kwargs.get("window_size", self.window_size)
+        metadata = kwargs.get("metadata", {})
+
+        if not text.strip():
+            return {"chunks": []}
+
+        # Split into sentences
+        sentences = self._split_into_sentences(text)
+
+        if len(sentences) <= 1:
+            return {"chunks": [self._create_single_chunk(text, 0, metadata)]}
+
+        # Find semantic boundaries
+        if embeddings and len(embeddings) == len(sentences):
+            # Use provided embeddings
+            boundaries = self._find_semantic_boundaries(
+                sentences, embeddings, similarity_threshold, window_size
+            )
+        else:
+            # Fall back to statistical boundaries based on sentence length variance
+            boundaries = self._find_statistical_boundaries(sentences, chunk_size)
+
+        # Create chunks from boundaries
+        chunks = self._create_chunks_from_boundaries(
+            text, sentences, boundaries, chunk_overlap, chunk_size, metadata
+        )
+
+        return {"chunks": chunks}
+
+    def _split_into_sentences(self, text: str) -> list[str]:
+        """Split text into sentences using regex."""
+        # Improved sentence splitting pattern
+        sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])"
+        sentences = re.split(sentence_pattern, text.strip())
+
+        # Further split long sentences
+        final_sentences = []
+        for sentence in sentences:
+            if len(sentence) > 500:  # Long sentence threshold
+                # Try to split on semicolons or commas
+                sub_sentences = re.split(r"[;,]\s+", sentence)
+                final_sentences.extend(sub_sentences)
+            else:
+                final_sentences.append(sentence)
+
+        return [s.strip() for s in final_sentences if s.strip()]
+
+    def _find_semantic_boundaries(
+        self,
+        sentences: list[str],
+        embeddings: list[list[float]],
+        similarity_threshold: float,
+        window_size: int,
+    ) -> list[int]:
+        """Find semantic boundaries using embedding similarity."""
+        boundaries = [0]  # Always start with first sentence
+
+        for i in range(1, len(sentences) - 1):
+            # Calculate similarity in sliding window
+            window_similarities = []
+
+            for j in range(
+                max(0, i - window_size), min(len(sentences), i + window_size + 1)
+            ):
+                if j != i:
+                    similarity = self._cosine_similarity(embeddings[i], embeddings[j])
+                    window_similarities.append(similarity)
+
+            # Check if this is a good boundary point
+            avg_similarity = np.mean(window_similarities) if window_similarities else 0
+
+            if avg_similarity < similarity_threshold:
+                boundaries.append(i)
+
+        boundaries.append(len(sentences))  # Always end with last sentence
+        return boundaries
+
+    def _find_statistical_boundaries(
+        self, sentences: list[str], target_chunk_size: int
+    ) -> list[int]:
+        """Find boundaries based on statistical properties when embeddings unavailable."""
+        boundaries = [0]
+        current_size = 0
+
+        for i, sentence in enumerate(sentences):
+            current_size += len(sentence)
+
+            # Check if we should create a boundary
+            if current_size >= target_chunk_size and i < len(sentences) - 1:
+                # Look for natural break points
+                if any(
+                    sentence.endswith(end) for end in [".", "!", "?", '."', '!"', '?"']
+                ):
+                    boundaries.append(i + 1)
+                    current_size = 0
+
+        boundaries.append(len(sentences))
+        return sorted(list(set(boundaries)))  # Remove duplicates and sort
+
+    def _cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float:
+        """Calculate cosine similarity between two vectors."""
+        vec1_np = np.array(vec1)
+        vec2_np = np.array(vec2)
+
+        norm1 = np.linalg.norm(vec1_np)
+        norm2 = np.linalg.norm(vec2_np)
+
+        if norm1 == 0 or norm2 == 0:
+            return 0.0
+
+        return np.dot(vec1_np, vec2_np) / (norm1 * norm2)
+
+    def _create_chunks_from_boundaries(
+        self,
+        text: str,
+        sentences: list[str],
+        boundaries: list[int],
+        overlap: int,
+        max_chunk_size: int,
+        metadata: dict,
+    ) -> list[dict[str, Any]]:
+        """Create chunks from boundary indices."""
+        chunks = []
+
+        for i in range(len(boundaries) - 1):
+            start_idx = boundaries[i]
+            end_idx = boundaries[i + 1]
+
+            # Get sentences for this chunk
+            chunk_sentences = sentences[start_idx:end_idx]
+            chunk_text = " ".join(chunk_sentences)
+
+            # Add overlap from previous chunk if not first chunk
+            if i > 0 and overlap > 0:
+                # Get last part of previous chunk
+                prev_chunk_text = chunks[-1]["content"]
+                overlap_text = (
+                    prev_chunk_text[-overlap:]
+                    if len(prev_chunk_text) > overlap
+                    else prev_chunk_text
+                )
+
+                # Find clean break point for overlap
+                last_period = overlap_text.rfind(". ")
+                if last_period > 0:
+                    overlap_text = overlap_text[last_period + 2 :]
+
+                chunk_text = overlap_text + " " + chunk_text
+
+            # Ensure chunk doesn't exceed max size
+            if len(chunk_text) > max_chunk_size:
+                # Split further if needed
+                sub_chunks = self._split_large_chunk(chunk_text, max_chunk_size)
+                for j, sub_chunk in enumerate(sub_chunks):
+                    chunk_data = self._create_chunk_data(
+                        sub_chunk, len(chunks) + j, start_idx, end_idx, metadata
+                    )
+                    chunks.append(chunk_data)
+            else:
+                chunk_data = self._create_chunk_data(
+                    chunk_text, len(chunks), start_idx, end_idx, metadata
+                )
+                chunks.append(chunk_data)
+
+        return chunks
+
+    def _split_large_chunk(self, text: str, max_size: int) -> list[str]:
+        """Split a large chunk into smaller pieces."""
+        chunks = []
+        words = text.split()
+        current_chunk = []
+        current_size = 0
+
+        for word in words:
+            word_size = len(word) + 1  # +1 for space
+
+            if current_size + word_size > max_size and current_chunk:
+                chunks.append(" ".join(current_chunk))
+                current_chunk = [word]
+                current_size = word_size
+            else:
+                current_chunk.append(word)
+                current_size += word_size
+
+        if current_chunk:
+            chunks.append(" ".join(current_chunk))
+
+        return chunks
+
+    def _create_single_chunk(
+        self, text: str, index: int, metadata: dict
+    ) -> dict[str, Any]:
+        """Create a single chunk when text is too small to split."""
+        return {
+            "chunk_id": f"chunk_{index}",
+            "chunk_index": index,
+            "content": text.strip(),
+            "start_sentence": 0,
+            "end_sentence": 0,
+            "chunk_length": len(text),
+            "word_count": len(text.split()),
+            "chunking_method": "semantic",
+            **metadata,
+        }
+
+    def _create_chunk_data(
+        self,
+        chunk_text: str,
+        chunk_index: int,
+        start_sentence: int,
+        end_sentence: int,
+        metadata: dict,
+    ) -> dict[str, Any]:
+        """Create metadata for a chunk."""
+        return {
+            "chunk_id": f"chunk_{chunk_index}",
+            "chunk_index": chunk_index,
+            "content": chunk_text.strip(),
+            "start_sentence": start_sentence,
+            "end_sentence": end_sentence,
+            "chunk_length": len(chunk_text),
+            "word_count": len(chunk_text.split()),
+            "chunking_method": "semantic",
+            **metadata,
+        }
+
+
+@register_node()
+class StatisticalChunkerNode(Node):
+    """
+    Statistical chunking that splits text based on sentence embeddings variance
+    to identify natural topic boundaries.
+
+    This method analyzes the statistical properties of sentence embeddings
+    to find points where the content significantly shifts, making it ideal
+    for technical documents and structured content.
+    """
+
+    def __init__(self, name: str = "statistical_chunker", **kwargs):
+        # Set attributes before calling super().__init__() as Kailash validates during init
+        self.chunk_size = kwargs.get("chunk_size", 2000)
+        self.variance_threshold = kwargs.get("variance_threshold", 0.5)
+        self.min_sentences_per_chunk = kwargs.get("min_sentences_per_chunk", 3)
+        self.max_sentences_per_chunk = kwargs.get("max_sentences_per_chunk", 50)
+        self.use_sliding_window = kwargs.get("use_sliding_window", True)
+        self.window_size = kwargs.get("window_size", 5)
+
+        super().__init__(name=name)
+
+    def get_parameters(self) -> dict[str, NodeParameter]:
+        return {
+            "text": NodeParameter(
+                name="text",
+                type=str,
+                required=True,
+                description="Text to chunk using statistical analysis",
+            ),
+            "embeddings": NodeParameter(
+                name="embeddings",
+                type=list,
+                required=False,
+                description="Pre-computed sentence embeddings (optional)",
+            ),
+            "chunk_size": NodeParameter(
+                name="chunk_size",
+                type=int,
+                required=False,
+                default=self.chunk_size,
+                description="Target size for each chunk in characters",
+            ),
+            "variance_threshold": NodeParameter(
+                name="variance_threshold",
+                type=float,
+                required=False,
+                default=self.variance_threshold,
+                description="Variance threshold for detecting boundaries",
+            ),
+            "min_sentences_per_chunk": NodeParameter(
+                name="min_sentences_per_chunk",
+                type=int,
+                required=False,
+                default=self.min_sentences_per_chunk,
+                description="Minimum sentences per chunk",
+            ),
+            "max_sentences_per_chunk": NodeParameter(
+                name="max_sentences_per_chunk",
+                type=int,
+                required=False,
+                default=self.max_sentences_per_chunk,
+                description="Maximum sentences per chunk",
+            ),
+            "metadata": NodeParameter(
+                name="metadata",
+                type=dict,
+                required=False,
+                default={},
+                description="Additional metadata to include with chunks",
+            ),
+        }
+
+    def run(self, **kwargs) -> dict[str, Any]:
+        text = kwargs.get("text", "")
+        embeddings = kwargs.get("embeddings")
+        chunk_size = kwargs.get("chunk_size", self.chunk_size)
+        variance_threshold = kwargs.get("variance_threshold", self.variance_threshold)
+        min_sentences = kwargs.get(
+            "min_sentences_per_chunk", self.min_sentences_per_chunk
+        )
+        max_sentences = kwargs.get(
+            "max_sentences_per_chunk", self.max_sentences_per_chunk
+        )
+        metadata = kwargs.get("metadata", {})
+
+        if not text.strip():
+            return {"chunks": []}
+
+        # Split into sentences
+        sentences = self._split_into_sentences(text)
+
+        if len(sentences) <= min_sentences:
+            return {"chunks": [self._create_single_chunk(text, 0, metadata)]}
+
+        # Find statistical boundaries
+        if embeddings and len(embeddings) == len(sentences):
+            # Use provided embeddings
+            boundaries = self._find_statistical_boundaries(
+                sentences, embeddings, variance_threshold, min_sentences, max_sentences
+            )
+        else:
+            # Fall back to length-based boundaries
+            boundaries = self._find_length_based_boundaries(
+                sentences, chunk_size, min_sentences, max_sentences
+            )
+
+        # Create chunks from boundaries
+        chunks = self._create_chunks_from_boundaries(
+            text, sentences, boundaries, metadata
+        )
+
+        return {"chunks": chunks}
+
+    def _split_into_sentences(self, text: str) -> list[str]:
+        """Split text into sentences."""
+        # Use same sentence splitting as SemanticChunkerNode
+        sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])"
+        sentences = re.split(sentence_pattern, text.strip())
+        return [s.strip() for s in sentences if s.strip()]
+
+    def _find_statistical_boundaries(
+        self,
+        sentences: list[str],
+        embeddings: list[list[float]],
+        variance_threshold: float,
+        min_sentences: int,
+        max_sentences: int,
+    ) -> list[int]:
+        """Find boundaries based on embedding variance analysis."""
+        boundaries = [0]
+
+        if self.use_sliding_window:
+            # Calculate variance in sliding windows
+            variances = []
+            for i in range(len(embeddings) - self.window_size + 1):
+                window_embeddings = embeddings[i : i + self.window_size]
+                variance = self._calculate_embedding_variance(window_embeddings)
+                variances.append(variance)
+
+            # Find peaks in variance (indicating topic shifts)
+            current_chunk_start = 0
+            for i, variance in enumerate(variances):
+                sentences_in_chunk = i - current_chunk_start
+
+                # Check if we should create boundary
+                if (
+                    variance > variance_threshold
+                    and sentences_in_chunk >= min_sentences
+                ) or sentences_in_chunk >= max_sentences:
+                    boundaries.append(i + self.window_size // 2)
+                    current_chunk_start = i + self.window_size // 2
+        else:
+            # Simple variance-based splitting
+            current_chunk_start = 0
+            for i in range(min_sentences, len(sentences), min_sentences):
+                if i - current_chunk_start >= max_sentences:
+                    boundaries.append(i)
+                    current_chunk_start = i
+                elif i < len(sentences) - min_sentences:
+                    # Check variance between chunks
+                    chunk1_embeddings = embeddings[current_chunk_start:i]
+                    chunk2_embeddings = embeddings[
+                        i : min(i + min_sentences, len(embeddings))
+                    ]
+
+                    inter_variance = self._calculate_inter_chunk_variance(
+                        chunk1_embeddings, chunk2_embeddings
+                    )
+
+                    if inter_variance > variance_threshold:
+                        boundaries.append(i)
+                        current_chunk_start = i
+
+        boundaries.append(len(sentences))
+        return sorted(list(set(boundaries)))
+
+    def _calculate_embedding_variance(self, embeddings: list[list[float]]) -> float:
+        """Calculate variance of embeddings."""
+        if not embeddings:
+            return 0.0
+
+        embeddings_array = np.array(embeddings)
+        mean_embedding = np.mean(embeddings_array, axis=0)
+
+        # Calculate distances from mean
+        distances = [np.linalg.norm(emb - mean_embedding) for emb in embeddings_array]
+
+        return np.var(distances)
+
+    def _calculate_inter_chunk_variance(
+        self, chunk1_embeddings: list[list[float]], chunk2_embeddings: list[list[float]]
+    ) -> float:
+        """Calculate variance between two chunks."""
+        if not chunk1_embeddings or not chunk2_embeddings:
+            return 0.0
+
+        # Calculate centroids
+        centroid1 = np.mean(chunk1_embeddings, axis=0)
+        centroid2 = np.mean(chunk2_embeddings, axis=0)
+
+        # Return distance between centroids
+        return np.linalg.norm(centroid1 - centroid2)
+
+    def _find_length_based_boundaries(
+        self,
+        sentences: list[str],
+        target_chunk_size: int,
+        min_sentences: int,
+        max_sentences: int,
+    ) -> list[int]:
+        """Find boundaries based on length when embeddings unavailable."""
+        boundaries = [0]
+        current_size = 0
+        current_sentences = 0
+
+        for i, sentence in enumerate(sentences):
+            current_size += len(sentence)
+            current_sentences += 1
+
+            # Check if we should create boundary
+            if (
+                current_size >= target_chunk_size and current_sentences >= min_sentences
+            ) or current_sentences >= max_sentences:
+                if i < len(sentences) - 1:  # Don't create boundary at last sentence
+                    boundaries.append(i + 1)
+                    current_size = 0
+                    current_sentences = 0
+
+        boundaries.append(len(sentences))
+        return sorted(list(set(boundaries)))
+
+    def _create_chunks_from_boundaries(
+        self, text: str, sentences: list[str], boundaries: list[int], metadata: dict
+    ) -> list[dict[str, Any]]:
+        """Create chunks from boundary indices."""
+        chunks = []
+
+        for i in range(len(boundaries) - 1):
+            start_idx = boundaries[i]
+            end_idx = boundaries[i + 1]
+
+            # Get sentences for this chunk
+            chunk_sentences = sentences[start_idx:end_idx]
+            chunk_text = " ".join(chunk_sentences)
+
+            chunk_data = {
+                "chunk_id": f"chunk_{i}",
+                "chunk_index": i,
+                "content": chunk_text.strip(),
+                "start_sentence": start_idx,
+                "end_sentence": end_idx,
+                "sentence_count": len(chunk_sentences),
+                "chunk_length": len(chunk_text),
+                "word_count": len(chunk_text.split()),
+                "chunking_method": "statistical",
+                **metadata,
+            }
+            chunks.append(chunk_data)
+
+        return chunks
+
+    def _create_single_chunk(
+        self, text: str, index: int, metadata: dict
+    ) -> dict[str, Any]:
+        """Create a single chunk when text is too small to split."""
+        return {
+            "chunk_id": f"chunk_{index}",
+            "chunk_index": index,
+            "content": text.strip(),
+            "start_sentence": 0,
+            "end_sentence": 0,
+            "sentence_count": 1,
+            "chunk_length": len(text),
+            "word_count": len(text.split()),
+            "chunking_method": "statistical",
+            **metadata,
+        }
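
Usage note: the sketch below is a minimal, hypothetical example of exercising the two new chunker nodes, based only on the signatures visible in this diff. It calls run() on the nodes directly rather than executing them through a Kailash runtime or workflow, and the sample text, thresholds, and metadata are illustrative values, not defaults shipped with the package.

from kailash.nodes.transform.chunkers import SemanticChunkerNode, StatisticalChunkerNode

# Hypothetical standalone usage; in practice these nodes would normally be
# wired into a Kailash workflow and executed by a runtime.
sample_text = (
    "Kailash workflows are built from nodes, and each node declares its parameters. "
    "Semantic chunking groups sentences that stay on the same topic. "
    "Statistical chunking instead watches for variance shifts in sentence embeddings. "
    "Both nodes return a dictionary with a 'chunks' list."
)

# Without pre-computed embeddings, SemanticChunkerNode falls back to length-based
# boundaries; with embeddings (one vector per sentence, aligned with the node's
# own sentence split), it uses cosine-similarity boundary detection instead.
semantic = SemanticChunkerNode(chunk_size=200, chunk_overlap=20, similarity_threshold=0.75)
semantic_result = semantic.run(text=sample_text, metadata={"source": "example.txt"})
for chunk in semantic_result["chunks"]:
    print(chunk["chunk_id"], chunk["word_count"], chunk["content"][:40])

# StatisticalChunkerNode accepts the same text/embeddings inputs but keys its
# boundaries off embedding variance, or off sentence counts when no embeddings
# are supplied.
statistical = StatisticalChunkerNode(min_sentences_per_chunk=1, max_sentences_per_chunk=2)
statistical_result = statistical.run(text=sample_text)
print(len(statistical_result["chunks"]), "statistical chunks")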