kailash 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +33 -1
- kailash/access_control/__init__.py +129 -0
- kailash/access_control/managers.py +461 -0
- kailash/access_control/rule_evaluators.py +467 -0
- kailash/access_control_abac.py +825 -0
- kailash/config/__init__.py +27 -0
- kailash/config/database_config.py +359 -0
- kailash/database/__init__.py +28 -0
- kailash/database/execution_pipeline.py +499 -0
- kailash/middleware/__init__.py +306 -0
- kailash/middleware/auth/__init__.py +33 -0
- kailash/middleware/auth/access_control.py +436 -0
- kailash/middleware/auth/auth_manager.py +422 -0
- kailash/middleware/auth/jwt_auth.py +477 -0
- kailash/middleware/auth/kailash_jwt_auth.py +616 -0
- kailash/middleware/communication/__init__.py +37 -0
- kailash/middleware/communication/ai_chat.py +989 -0
- kailash/middleware/communication/api_gateway.py +802 -0
- kailash/middleware/communication/events.py +470 -0
- kailash/middleware/communication/realtime.py +710 -0
- kailash/middleware/core/__init__.py +21 -0
- kailash/middleware/core/agent_ui.py +890 -0
- kailash/middleware/core/schema.py +643 -0
- kailash/middleware/core/workflows.py +396 -0
- kailash/middleware/database/__init__.py +63 -0
- kailash/middleware/database/base.py +113 -0
- kailash/middleware/database/base_models.py +525 -0
- kailash/middleware/database/enums.py +106 -0
- kailash/middleware/database/migrations.py +12 -0
- kailash/{api/database.py → middleware/database/models.py} +183 -291
- kailash/middleware/database/repositories.py +685 -0
- kailash/middleware/database/session_manager.py +19 -0
- kailash/middleware/mcp/__init__.py +38 -0
- kailash/middleware/mcp/client_integration.py +585 -0
- kailash/middleware/mcp/enhanced_server.py +576 -0
- kailash/nodes/__init__.py +25 -3
- kailash/nodes/admin/__init__.py +35 -0
- kailash/nodes/admin/audit_log.py +794 -0
- kailash/nodes/admin/permission_check.py +864 -0
- kailash/nodes/admin/role_management.py +823 -0
- kailash/nodes/admin/security_event.py +1519 -0
- kailash/nodes/admin/user_management.py +944 -0
- kailash/nodes/ai/a2a.py +24 -7
- kailash/nodes/ai/ai_providers.py +1 -0
- kailash/nodes/ai/embedding_generator.py +11 -11
- kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
- kailash/nodes/ai/llm_agent.py +407 -2
- kailash/nodes/ai/self_organizing.py +85 -10
- kailash/nodes/api/auth.py +287 -6
- kailash/nodes/api/rest.py +151 -0
- kailash/nodes/auth/__init__.py +17 -0
- kailash/nodes/auth/directory_integration.py +1228 -0
- kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
- kailash/nodes/auth/mfa.py +2338 -0
- kailash/nodes/auth/risk_assessment.py +872 -0
- kailash/nodes/auth/session_management.py +1093 -0
- kailash/nodes/auth/sso.py +1040 -0
- kailash/nodes/base.py +344 -13
- kailash/nodes/base_cycle_aware.py +4 -2
- kailash/nodes/base_with_acl.py +1 -1
- kailash/nodes/code/python.py +293 -12
- kailash/nodes/compliance/__init__.py +9 -0
- kailash/nodes/compliance/data_retention.py +1888 -0
- kailash/nodes/compliance/gdpr.py +2004 -0
- kailash/nodes/data/__init__.py +22 -2
- kailash/nodes/data/async_connection.py +469 -0
- kailash/nodes/data/async_sql.py +757 -0
- kailash/nodes/data/async_vector.py +598 -0
- kailash/nodes/data/readers.py +767 -0
- kailash/nodes/data/retrieval.py +360 -1
- kailash/nodes/data/sharepoint_graph.py +397 -21
- kailash/nodes/data/sql.py +94 -5
- kailash/nodes/data/streaming.py +68 -8
- kailash/nodes/data/vector_db.py +54 -4
- kailash/nodes/enterprise/__init__.py +13 -0
- kailash/nodes/enterprise/batch_processor.py +741 -0
- kailash/nodes/enterprise/data_lineage.py +497 -0
- kailash/nodes/logic/convergence.py +31 -9
- kailash/nodes/logic/operations.py +14 -3
- kailash/nodes/mixins/__init__.py +8 -0
- kailash/nodes/mixins/event_emitter.py +201 -0
- kailash/nodes/mixins/mcp.py +9 -4
- kailash/nodes/mixins/security.py +165 -0
- kailash/nodes/monitoring/__init__.py +7 -0
- kailash/nodes/monitoring/performance_benchmark.py +2497 -0
- kailash/nodes/rag/__init__.py +284 -0
- kailash/nodes/rag/advanced.py +1615 -0
- kailash/nodes/rag/agentic.py +773 -0
- kailash/nodes/rag/conversational.py +999 -0
- kailash/nodes/rag/evaluation.py +875 -0
- kailash/nodes/rag/federated.py +1188 -0
- kailash/nodes/rag/graph.py +721 -0
- kailash/nodes/rag/multimodal.py +671 -0
- kailash/nodes/rag/optimized.py +933 -0
- kailash/nodes/rag/privacy.py +1059 -0
- kailash/nodes/rag/query_processing.py +1335 -0
- kailash/nodes/rag/realtime.py +764 -0
- kailash/nodes/rag/registry.py +547 -0
- kailash/nodes/rag/router.py +837 -0
- kailash/nodes/rag/similarity.py +1854 -0
- kailash/nodes/rag/strategies.py +566 -0
- kailash/nodes/rag/workflows.py +575 -0
- kailash/nodes/security/__init__.py +19 -0
- kailash/nodes/security/abac_evaluator.py +1411 -0
- kailash/nodes/security/audit_log.py +91 -0
- kailash/nodes/security/behavior_analysis.py +1893 -0
- kailash/nodes/security/credential_manager.py +401 -0
- kailash/nodes/security/rotating_credentials.py +760 -0
- kailash/nodes/security/security_event.py +132 -0
- kailash/nodes/security/threat_detection.py +1103 -0
- kailash/nodes/testing/__init__.py +9 -0
- kailash/nodes/testing/credential_testing.py +499 -0
- kailash/nodes/transform/__init__.py +10 -2
- kailash/nodes/transform/chunkers.py +592 -1
- kailash/nodes/transform/processors.py +484 -14
- kailash/nodes/validation.py +321 -0
- kailash/runtime/access_controlled.py +1 -1
- kailash/runtime/async_local.py +41 -7
- kailash/runtime/docker.py +1 -1
- kailash/runtime/local.py +474 -55
- kailash/runtime/parallel.py +1 -1
- kailash/runtime/parallel_cyclic.py +1 -1
- kailash/runtime/testing.py +210 -2
- kailash/utils/migrations/__init__.py +25 -0
- kailash/utils/migrations/generator.py +433 -0
- kailash/utils/migrations/models.py +231 -0
- kailash/utils/migrations/runner.py +489 -0
- kailash/utils/secure_logging.py +342 -0
- kailash/workflow/__init__.py +16 -0
- kailash/workflow/cyclic_runner.py +3 -4
- kailash/workflow/graph.py +70 -2
- kailash/workflow/resilience.py +249 -0
- kailash/workflow/templates.py +726 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/METADATA +253 -20
- kailash-0.4.0.dist-info/RECORD +223 -0
- kailash/api/__init__.py +0 -17
- kailash/api/__main__.py +0 -6
- kailash/api/studio_secure.py +0 -893
- kailash/mcp/__main__.py +0 -13
- kailash/mcp/server_new.py +0 -336
- kailash/mcp/servers/__init__.py +0 -12
- kailash-0.3.1.dist-info/RECORD +0 -136
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/WHEEL +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,9 @@
 """Document chunking nodes for splitting text into manageable pieces."""
 
-
+import re
+from typing import Any, Optional
+
+import numpy as np
 
 from kailash.nodes.base import Node, NodeParameter, register_node
 
@@ -76,3 +79,591 @@ class HierarchicalChunkerNode(Node):
                 all_chunks.append(chunk_data)
 
         return {"chunks": all_chunks}
+
+
+@register_node()
+class SemanticChunkerNode(Node):
+    """
+    Semantic chunking that splits text based on semantic similarity
+    to create meaningful, coherent chunks.
+
+    This node uses embeddings to find natural semantic boundaries in text,
+    creating chunks that maintain topical coherence. It's superior to
+    simple character/token-based splitting for maintaining context.
+    """
+
+    def __init__(self, name: str = "semantic_chunker", **kwargs):
+        # Set attributes before calling super().__init__() as Kailash validates during init
+        self.chunk_size = kwargs.get("chunk_size", 2000)
+        self.chunk_overlap = kwargs.get("chunk_overlap", 200)
+        self.similarity_threshold = kwargs.get("similarity_threshold", 0.75)
+        self.window_size = kwargs.get("window_size", 3)  # Sentences to consider
+        self.min_chunk_size = kwargs.get("min_chunk_size", 100)
+        self.preserve_sentences = kwargs.get("preserve_sentences", True)
+
+        super().__init__(name=name)
+
+    def get_parameters(self) -> dict[str, NodeParameter]:
+        return {
+            "text": NodeParameter(
+                name="text",
+                type=str,
+                required=True,
+                description="Text to chunk semantically",
+            ),
+            "embeddings": NodeParameter(
+                name="embeddings",
+                type=list,
+                required=False,
+                description="Pre-computed sentence embeddings (optional)",
+            ),
+            "chunk_size": NodeParameter(
+                name="chunk_size",
+                type=int,
+                required=False,
+                default=self.chunk_size,
+                description="Target size for each chunk in characters",
+            ),
+            "chunk_overlap": NodeParameter(
+                name="chunk_overlap",
+                type=int,
+                required=False,
+                default=self.chunk_overlap,
+                description="Number of characters to overlap between chunks",
+            ),
+            "similarity_threshold": NodeParameter(
+                name="similarity_threshold",
+                type=float,
+                required=False,
+                default=self.similarity_threshold,
+                description="Similarity threshold for semantic boundaries (0.0-1.0)",
+            ),
+            "window_size": NodeParameter(
+                name="window_size",
+                type=int,
+                required=False,
+                default=self.window_size,
+                description="Number of sentences to consider for similarity",
+            ),
+            "metadata": NodeParameter(
+                name="metadata",
+                type=dict,
+                required=False,
+                default={},
+                description="Additional metadata to include with chunks",
+            ),
+        }
+
+    def run(self, **kwargs) -> dict[str, Any]:
+        text = kwargs.get("text", "")
+        embeddings = kwargs.get("embeddings")
+        chunk_size = kwargs.get("chunk_size", self.chunk_size)
+        chunk_overlap = kwargs.get("chunk_overlap", self.chunk_overlap)
+        similarity_threshold = kwargs.get(
+            "similarity_threshold", self.similarity_threshold
+        )
+        window_size = kwargs.get("window_size", self.window_size)
+        metadata = kwargs.get("metadata", {})
+
+        if not text.strip():
+            return {"chunks": []}
+
+        # Split into sentences
+        sentences = self._split_into_sentences(text)
+
+        if len(sentences) <= 1:
+            return {"chunks": [self._create_single_chunk(text, 0, metadata)]}
+
+        # Find semantic boundaries
+        if embeddings and len(embeddings) == len(sentences):
+            # Use provided embeddings
+            boundaries = self._find_semantic_boundaries(
+                sentences, embeddings, similarity_threshold, window_size
+            )
+        else:
+            # Fall back to statistical boundaries based on sentence length variance
+            boundaries = self._find_statistical_boundaries(sentences, chunk_size)
+
+        # Create chunks from boundaries
+        chunks = self._create_chunks_from_boundaries(
+            text, sentences, boundaries, chunk_overlap, chunk_size, metadata
+        )
+
+        return {"chunks": chunks}
+
+    def _split_into_sentences(self, text: str) -> list[str]:
+        """Split text into sentences using regex."""
+        # Improved sentence splitting pattern
+        sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])"
+        sentences = re.split(sentence_pattern, text.strip())
+
+        # Further split long sentences
+        final_sentences = []
+        for sentence in sentences:
+            if len(sentence) > 500:  # Long sentence threshold
+                # Try to split on semicolons or commas
+                sub_sentences = re.split(r"[;,]\s+", sentence)
+                final_sentences.extend(sub_sentences)
+            else:
+                final_sentences.append(sentence)
+
+        return [s.strip() for s in final_sentences if s.strip()]
+
+    def _find_semantic_boundaries(
+        self,
+        sentences: list[str],
+        embeddings: list[list[float]],
+        similarity_threshold: float,
+        window_size: int,
+    ) -> list[int]:
+        """Find semantic boundaries using embedding similarity."""
+        boundaries = [0]  # Always start with first sentence
+
+        for i in range(1, len(sentences) - 1):
+            # Calculate similarity in sliding window
+            window_similarities = []
+
+            for j in range(
+                max(0, i - window_size), min(len(sentences), i + window_size + 1)
+            ):
+                if j != i:
+                    similarity = self._cosine_similarity(embeddings[i], embeddings[j])
+                    window_similarities.append(similarity)
+
+            # Check if this is a good boundary point
+            avg_similarity = np.mean(window_similarities) if window_similarities else 0
+
+            if avg_similarity < similarity_threshold:
+                boundaries.append(i)
+
+        boundaries.append(len(sentences))  # Always end with last sentence
+        return boundaries
+
+    def _find_statistical_boundaries(
+        self, sentences: list[str], target_chunk_size: int
+    ) -> list[int]:
+        """Find boundaries based on statistical properties when embeddings unavailable."""
+        boundaries = [0]
+        current_size = 0
+
+        for i, sentence in enumerate(sentences):
+            current_size += len(sentence)
+
+            # Check if we should create a boundary
+            if current_size >= target_chunk_size and i < len(sentences) - 1:
+                # Look for natural break points
+                if any(
+                    sentence.endswith(end) for end in [".", "!", "?", '."', '!"', '?"']
+                ):
+                    boundaries.append(i + 1)
+                    current_size = 0
+
+        boundaries.append(len(sentences))
+        return sorted(list(set(boundaries)))  # Remove duplicates and sort
+
+    def _cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float:
+        """Calculate cosine similarity between two vectors."""
+        vec1_np = np.array(vec1)
+        vec2_np = np.array(vec2)
+
+        norm1 = np.linalg.norm(vec1_np)
+        norm2 = np.linalg.norm(vec2_np)
+
+        if norm1 == 0 or norm2 == 0:
+            return 0.0
+
+        return np.dot(vec1_np, vec2_np) / (norm1 * norm2)
+
+    def _create_chunks_from_boundaries(
+        self,
+        text: str,
+        sentences: list[str],
+        boundaries: list[int],
+        overlap: int,
+        max_chunk_size: int,
+        metadata: dict,
+    ) -> list[dict[str, Any]]:
+        """Create chunks from boundary indices."""
+        chunks = []
+
+        for i in range(len(boundaries) - 1):
+            start_idx = boundaries[i]
+            end_idx = boundaries[i + 1]
+
+            # Get sentences for this chunk
+            chunk_sentences = sentences[start_idx:end_idx]
+            chunk_text = " ".join(chunk_sentences)
+
+            # Add overlap from previous chunk if not first chunk
+            if i > 0 and overlap > 0:
+                # Get last part of previous chunk
+                prev_chunk_text = chunks[-1]["content"]
+                overlap_text = (
+                    prev_chunk_text[-overlap:]
+                    if len(prev_chunk_text) > overlap
+                    else prev_chunk_text
+                )
+
+                # Find clean break point for overlap
+                last_period = overlap_text.rfind(". ")
+                if last_period > 0:
+                    overlap_text = overlap_text[last_period + 2 :]
+
+                chunk_text = overlap_text + " " + chunk_text
+
+            # Ensure chunk doesn't exceed max size
+            if len(chunk_text) > max_chunk_size:
+                # Split further if needed
+                sub_chunks = self._split_large_chunk(chunk_text, max_chunk_size)
+                for j, sub_chunk in enumerate(sub_chunks):
+                    chunk_data = self._create_chunk_data(
+                        sub_chunk, len(chunks) + j, start_idx, end_idx, metadata
+                    )
+                    chunks.append(chunk_data)
+            else:
+                chunk_data = self._create_chunk_data(
+                    chunk_text, len(chunks), start_idx, end_idx, metadata
+                )
+                chunks.append(chunk_data)
+
+        return chunks
+
+    def _split_large_chunk(self, text: str, max_size: int) -> list[str]:
+        """Split a large chunk into smaller pieces."""
+        chunks = []
+        words = text.split()
+        current_chunk = []
+        current_size = 0
+
+        for word in words:
+            word_size = len(word) + 1  # +1 for space
+
+            if current_size + word_size > max_size and current_chunk:
+                chunks.append(" ".join(current_chunk))
+                current_chunk = [word]
+                current_size = word_size
+            else:
+                current_chunk.append(word)
+                current_size += word_size
+
+        if current_chunk:
+            chunks.append(" ".join(current_chunk))
+
+        return chunks
+
+    def _create_single_chunk(
+        self, text: str, index: int, metadata: dict
+    ) -> dict[str, Any]:
+        """Create a single chunk when text is too small to split."""
+        return {
+            "chunk_id": f"chunk_{index}",
+            "chunk_index": index,
+            "content": text.strip(),
+            "start_sentence": 0,
+            "end_sentence": 0,
+            "chunk_length": len(text),
+            "word_count": len(text.split()),
+            "chunking_method": "semantic",
+            **metadata,
+        }
+
+    def _create_chunk_data(
+        self,
+        chunk_text: str,
+        chunk_index: int,
+        start_sentence: int,
+        end_sentence: int,
+        metadata: dict,
+    ) -> dict[str, Any]:
+        """Create metadata for a chunk."""
+        return {
+            "chunk_id": f"chunk_{chunk_index}",
+            "chunk_index": chunk_index,
+            "content": chunk_text.strip(),
+            "start_sentence": start_sentence,
+            "end_sentence": end_sentence,
+            "chunk_length": len(chunk_text),
+            "word_count": len(chunk_text.split()),
+            "chunking_method": "semantic",
+            **metadata,
+        }
+
+
+@register_node()
+class StatisticalChunkerNode(Node):
+    """
+    Statistical chunking that splits text based on sentence embeddings variance
+    to identify natural topic boundaries.
+
+    This method analyzes the statistical properties of sentence embeddings
+    to find points where the content significantly shifts, making it ideal
+    for technical documents and structured content.
+    """
+
+    def __init__(self, name: str = "statistical_chunker", **kwargs):
+        # Set attributes before calling super().__init__() as Kailash validates during init
+        self.chunk_size = kwargs.get("chunk_size", 2000)
+        self.variance_threshold = kwargs.get("variance_threshold", 0.5)
+        self.min_sentences_per_chunk = kwargs.get("min_sentences_per_chunk", 3)
+        self.max_sentences_per_chunk = kwargs.get("max_sentences_per_chunk", 50)
+        self.use_sliding_window = kwargs.get("use_sliding_window", True)
+        self.window_size = kwargs.get("window_size", 5)
+
+        super().__init__(name=name)
+
+    def get_parameters(self) -> dict[str, NodeParameter]:
+        return {
+            "text": NodeParameter(
+                name="text",
+                type=str,
+                required=True,
+                description="Text to chunk using statistical analysis",
+            ),
+            "embeddings": NodeParameter(
+                name="embeddings",
+                type=list,
+                required=False,
+                description="Pre-computed sentence embeddings (optional)",
+            ),
+            "chunk_size": NodeParameter(
+                name="chunk_size",
+                type=int,
+                required=False,
+                default=self.chunk_size,
+                description="Target size for each chunk in characters",
+            ),
+            "variance_threshold": NodeParameter(
+                name="variance_threshold",
+                type=float,
+                required=False,
+                default=self.variance_threshold,
+                description="Variance threshold for detecting boundaries",
+            ),
+            "min_sentences_per_chunk": NodeParameter(
+                name="min_sentences_per_chunk",
+                type=int,
+                required=False,
+                default=self.min_sentences_per_chunk,
+                description="Minimum sentences per chunk",
+            ),
+            "max_sentences_per_chunk": NodeParameter(
+                name="max_sentences_per_chunk",
+                type=int,
+                required=False,
+                default=self.max_sentences_per_chunk,
+                description="Maximum sentences per chunk",
+            ),
+            "metadata": NodeParameter(
+                name="metadata",
+                type=dict,
+                required=False,
+                default={},
+                description="Additional metadata to include with chunks",
+            ),
+        }
+
+    def run(self, **kwargs) -> dict[str, Any]:
+        text = kwargs.get("text", "")
+        embeddings = kwargs.get("embeddings")
+        chunk_size = kwargs.get("chunk_size", self.chunk_size)
+        variance_threshold = kwargs.get("variance_threshold", self.variance_threshold)
+        min_sentences = kwargs.get(
+            "min_sentences_per_chunk", self.min_sentences_per_chunk
+        )
+        max_sentences = kwargs.get(
+            "max_sentences_per_chunk", self.max_sentences_per_chunk
+        )
+        metadata = kwargs.get("metadata", {})
+
+        if not text.strip():
+            return {"chunks": []}
+
+        # Split into sentences
+        sentences = self._split_into_sentences(text)
+
+        if len(sentences) <= min_sentences:
+            return {"chunks": [self._create_single_chunk(text, 0, metadata)]}
+
+        # Find statistical boundaries
+        if embeddings and len(embeddings) == len(sentences):
+            # Use provided embeddings
+            boundaries = self._find_statistical_boundaries(
+                sentences, embeddings, variance_threshold, min_sentences, max_sentences
+            )
+        else:
+            # Fall back to length-based boundaries
+            boundaries = self._find_length_based_boundaries(
+                sentences, chunk_size, min_sentences, max_sentences
+            )
+
+        # Create chunks from boundaries
+        chunks = self._create_chunks_from_boundaries(
+            text, sentences, boundaries, metadata
+        )
+
+        return {"chunks": chunks}
+
+    def _split_into_sentences(self, text: str) -> list[str]:
+        """Split text into sentences."""
+        # Use same sentence splitting as SemanticChunkerNode
+        sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])"
+        sentences = re.split(sentence_pattern, text.strip())
+        return [s.strip() for s in sentences if s.strip()]
+
+    def _find_statistical_boundaries(
+        self,
+        sentences: list[str],
+        embeddings: list[list[float]],
+        variance_threshold: float,
+        min_sentences: int,
+        max_sentences: int,
+    ) -> list[int]:
+        """Find boundaries based on embedding variance analysis."""
+        boundaries = [0]
+
+        if self.use_sliding_window:
+            # Calculate variance in sliding windows
+            variances = []
+            for i in range(len(embeddings) - self.window_size + 1):
+                window_embeddings = embeddings[i : i + self.window_size]
+                variance = self._calculate_embedding_variance(window_embeddings)
+                variances.append(variance)
+
+            # Find peaks in variance (indicating topic shifts)
+            current_chunk_start = 0
+            for i, variance in enumerate(variances):
+                sentences_in_chunk = i - current_chunk_start
+
+                # Check if we should create boundary
+                if (
+                    variance > variance_threshold
+                    and sentences_in_chunk >= min_sentences
+                ) or sentences_in_chunk >= max_sentences:
+                    boundaries.append(i + self.window_size // 2)
+                    current_chunk_start = i + self.window_size // 2
+        else:
+            # Simple variance-based splitting
+            current_chunk_start = 0
+            for i in range(min_sentences, len(sentences), min_sentences):
+                if i - current_chunk_start >= max_sentences:
+                    boundaries.append(i)
+                    current_chunk_start = i
+                elif i < len(sentences) - min_sentences:
+                    # Check variance between chunks
+                    chunk1_embeddings = embeddings[current_chunk_start:i]
+                    chunk2_embeddings = embeddings[
+                        i : min(i + min_sentences, len(embeddings))
+                    ]
+
+                    inter_variance = self._calculate_inter_chunk_variance(
+                        chunk1_embeddings, chunk2_embeddings
+                    )
+
+                    if inter_variance > variance_threshold:
+                        boundaries.append(i)
+                        current_chunk_start = i
+
+        boundaries.append(len(sentences))
+        return sorted(list(set(boundaries)))
+
+    def _calculate_embedding_variance(self, embeddings: list[list[float]]) -> float:
+        """Calculate variance of embeddings."""
+        if not embeddings:
+            return 0.0
+
+        embeddings_array = np.array(embeddings)
+        mean_embedding = np.mean(embeddings_array, axis=0)
+
+        # Calculate distances from mean
+        distances = [np.linalg.norm(emb - mean_embedding) for emb in embeddings_array]
+
+        return np.var(distances)
+
+    def _calculate_inter_chunk_variance(
+        self, chunk1_embeddings: list[list[float]], chunk2_embeddings: list[list[float]]
+    ) -> float:
+        """Calculate variance between two chunks."""
+        if not chunk1_embeddings or not chunk2_embeddings:
+            return 0.0
+
+        # Calculate centroids
+        centroid1 = np.mean(chunk1_embeddings, axis=0)
+        centroid2 = np.mean(chunk2_embeddings, axis=0)
+
+        # Return distance between centroids
+        return np.linalg.norm(centroid1 - centroid2)
+
+    def _find_length_based_boundaries(
+        self,
+        sentences: list[str],
+        target_chunk_size: int,
+        min_sentences: int,
+        max_sentences: int,
+    ) -> list[int]:
+        """Find boundaries based on length when embeddings unavailable."""
+        boundaries = [0]
+        current_size = 0
+        current_sentences = 0
+
+        for i, sentence in enumerate(sentences):
+            current_size += len(sentence)
+            current_sentences += 1
+
+            # Check if we should create boundary
+            if (
+                current_size >= target_chunk_size and current_sentences >= min_sentences
+            ) or current_sentences >= max_sentences:
+                if i < len(sentences) - 1:  # Don't create boundary at last sentence
+                    boundaries.append(i + 1)
+                    current_size = 0
+                    current_sentences = 0
+
+        boundaries.append(len(sentences))
+        return sorted(list(set(boundaries)))
+
+    def _create_chunks_from_boundaries(
+        self, text: str, sentences: list[str], boundaries: list[int], metadata: dict
+    ) -> list[dict[str, Any]]:
+        """Create chunks from boundary indices."""
+        chunks = []
+
+        for i in range(len(boundaries) - 1):
+            start_idx = boundaries[i]
+            end_idx = boundaries[i + 1]
+
+            # Get sentences for this chunk
+            chunk_sentences = sentences[start_idx:end_idx]
+            chunk_text = " ".join(chunk_sentences)
+
+            chunk_data = {
+                "chunk_id": f"chunk_{i}",
+                "chunk_index": i,
+                "content": chunk_text.strip(),
+                "start_sentence": start_idx,
+                "end_sentence": end_idx,
+                "sentence_count": len(chunk_sentences),
+                "chunk_length": len(chunk_text),
+                "word_count": len(chunk_text.split()),
+                "chunking_method": "statistical",
+                **metadata,
+            }
+            chunks.append(chunk_data)
+
+        return chunks
+
+    def _create_single_chunk(
+        self, text: str, index: int, metadata: dict
+    ) -> dict[str, Any]:
+        """Create a single chunk when text is too small to split."""
+        return {
+            "chunk_id": f"chunk_{index}",
+            "chunk_index": index,
+            "content": text.strip(),
+            "start_sentence": 0,
+            "end_sentence": 0,
+            "sentence_count": 1,
+            "chunk_length": len(text),
+            "word_count": len(text.split()),
+            "chunking_method": "statistical",
+            **metadata,
+        }