kailash-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. kailash/__init__.py +31 -0
  2. kailash/__main__.py +11 -0
  3. kailash/cli/__init__.py +5 -0
  4. kailash/cli/commands.py +563 -0
  5. kailash/manifest.py +778 -0
  6. kailash/nodes/__init__.py +23 -0
  7. kailash/nodes/ai/__init__.py +26 -0
  8. kailash/nodes/ai/agents.py +417 -0
  9. kailash/nodes/ai/models.py +488 -0
  10. kailash/nodes/api/__init__.py +52 -0
  11. kailash/nodes/api/auth.py +567 -0
  12. kailash/nodes/api/graphql.py +480 -0
  13. kailash/nodes/api/http.py +598 -0
  14. kailash/nodes/api/rate_limiting.py +572 -0
  15. kailash/nodes/api/rest.py +665 -0
  16. kailash/nodes/base.py +1032 -0
  17. kailash/nodes/base_async.py +128 -0
  18. kailash/nodes/code/__init__.py +32 -0
  19. kailash/nodes/code/python.py +1021 -0
  20. kailash/nodes/data/__init__.py +125 -0
  21. kailash/nodes/data/readers.py +496 -0
  22. kailash/nodes/data/sharepoint_graph.py +623 -0
  23. kailash/nodes/data/sql.py +380 -0
  24. kailash/nodes/data/streaming.py +1168 -0
  25. kailash/nodes/data/vector_db.py +964 -0
  26. kailash/nodes/data/writers.py +529 -0
  27. kailash/nodes/logic/__init__.py +6 -0
  28. kailash/nodes/logic/async_operations.py +702 -0
  29. kailash/nodes/logic/operations.py +551 -0
  30. kailash/nodes/transform/__init__.py +5 -0
  31. kailash/nodes/transform/processors.py +379 -0
  32. kailash/runtime/__init__.py +6 -0
  33. kailash/runtime/async_local.py +356 -0
  34. kailash/runtime/docker.py +697 -0
  35. kailash/runtime/local.py +434 -0
  36. kailash/runtime/parallel.py +557 -0
  37. kailash/runtime/runner.py +110 -0
  38. kailash/runtime/testing.py +347 -0
  39. kailash/sdk_exceptions.py +307 -0
  40. kailash/tracking/__init__.py +7 -0
  41. kailash/tracking/manager.py +885 -0
  42. kailash/tracking/metrics_collector.py +342 -0
  43. kailash/tracking/models.py +535 -0
  44. kailash/tracking/storage/__init__.py +0 -0
  45. kailash/tracking/storage/base.py +113 -0
  46. kailash/tracking/storage/database.py +619 -0
  47. kailash/tracking/storage/filesystem.py +543 -0
  48. kailash/utils/__init__.py +0 -0
  49. kailash/utils/export.py +924 -0
  50. kailash/utils/templates.py +680 -0
  51. kailash/visualization/__init__.py +62 -0
  52. kailash/visualization/api.py +732 -0
  53. kailash/visualization/dashboard.py +951 -0
  54. kailash/visualization/performance.py +808 -0
  55. kailash/visualization/reports.py +1471 -0
  56. kailash/workflow/__init__.py +15 -0
  57. kailash/workflow/builder.py +245 -0
  58. kailash/workflow/graph.py +827 -0
  59. kailash/workflow/mermaid_visualizer.py +628 -0
  60. kailash/workflow/mock_registry.py +63 -0
  61. kailash/workflow/runner.py +302 -0
  62. kailash/workflow/state.py +238 -0
  63. kailash/workflow/visualization.py +588 -0
  64. kailash-0.1.0.dist-info/METADATA +710 -0
  65. kailash-0.1.0.dist-info/RECORD +69 -0
  66. kailash-0.1.0.dist-info/WHEEL +5 -0
  67. kailash-0.1.0.dist-info/entry_points.txt +2 -0
  68. kailash-0.1.0.dist-info/licenses/LICENSE +21 -0
  69. kailash-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,964 @@
+ """Vector database and embedding nodes for the Kailash system.
+
+ This module provides nodes for interacting with vector databases and generating
+ embeddings. Key features include:
+
+ - Unified interface for various vector databases (Pinecone, Weaviate, Milvus, etc.)
+ - Embedding generation using various models (OpenAI, HuggingFace, etc.)
+ - Text chunking and preprocessing
+ - Vector similarity search
+ - Metadata filtering
+
+ Design Philosophy:
+ - Abstract away vector database differences
+ - Support multiple embedding models
+ - Provide flexible search capabilities
+ - Enable metadata-based filtering
+ - Handle text preprocessing
+
+ Common Use Cases:
+ - Semantic search applications
+ - RAG (Retrieval Augmented Generation) pipelines
+ - Content similarity analysis
+ - Document clustering
+ - Knowledge base retrieval
+
+ Example:
+     >>> # Generate embeddings
+     >>> embedder = EmbeddingNode()
+     >>> embedder.configure({"model": "openai", "model_name": "text-embedding-ada-002"})
+     >>> result = embedder.execute({"texts": ["Hello world", "Goodbye world"]})
+     >>>
+     >>> # Store in vector database
+     >>> vector_db = VectorDatabaseNode()
+     >>> vector_db.configure({
+     ...     "provider": "pinecone",
+     ...     "index_name": "my-index",
+     ...     "api_key": "your-api-key"
+     ... })
+     >>> vector_db.execute({
+     ...     "operation": "upsert",
+     ...     "vectors": result["embeddings"],
+     ...     "ids": ["doc1", "doc2"],
+     ...     "metadata": [{"source": "file1"}, {"source": "file2"}]
+     ... })
+ """
+
+ from typing import Any, Dict, List
+
+ import numpy as np
+
+ from kailash.nodes.base import Node, NodeMetadata, NodeParameter, register_node
+ from kailash.sdk_exceptions import NodeConfigurationError, NodeExecutionError
+
+
+ @register_node()
+ class EmbeddingNode(Node):
+     """Generates embeddings for text data using various embedding models.
+
+     This node provides a unified interface for generating text embeddings using
+     different models and providers (OpenAI, HuggingFace, Cohere, etc.). It handles
+     text preprocessing, batching, and error recovery.
+
+     Design Pattern:
+     - Strategy pattern for different embedding providers
+     - Facade pattern for unified interface
+     - Builder pattern for configuration
+
+     Features:
+     - Multiple embedding model support
+     - Automatic text truncation
+     - Batch processing
+     - Error handling with retries
+     - Model caching
+
+     Common Usage Patterns:
+     - Text to vector conversion for similarity search
+     - Document embedding for clustering
+     - Query embedding for semantic search
+     - Content analysis pipelines
+
+     Upstream Dependencies:
+     - Text preprocessing nodes (TextSplitter, TextCleaner)
+     - Document reader nodes (PDFReader, DocxReader)
+     - API configuration nodes
+
+     Downstream Consumers:
+     - VectorDatabaseNode
+     - SimilaritySearchNode
+     - ClusteringNode
+     - RAG pipeline nodes
+
+     Configuration:
+         model (str): Model provider ("openai", "huggingface", "cohere")
+         model_name (str): Specific model name (e.g., "text-embedding-ada-002")
+         api_key (str): API key for the provider (if required)
+         batch_size (int): Number of texts to process in one batch
+         max_tokens (int): Maximum tokens per text
+         normalize (bool): Whether to normalize embeddings
+
+     Inputs:
+         texts (List[str]): List of texts to embed
+
+     Outputs:
+         embeddings (List[List[float]]): Generated embeddings
+         model_info (Dict): Model metadata (dimensions, etc.)
+
+     Error Handling:
+     - Validates model availability
+     - Handles API rate limits
+     - Manages token limits
+     - Retries on transient failures
+
+     Example:
+         >>> embedder = EmbeddingNode()
+         >>> embedder.configure({
+         ...     "model": "openai",
+         ...     "model_name": "text-embedding-ada-002",
+         ...     "api_key": "your-api-key",
+         ...     "batch_size": 100,
+         ...     "normalize": True
+         ... })
+         >>> result = embedder.execute({
+         ...     "texts": ["Sample text 1", "Sample text 2"]
+         ... })
+         >>> print(f"Embedding dimensions: {len(result['embeddings'][0])}")
+     """
+
+     metadata = NodeMetadata(
+         name="EmbeddingNode",
+         description="Generates embeddings for text data",
+         version="1.0.0",
+         tags={"embedding", "nlp", "vector"},
+     )
+
+     def __init__(self):
+         """Initialize the embedding node.
+
+         Sets up the node with default configuration and prepares for
+         model initialization. The actual model is loaded during configuration.
+         """
+         super().__init__()
+         self._model = None
+         self._model_info = {}
+
+     def get_parameters(self) -> Dict[str, NodeParameter]:
+         """Define parameters for the embedding node."""
+         return {
+             "model": NodeParameter(
+                 name="model",
+                 type=str,
+                 description="Model provider",
+                 required=True,
+                 default="openai",
+             ),
+             "model_name": NodeParameter(
+                 name="model_name",
+                 type=str,
+                 description="Specific model name",
+                 required=True,
+                 default="text-embedding-ada-002",
+             ),
+             "api_key": NodeParameter(
+                 name="api_key",
+                 type=str,
+                 description="API key for the provider",
+                 required=False,
+             ),
+             "batch_size": NodeParameter(
+                 name="batch_size",
+                 type=int,
+                 description="Batch size for processing",
+                 required=False,
+                 default=100,
+             ),
+             "max_tokens": NodeParameter(
+                 name="max_tokens",
+                 type=int,
+                 description="Maximum tokens per text",
+                 required=False,
+                 default=8192,
+             ),
+             "normalize": NodeParameter(
+                 name="normalize",
+                 type=bool,
+                 description="Normalize embeddings",
+                 required=False,
+                 default=True,
+             ),
+         }
+
+     def configure(self, config: Dict[str, Any]) -> None:
+         """Configure the embedding node with model settings.
+
+         Validates configuration, initializes the embedding model, and
+         prepares for text processing. Different models require different
+         configuration parameters.
+
+         Args:
+             config: Configuration dictionary with model settings
+
+         Raises:
+             NodeConfigurationError: If configuration is invalid
+         """
+         super().configure(config)
+
+         # Initialize model based on provider
+         model_provider = self.config.get("model", "openai")
+         model_name = self.config.get("model_name")
+
+         if not model_name:
+             raise NodeConfigurationError("model_name is required")
+
+         try:
+             # Placeholder for actual model initialization
+             self._initialize_model(model_provider, model_name)
+         except Exception as e:
+             raise NodeConfigurationError(f"Failed to initialize model: {str(e)}") from e
+
+     def _initialize_model(self, provider: str, model_name: str) -> None:
+         """Initialize the embedding model.
+
+         Loads the specified model and prepares it for use. This is a
+         placeholder for actual model initialization logic.
+
+         Args:
+             provider: Model provider name
+             model_name: Specific model identifier
+
+         Raises:
+             ValueError: If provider is not supported
+         """
+         # Placeholder for actual model initialization
+         if provider not in ["openai", "huggingface", "cohere", "custom"]:
+             raise ValueError(f"Unsupported provider: {provider}")
+
+         self._model = f"{provider}:{model_name}"  # Placeholder
+         self._model_info = {
+             "provider": provider,
+             "model_name": model_name,
+             "dimensions": 1536 if provider == "openai" else 768,
+             "max_tokens": self.config.get("max_tokens", 8192),
+         }
+
+     def run(self, **kwargs) -> Dict[str, Any]:
+         """Generate embeddings for input texts.
+
+         Implementation of the abstract run method from the base Node class.
+
+         Args:
+             **kwargs: Keyword arguments containing 'texts' list
+
+         Returns:
+             Dictionary containing embeddings and model info
+         """
+         return self.execute(kwargs)
+
+     def execute(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+         """Generate embeddings for input texts.
+
+         Processes the input texts through the configured embedding model,
+         handling batching, normalization, and error recovery.
+
+         Args:
+             inputs: Dictionary containing 'texts' list
+
+         Returns:
+             Dictionary containing embeddings and model info
+
+         Raises:
+             NodeExecutionError: If embedding generation fails
+         """
+         try:
+             texts = inputs.get("texts", [])
+             if not texts:
+                 raise ValueError("No texts provided for embedding")
+
+             # Process texts in batches
+             batch_size = self.config.get("batch_size", 100)
+             all_embeddings = []
+
+             for i in range(0, len(texts), batch_size):
+                 batch = texts[i : i + batch_size]
+                 batch_embeddings = self._generate_embeddings(batch)
+                 all_embeddings.extend(batch_embeddings)
+
+             # Normalize if requested
+             if self.config.get("normalize", True):
+                 all_embeddings = self._normalize_embeddings(all_embeddings)
+
+             return {
+                 "embeddings": all_embeddings,
+                 "model_info": self._model_info.copy(),
+                 "count": len(all_embeddings),
+             }
+         except Exception as e:
+             raise NodeExecutionError(f"Failed to generate embeddings: {str(e)}") from e
+
+     def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
+         """Generate embeddings for a batch of texts.
+
+         This is a placeholder for actual embedding generation logic.
+
+         Args:
+             texts: List of texts to embed
+
+         Returns:
+             List of embedding vectors
+         """
+         # Placeholder implementation
+         dim = self._model_info.get("dimensions", 768)
+         return [np.random.randn(dim).tolist() for _ in texts]
+
+     def _normalize_embeddings(self, embeddings: List[List[float]]) -> List[List[float]]:
+         """Normalize embedding vectors to unit length.
+
+         Normalizes each embedding vector to have a magnitude of 1.0,
+         which is useful for cosine similarity calculations.
+
+         Args:
+             embeddings: List of embedding vectors
+
+         Returns:
+             List of normalized embedding vectors
+         """
+         normalized = []
+         for embedding in embeddings:
+             vec = np.array(embedding)
+             norm = np.linalg.norm(vec)
+             if norm > 0:
+                 normalized.append((vec / norm).tolist())
+             else:
+                 normalized.append(embedding)
+         return normalized
+
+
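
A quick aside on why `_normalize_embeddings` scales vectors to unit length: once embeddings are unit-normalized, cosine similarity reduces to a plain dot product, which is what vector databases compute for the "cosine" metric. A minimal sketch, not part of the package:

    import numpy as np

    a = np.array([3.0, 4.0])
    b = np.array([1.0, 0.0])

    # Unit-normalize, mirroring _normalize_embeddings above
    a_hat = a / np.linalg.norm(a)  # [0.6, 0.8]
    b_hat = b / np.linalg.norm(b)  # [1.0, 0.0]

    # Cosine similarity of the originals...
    cos = a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))  # 0.6
    # ...equals the dot product of the normalized vectors
    assert np.isclose(cos, a_hat.dot(b_hat))
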
+ @register_node()
+ class VectorDatabaseNode(Node):
+     """Interacts with vector databases for storing and retrieving embeddings.
+
+     This node provides a unified interface for various vector databases including
+     Pinecone, Weaviate, Milvus, Qdrant, and others. It handles vector operations,
+     metadata management, and similarity search.
+
+     Design Pattern:
+     - Repository pattern for data access
+     - Adapter pattern for different backends
+     - Command pattern for operations
+
+     Features:
+     - Multiple vector database support
+     - CRUD operations on vectors
+     - Similarity search with filters
+     - Hybrid search (vector + keyword)
+     - Index management
+     - Backup and restore
+
+     Common Usage Patterns:
+     - Storing document embeddings
+     - Semantic search implementation
+     - Recommendation systems
+     - Content deduplication
+     - Knowledge graph augmentation
+
+     Upstream Dependencies:
+     - EmbeddingNode (vector generation)
+     - Data processing nodes
+     - Document extraction nodes
+
+     Downstream Consumers:
+     - Search interface nodes
+     - RAG pipeline nodes
+     - Analytics nodes
+     - Visualization nodes
+
+     Configuration:
+         provider (str): Vector database provider
+         connection_string (str): Database connection details
+         index_name (str): Name of the vector index
+         dimension (int): Vector dimension size
+         metric (str): Distance metric ("cosine", "euclidean", "dot")
+
+     Inputs:
+         operation (str): Operation to perform ("upsert", "query", "delete", "fetch")
+         vectors (List[List[float]]): Vectors for upsert operations
+         ids (List[str]): Vector IDs
+         metadata (List[Dict]): Associated metadata
+         query_vector (List[float]): Vector for similarity search
+         k (int): Number of results to return
+         filter (Dict): Metadata filter for search
+
+     Outputs:
+         results (List[Dict]): Operation results
+         status (str): Operation status
+
+     Error Handling:
+     - Connection validation
+     - Index existence checks
+     - Dimension mismatch detection
+     - Quota and limit management
+
+     Example:
+         >>> vector_db = VectorDatabaseNode()
+         >>> vector_db.configure({
+         ...     "provider": "pinecone",
+         ...     "index_name": "my-knowledge-base",
+         ...     "api_key": "your-api-key",
+         ...     "dimension": 1536,
+         ...     "metric": "cosine"
+         ... })
+         >>>
+         >>> # Upsert vectors
+         >>> result = vector_db.execute({
+         ...     "operation": "upsert",
+         ...     "vectors": [[0.1, 0.2, ...], [0.3, 0.4, ...]],
+         ...     "ids": ["doc1", "doc2"],
+         ...     "metadata": [{"title": "Document 1"}, {"title": "Document 2"}]
+         ... })
+         >>>
+         >>> # Query similar vectors
+         >>> search_result = vector_db.execute({
+         ...     "operation": "query",
+         ...     "query_vector": [0.15, 0.25, ...],
+         ...     "k": 5,
+         ...     "filter": {"category": "technical"}
+         ... })
+     """
+
+     metadata = NodeMetadata(
+         name="VectorDatabaseNode",
+         description="Vector database operations",
+         version="1.0.0",
+         tags={"vector", "database", "storage"},
+     )
+
+     def __init__(self):
+         """Initialize the vector database node.
+
+         Sets up the node and prepares for database connection.
+         The actual connection is established during configuration.
+         """
+         super().__init__()
+         self._client = None
+         self._index = None
+
+     def get_parameters(self) -> Dict[str, NodeParameter]:
+         """Define parameters for the vector database node."""
+         return {
+             "provider": NodeParameter(
+                 name="provider",
+                 type=str,
+                 description="Vector database provider",
+                 required=True,
+             ),
+             "connection_string": NodeParameter(
+                 name="connection_string",
+                 type=str,
+                 description="Database connection details",
+                 required=False,
+             ),
+             "index_name": NodeParameter(
+                 name="index_name",
+                 type=str,
+                 description="Vector index name",
+                 required=True,
+             ),
+             "api_key": NodeParameter(
+                 name="api_key",
+                 type=str,
+                 description="API key for cloud providers",
+                 required=False,
+             ),
+             "dimension": NodeParameter(
+                 name="dimension",
+                 type=int,
+                 description="Vector dimension size",
+                 required=True,
+             ),
+             "metric": NodeParameter(
+                 name="metric",
+                 type=str,
+                 description="Distance metric",
+                 required=False,
+                 default="cosine",
+             ),
+         }
+
+     def configure(self, config: Dict[str, Any]) -> None:
+         """Configure the vector database connection.
+
+         Establishes connection to the vector database, validates the index,
+         and prepares for vector operations.
+
+         Args:
+             config: Configuration with database settings
+
+         Raises:
+             NodeConfigurationError: If connection fails
+         """
+         super().configure(config)
+
+         provider = self.config.get("provider")
+         index_name = self.config.get("index_name")
+
+         if not index_name:
+             raise NodeConfigurationError("index_name is required")
+
+         try:
+             # Placeholder for actual database connection
+             self._connect_to_database(provider)
+         except Exception as e:
+             raise NodeConfigurationError(f"Failed to connect to {provider}: {str(e)}") from e
+
+     def _connect_to_database(self, provider: str) -> None:
+         """Connect to the vector database.
+
+         Establishes connection and prepares the index for operations.
+         This is a placeholder for actual connection logic.
+
+         Args:
+             provider: Database provider name
+
+         Raises:
+             ValueError: If provider is not supported
+         """
+         if provider not in ["pinecone", "weaviate", "milvus", "qdrant", "chroma"]:
+             raise ValueError(f"Unsupported provider: {provider}")
+
+         # Placeholder for actual connection
+         self._client = f"{provider}_client"
+         self._index = self.config.get("index_name")
+
+     def run(self, **kwargs) -> Dict[str, Any]:
+         """Execute vector database operations.
+
+         Implementation of the abstract run method from the base Node class.
+
+         Args:
+             **kwargs: Keyword arguments for the operation
+
+         Returns:
+             Operation results
+         """
+         return self.execute(kwargs)
+
+     def execute(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+         """Execute vector database operations.
+
+         Performs the requested operation (upsert, query, delete, fetch)
+         on the vector database.
+
+         Args:
+             inputs: Operation parameters
+
+         Returns:
+             Operation results
+
+         Raises:
+             NodeExecutionError: If operation fails
+         """
+         try:
+             operation = inputs.get("operation", "query")
+
+             if operation == "upsert":
+                 return self._upsert_vectors(inputs)
+             elif operation == "query":
+                 return self._query_vectors(inputs)
+             elif operation == "delete":
+                 return self._delete_vectors(inputs)
+             elif operation == "fetch":
+                 return self._fetch_vectors(inputs)
+             else:
+                 raise ValueError(f"Unknown operation: {operation}")
+         except Exception as e:
+             raise NodeExecutionError(f"Vector operation failed: {str(e)}") from e
+
+     def _upsert_vectors(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+         """Insert or update vectors in the database.
+
+         Args:
+             inputs: Vectors, IDs, and metadata
+
+         Returns:
+             Upsert status
+         """
+         vectors = inputs.get("vectors", [])
+         ids = inputs.get("ids", [])
+         metadata = inputs.get("metadata", [])
+
+         if not vectors or not ids:
+             raise ValueError("Vectors and IDs are required for upsert")
+
+         if len(vectors) != len(ids):
+             raise ValueError("Number of vectors must match number of IDs")
+
+         # Placeholder for actual upsert
+         return {
+             "operation": "upsert",
+             "status": "success",
+             "count": len(vectors),
+             "index": self._index,
+         }
+
+     def _query_vectors(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+         """Query similar vectors from the database.
+
+         Args:
+             inputs: Query vector and parameters
+
+         Returns:
+             Search results
+         """
+         query_vector = inputs.get("query_vector")
+         k = inputs.get("k", 10)
+         filter_dict = inputs.get("filter", {})
+
+         if not query_vector:
+             raise ValueError("Query vector is required")
+
+         # Placeholder for actual query
+         return {
+             "operation": "query",
+             "status": "success",
+             "results": [
+                 {
+                     "id": f"doc_{i}",
+                     "score": 0.95 - i * 0.05,
+                     "metadata": {"title": f"Document {i}"},
+                 }
+                 for i in range(min(k, 5))
+             ],
+             "count": min(k, 5),
+         }
+
+     def _delete_vectors(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+         """Delete vectors from the database.
+
+         Args:
+             inputs: Vector IDs to delete
+
+         Returns:
+             Deletion status
+         """
+         ids = inputs.get("ids", [])
+
+         if not ids:
+             raise ValueError("IDs are required for deletion")
+
+         # Placeholder for actual deletion
+         return {"operation": "delete", "status": "success", "count": len(ids)}
+
+     def _fetch_vectors(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+         """Fetch specific vectors by ID.
+
+         Args:
+             inputs: Vector IDs to fetch
+
+         Returns:
+             Fetched vectors and metadata
+         """
+         ids = inputs.get("ids", [])
+
+         if not ids:
+             raise ValueError("IDs are required for fetch")
+
+         # Placeholder for actual fetch; use vec_id to avoid shadowing the id() builtin
+         return {
+             "operation": "fetch",
+             "status": "success",
+             "vectors": {
+                 vec_id: {
+                     "values": [0.1] * self.config.get("dimension", 768),
+                     "metadata": {"id": vec_id},
+                 }
+                 for vec_id in ids
+             },
+         }
+
+
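
For reference, the query path returns results in a stable shape regardless of backend. A minimal sketch of a call against the placeholder backend above (index name is a stand-in; the hits are synthetic, scores shown rounded to two decimals):

    db = VectorDatabaseNode()
    db.configure({"provider": "chroma", "index_name": "kb", "dimension": 768})
    out = db.execute({
        "operation": "query",
        "query_vector": [0.1] * 768,
        "k": 3,
        "filter": {"category": "technical"},  # accepted, but unused by the placeholder
    })
    # out == {
    #     "operation": "query",
    #     "status": "success",
    #     "results": [
    #         {"id": "doc_0", "score": 0.95, "metadata": {"title": "Document 0"}},
    #         {"id": "doc_1", "score": 0.90, "metadata": {"title": "Document 1"}},
    #         {"id": "doc_2", "score": 0.85, "metadata": {"title": "Document 2"}},
    #     ],
    #     "count": 3,
    # }
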
+ @register_node()
+ class TextSplitterNode(Node):
+     """Splits text into chunks for embedding generation.
+
+     This node provides various text splitting strategies optimized for
+     embedding generation. It handles overlap, token counting, and
+     semantic boundaries to create meaningful chunks.
+
+     Design Pattern:
+     - Strategy pattern for splitting algorithms
+     - Chain of responsibility for preprocessing
+
+     Features:
+     - Multiple splitting strategies
+     - Configurable chunk size and overlap
+     - Token-aware splitting
+     - Semantic boundary detection
+     - Metadata preservation
+
+     Common Usage Patterns:
+     - Document chunking for RAG
+     - Long text preprocessing
+     - Context window management
+     - Batch processing optimization
+
+     Upstream Dependencies:
+     - Document reader nodes
+     - Text extraction nodes
+     - PDF/DOCX processors
+
+     Downstream Consumers:
+     - EmbeddingNode
+     - Text processing nodes
+     - Storage nodes
+
+     Configuration:
+         strategy (str): Splitting strategy
+         chunk_size (int): Maximum chunk size
+         chunk_overlap (int): Overlap between chunks
+         separator (str): Text separator
+         preserve_sentences (bool): Keep sentence boundaries
+
+     Inputs:
+         text (str): Text to split
+         metadata (Dict): Optional metadata to preserve
+
+     Outputs:
+         chunks (List[str]): Text chunks
+         chunk_metadata (List[Dict]): Metadata for each chunk
+
+     Example:
+         >>> splitter = TextSplitterNode()
+         >>> splitter.configure({
+         ...     "strategy": "recursive",
+         ...     "chunk_size": 1000,
+         ...     "chunk_overlap": 200,
+         ...     "preserve_sentences": True
+         ... })
+         >>> result = splitter.execute({
+         ...     "text": "Long document text...",
+         ...     "metadata": {"source": "document.pdf"}
+         ... })
+         >>> print(f"Created {len(result['chunks'])} chunks")
+     """
+
+     metadata = NodeMetadata(
+         name="TextSplitterNode",
+         description="Splits text into chunks",
+         version="1.0.0",
+         tags={"text", "processing", "nlp"},
+     )
+
+     def get_parameters(self) -> Dict[str, NodeParameter]:
+         """Define parameters for the text splitter node."""
+         return {
+             "strategy": NodeParameter(
+                 name="strategy",
+                 type=str,
+                 description="Splitting strategy",
+                 required=False,
+                 default="recursive",
+             ),
+             "chunk_size": NodeParameter(
+                 name="chunk_size",
+                 type=int,
+                 description="Maximum chunk size",
+                 required=False,
+                 default=1000,
+             ),
+             "chunk_overlap": NodeParameter(
+                 name="chunk_overlap",
+                 type=int,
+                 description="Overlap between chunks",
+                 required=False,
+                 default=200,
+             ),
+             "separator": NodeParameter(
+                 name="separator",
+                 type=str,
+                 description="Text separator",
+                 required=False,
+                 default="\n",
+             ),
+             "preserve_sentences": NodeParameter(
+                 name="preserve_sentences",
+                 type=bool,
+                 description="Keep sentence boundaries",
+                 required=False,
+                 default=True,
+             ),
+         }
+
+     def run(self, **kwargs) -> Dict[str, Any]:
+         """Split text into chunks using configured strategy.
+
+         Implementation of the abstract run method from the base Node class.
+
+         Args:
+             **kwargs: Keyword arguments containing text and metadata
+
+         Returns:
+             Text chunks and metadata
+         """
+         return self.execute(kwargs)
+
+     def execute(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+         """Split text into chunks using configured strategy.
+
+         Args:
+             inputs: Text and optional metadata
+
+         Returns:
+             Text chunks and metadata
+
+         Raises:
+             NodeExecutionError: If splitting fails
+         """
+         try:
+             text = inputs.get("text", "")
+             metadata = inputs.get("metadata", {})
+
+             if not text:
+                 return {"chunks": [], "chunk_metadata": []}
+
+             strategy = self.config.get("strategy", "recursive")
+
+             if strategy == "recursive":
+                 chunks = self._recursive_split(text)
+             elif strategy == "character":
+                 chunks = self._character_split(text)
+             elif strategy == "sentence":
+                 chunks = self._sentence_split(text)
+             elif strategy == "token":
+                 chunks = self._token_split(text)
+             else:
+                 raise ValueError(f"Unknown strategy: {strategy}")
+
+             # Create metadata for each chunk
+             chunk_metadata = []
+             for i, chunk in enumerate(chunks):
+                 chunk_meta = metadata.copy()
+                 chunk_meta.update(
+                     {
+                         "chunk_index": i,
+                         "chunk_size": len(chunk),
+                         "total_chunks": len(chunks),
+                     }
+                 )
+                 chunk_metadata.append(chunk_meta)
+
+             return {
+                 "chunks": chunks,
+                 "chunk_metadata": chunk_metadata,
+                 "total_chunks": len(chunks),
+             }
+         except Exception as e:
+             raise NodeExecutionError(f"Text splitting failed: {str(e)}") from e
+
+     def _recursive_split(self, text: str) -> List[str]:
+         """Split text recursively using multiple separators.
+
+         Args:
+             text: Text to split
+
+         Returns:
+             List of text chunks
+         """
+         # Placeholder implementation
+         chunk_size = self.config.get("chunk_size", 1000)
+         chunk_overlap = self.config.get("chunk_overlap", 200)
+
+         # Guard against a non-advancing window when overlap >= chunk size
+         step = max(1, chunk_size - chunk_overlap)
+
+         chunks = []
+         current_pos = 0
+
+         while current_pos < len(text):
+             end_pos = min(current_pos + chunk_size, len(text))
+             chunk = text[current_pos:end_pos]
+             chunks.append(chunk)
+             current_pos += step
+
+         return chunks
+
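
The arithmetic above means consecutive chunks share chunk_overlap characters: the window advances by chunk_size - chunk_overlap per step. A quick worked example with the defaults (not part of the package):

    # chunk_size=1000, chunk_overlap=200 -> the window advances 800 chars per step
    text = "x" * 2400
    starts = list(range(0, len(text), 1000 - 200))        # [0, 800, 1600]
    chunks = [text[s : s + 1000] for s in starts]
    assert [len(c) for c in chunks] == [1000, 1000, 800]  # 3 overlapping chunks
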
+     def _character_split(self, text: str) -> List[str]:
+         """Split text by character count.
+
+         Args:
+             text: Text to split
+
+         Returns:
+             List of text chunks
+         """
+         # Placeholder implementation
+         chunk_size = self.config.get("chunk_size", 1000)
+         separator = self.config.get("separator", "\n")
+
+         parts = text.split(separator)
+         chunks = []
+         current_chunk = ""
+
+         for part in parts:
+             if len(current_chunk + part) > chunk_size:
+                 if current_chunk:
+                     chunks.append(current_chunk)
+                 current_chunk = part
+             else:
+                 current_chunk += separator + part if current_chunk else part
+
+         if current_chunk:
+             chunks.append(current_chunk)
+
+         return chunks
+
+     def _sentence_split(self, text: str) -> List[str]:
+         """Split text by sentences.
+
+         Args:
+             text: Text to split
+
+         Returns:
+             List of text chunks
+         """
+         # Placeholder implementation - would use proper sentence tokenization
+         sentences = text.split(". ")
+         chunks = []
+         current_chunk = ""
+         chunk_size = self.config.get("chunk_size", 1000)
+
+         for sentence in sentences:
+             # Restore the period stripped by split(); the final sentence keeps its own
+             if not sentence.endswith("."):
+                 sentence += "."
+             if current_chunk and len(current_chunk) + len(sentence) + 1 > chunk_size:
+                 chunks.append(current_chunk)
+                 current_chunk = sentence
+             else:
+                 current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
+
+         if current_chunk:
+             chunks.append(current_chunk)
+
+         return chunks
+
+     def _token_split(self, text: str) -> List[str]:
+         """Split text by token count.
+
+         Args:
+             text: Text to split
+
+         Returns:
+             List of text chunks
+         """
+         # Placeholder implementation - would use tokenizer
+         words = text.split()
+         chunks = []
+         current_chunk = []
+         chunk_size = self.config.get("chunk_size", 1000) // 4  # Rough token estimate
+
+         for word in words:
+             if len(current_chunk) >= chunk_size:
+                 chunks.append(" ".join(current_chunk))
+                 current_chunk = [word]
+             else:
+                 current_chunk.append(word)
+
+         if current_chunk:
+             chunks.append(" ".join(current_chunk))
+
+         return chunks
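
How the three nodes compose is implicit in their docstrings: split, embed, then upsert. A minimal end-to-end ingestion sketch using only the configure/execute interface shown above (index name, API key, and document text are placeholders, and the bundled implementations return placeholder data):

    splitter = TextSplitterNode()
    splitter.configure({"strategy": "recursive", "chunk_size": 1000, "chunk_overlap": 200})

    embedder = EmbeddingNode()
    embedder.configure({"model": "openai", "model_name": "text-embedding-ada-002"})

    store = VectorDatabaseNode()
    store.configure({
        "provider": "pinecone",
        "index_name": "kb",         # placeholder
        "api_key": "your-api-key",  # placeholder
        "dimension": 1536,
        "metric": "cosine",
    })

    # 1. Chunk the document, carrying source metadata through to each chunk
    split = splitter.execute({
        "text": "Long document text...",
        "metadata": {"source": "document.pdf"},
    })

    # 2. Embed every chunk in one call; the node batches internally
    embedded = embedder.execute({"texts": split["chunks"]})

    # 3. Upsert vectors with stable per-chunk IDs and the preserved metadata
    store.execute({
        "operation": "upsert",
        "vectors": embedded["embeddings"],
        "ids": [f"document.pdf#{m['chunk_index']}" for m in split["chunk_metadata"]],
        "metadata": split["chunk_metadata"],
    })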