kailash 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +31 -0
- kailash/__main__.py +11 -0
- kailash/cli/__init__.py +5 -0
- kailash/cli/commands.py +563 -0
- kailash/manifest.py +778 -0
- kailash/nodes/__init__.py +23 -0
- kailash/nodes/ai/__init__.py +26 -0
- kailash/nodes/ai/agents.py +417 -0
- kailash/nodes/ai/models.py +488 -0
- kailash/nodes/api/__init__.py +52 -0
- kailash/nodes/api/auth.py +567 -0
- kailash/nodes/api/graphql.py +480 -0
- kailash/nodes/api/http.py +598 -0
- kailash/nodes/api/rate_limiting.py +572 -0
- kailash/nodes/api/rest.py +665 -0
- kailash/nodes/base.py +1032 -0
- kailash/nodes/base_async.py +128 -0
- kailash/nodes/code/__init__.py +32 -0
- kailash/nodes/code/python.py +1021 -0
- kailash/nodes/data/__init__.py +125 -0
- kailash/nodes/data/readers.py +496 -0
- kailash/nodes/data/sharepoint_graph.py +623 -0
- kailash/nodes/data/sql.py +380 -0
- kailash/nodes/data/streaming.py +1168 -0
- kailash/nodes/data/vector_db.py +964 -0
- kailash/nodes/data/writers.py +529 -0
- kailash/nodes/logic/__init__.py +6 -0
- kailash/nodes/logic/async_operations.py +702 -0
- kailash/nodes/logic/operations.py +551 -0
- kailash/nodes/transform/__init__.py +5 -0
- kailash/nodes/transform/processors.py +379 -0
- kailash/runtime/__init__.py +6 -0
- kailash/runtime/async_local.py +356 -0
- kailash/runtime/docker.py +697 -0
- kailash/runtime/local.py +434 -0
- kailash/runtime/parallel.py +557 -0
- kailash/runtime/runner.py +110 -0
- kailash/runtime/testing.py +347 -0
- kailash/sdk_exceptions.py +307 -0
- kailash/tracking/__init__.py +7 -0
- kailash/tracking/manager.py +885 -0
- kailash/tracking/metrics_collector.py +342 -0
- kailash/tracking/models.py +535 -0
- kailash/tracking/storage/__init__.py +0 -0
- kailash/tracking/storage/base.py +113 -0
- kailash/tracking/storage/database.py +619 -0
- kailash/tracking/storage/filesystem.py +543 -0
- kailash/utils/__init__.py +0 -0
- kailash/utils/export.py +924 -0
- kailash/utils/templates.py +680 -0
- kailash/visualization/__init__.py +62 -0
- kailash/visualization/api.py +732 -0
- kailash/visualization/dashboard.py +951 -0
- kailash/visualization/performance.py +808 -0
- kailash/visualization/reports.py +1471 -0
- kailash/workflow/__init__.py +15 -0
- kailash/workflow/builder.py +245 -0
- kailash/workflow/graph.py +827 -0
- kailash/workflow/mermaid_visualizer.py +628 -0
- kailash/workflow/mock_registry.py +63 -0
- kailash/workflow/runner.py +302 -0
- kailash/workflow/state.py +238 -0
- kailash/workflow/visualization.py +588 -0
- kailash-0.1.0.dist-info/METADATA +710 -0
- kailash-0.1.0.dist-info/RECORD +69 -0
- kailash-0.1.0.dist-info/WHEEL +5 -0
- kailash-0.1.0.dist-info/entry_points.txt +2 -0
- kailash-0.1.0.dist-info/licenses/LICENSE +21 -0
- kailash-0.1.0.dist-info/top_level.txt +1 -0
kailash/nodes/data/vector_db.py
@@ -0,0 +1,964 @@
```python
"""Vector database and embedding nodes for the Kailash system.

This module provides nodes for interacting with vector databases and generating
embeddings. Key features include:

- Unified interface for various vector databases (Pinecone, Weaviate, Milvus, etc.)
- Embedding generation using various models (OpenAI, HuggingFace, etc.)
- Text chunking and preprocessing
- Vector similarity search
- Metadata filtering

Design Philosophy:
- Abstract away vector database differences
- Support multiple embedding models
- Provide flexible search capabilities
- Enable metadata-based filtering
- Handle text preprocessing

Common Use Cases:
- Semantic search applications
- RAG (Retrieval Augmented Generation) pipelines
- Content similarity analysis
- Document clustering
- Knowledge base retrieval

Example:
    >>> # Generate embeddings
    >>> embedder = EmbeddingNode()
    >>> embedder.configure({"model": "openai", "model_name": "text-embedding-ada-002"})
    >>> result = embedder.execute({"texts": ["Hello world", "Goodbye world"]})
    >>>
    >>> # Store in vector database
    >>> vector_db = VectorDatabaseNode()
    >>> vector_db.configure({
    ...     "provider": "pinecone",
    ...     "index_name": "my-index",
    ...     "api_key": "your-api-key"
    ... })
    >>> vector_db.execute({
    ...     "operation": "upsert",
    ...     "vectors": result["embeddings"],
    ...     "ids": ["doc1", "doc2"],
    ...     "metadata": [{"source": "file1"}, {"source": "file2"}]
    ... })
"""

from typing import Any, Dict, List

import numpy as np

from kailash.nodes.base import Node, NodeMetadata, NodeParameter, register_node
from kailash.sdk_exceptions import NodeConfigurationError, NodeExecutionError


@register_node()
class EmbeddingNode(Node):
    """Generates embeddings for text data using various embedding models.

    This node provides a unified interface for generating text embeddings using
    different models and providers (OpenAI, HuggingFace, Cohere, etc.). It handles
    text preprocessing, batching, and error recovery.

    Design Pattern:
    - Strategy pattern for different embedding providers
    - Facade pattern for unified interface
    - Builder pattern for configuration

    Features:
    - Multiple embedding model support
    - Automatic text truncation
    - Batch processing
    - Error handling with retries
    - Model caching

    Common Usage Patterns:
    - Text to vector conversion for similarity search
    - Document embedding for clustering
    - Query embedding for semantic search
    - Content analysis pipelines

    Upstream Dependencies:
    - Text preprocessing nodes (TextSplitter, TextCleaner)
    - Document reader nodes (PDFReader, DocxReader)
    - API configuration nodes

    Downstream Consumers:
    - VectorDatabaseNode
    - SimilaritySearchNode
    - ClusteringNode
    - RAG pipeline nodes

    Configuration:
        model (str): Model provider ("openai", "huggingface", "cohere")
        model_name (str): Specific model name (e.g., "text-embedding-ada-002")
        api_key (str): API key for the provider (if required)
        batch_size (int): Number of texts to process in one batch
        max_tokens (int): Maximum tokens per text
        normalize (bool): Whether to normalize embeddings

    Inputs:
        texts (List[str]): List of texts to embed

    Outputs:
        embeddings (List[List[float]]): Generated embeddings
        model_info (Dict): Model metadata (dimensions, etc.)

    Error Handling:
    - Validates model availability
    - Handles API rate limits
    - Manages token limits
    - Retries on transient failures

    Example:
        >>> embedder = EmbeddingNode()
        >>> embedder.configure({
        ...     "model": "openai",
        ...     "model_name": "text-embedding-ada-002",
        ...     "api_key": "your-api-key",
        ...     "batch_size": 100,
        ...     "normalize": True
        ... })
        >>> result = embedder.execute({
        ...     "texts": ["Sample text 1", "Sample text 2"]
        ... })
        >>> print(f"Embedding dimensions: {len(result['embeddings'][0])}")
    """

    metadata = NodeMetadata(
        name="EmbeddingNode",
        description="Generates embeddings for text data",
        version="1.0.0",
        tags={"embedding", "nlp", "vector"},
    )

    def __init__(self):
        """Initialize the embedding node.

        Sets up the node with default configuration and prepares for
        model initialization. The actual model is loaded during configuration.
        """
        super().__init__()
        self._model = None
        self._model_info = {}

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Define parameters for the embedding node."""
        return {
            "model": NodeParameter(
                name="model",
                type=str,
                description="Model provider",
                required=True,
                default="openai",
            ),
            "model_name": NodeParameter(
                name="model_name",
                type=str,
                description="Specific model name",
                required=True,
                default="text-embedding-ada-002",
            ),
            "api_key": NodeParameter(
                name="api_key",
                type=str,
                description="API key for the provider",
                required=False,
            ),
            "batch_size": NodeParameter(
                name="batch_size",
                type=int,
                description="Batch size for processing",
                required=False,
                default=100,
            ),
            "max_tokens": NodeParameter(
                name="max_tokens",
                type=int,
                description="Maximum tokens per text",
                required=False,
                default=8192,
            ),
            "normalize": NodeParameter(
                name="normalize",
                type=bool,
                description="Normalize embeddings",
                required=False,
                default=True,
            ),
        }

    def configure(self, config: Dict[str, Any]) -> None:
        """Configure the embedding node with model settings.

        Validates configuration, initializes the embedding model, and
        prepares for text processing. Different models require different
        configuration parameters.

        Args:
            config: Configuration dictionary with model settings

        Raises:
            NodeConfigurationError: If configuration is invalid
        """
        super().configure(config)

        # Initialize model based on provider
        model_provider = self.config.get("model", "openai")
        model_name = self.config.get("model_name")

        if not model_name:
            raise NodeConfigurationError("model_name is required")

        try:
            # Placeholder for actual model initialization
            self._initialize_model(model_provider, model_name)
        except Exception as e:
            raise NodeConfigurationError(f"Failed to initialize model: {str(e)}")

    def _initialize_model(self, provider: str, model_name: str) -> None:
        """Initialize the embedding model.

        Loads the specified model and prepares it for use. This is a
        placeholder for actual model initialization logic.

        Args:
            provider: Model provider name
            model_name: Specific model identifier

        Raises:
            ValueError: If provider is not supported
        """
        # Placeholder for actual model initialization
        if provider not in ["openai", "huggingface", "cohere", "custom"]:
            raise ValueError(f"Unsupported provider: {provider}")

        self._model = f"{provider}:{model_name}"  # Placeholder
        self._model_info = {
            "provider": provider,
            "model_name": model_name,
            "dimensions": 1536 if provider == "openai" else 768,
            "max_tokens": self.config.get("max_tokens", 8192),
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        """Generate embeddings for input texts.

        Implementation of the abstract run method from the base Node class.

        Args:
            **kwargs: Keyword arguments containing 'texts' list

        Returns:
            Dictionary containing embeddings and model info
        """
        return self.execute(kwargs)

    def execute(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Generate embeddings for input texts.

        Processes the input texts through the configured embedding model,
        handling batching, normalization, and error recovery.

        Args:
            inputs: Dictionary containing 'texts' list

        Returns:
            Dictionary containing embeddings and model info

        Raises:
            NodeExecutionError: If embedding generation fails
        """
        try:
            texts = inputs.get("texts", [])
            if not texts:
                raise ValueError("No texts provided for embedding")

            # Process texts in batches
            batch_size = self.config.get("batch_size", 100)
            all_embeddings = []

            for i in range(0, len(texts), batch_size):
                batch = texts[i : i + batch_size]
                batch_embeddings = self._generate_embeddings(batch)
                all_embeddings.extend(batch_embeddings)

            # Normalize if requested
            if self.config.get("normalize", True):
                all_embeddings = self._normalize_embeddings(all_embeddings)

            return {
                "embeddings": all_embeddings,
                "model_info": self._model_info.copy(),
                "count": len(all_embeddings),
            }
        except Exception as e:
            raise NodeExecutionError(f"Failed to generate embeddings: {str(e)}")

    def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a batch of texts.

        This is a placeholder for actual embedding generation logic.

        Args:
            texts: List of texts to embed

        Returns:
            List of embedding vectors
        """
        # Placeholder implementation
        dim = self._model_info.get("dimensions", 768)
        return [np.random.randn(dim).tolist() for _ in texts]

    def _normalize_embeddings(self, embeddings: List[List[float]]) -> List[List[float]]:
        """Normalize embedding vectors to unit length.

        Normalizes each embedding vector to have a magnitude of 1.0,
        which is useful for cosine similarity calculations.

        Args:
            embeddings: List of embedding vectors

        Returns:
            List of normalized embedding vectors
        """
        normalized = []
        for embedding in embeddings:
            vec = np.array(embedding)
            norm = np.linalg.norm(vec)
            if norm > 0:
                normalized.append((vec / norm).tolist())
            else:
                normalized.append(embedding)
        return normalized
```
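Because `normalize` defaults to `True`, the vectors returned by `execute` come back unit length, so cosine similarity between two embeddings reduces to a plain dot product. A minimal sketch of that property, assuming the inherited `Node.configure` stores the dict on `self.config` as the methods above read it; note that `_generate_embeddings` is a random placeholder in this release, so the similarity value is meaningless beyond illustrating the call pattern:

```python
import numpy as np

from kailash.nodes.data.vector_db import EmbeddingNode

embedder = EmbeddingNode()
embedder.configure({"model": "openai", "model_name": "text-embedding-ada-002"})

result = embedder.execute({"texts": ["Hello world", "Goodbye world"]})
a, b = (np.array(v) for v in result["embeddings"])

print(result["model_info"]["dimensions"])  # 1536 for the "openai" provider
print(round(float(np.linalg.norm(a)), 6))  # 1.0: normalize defaults to True
print(float(a @ b))  # cosine similarity == dot product for unit vectors
```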
```python
@register_node()
class VectorDatabaseNode(Node):
    """Interacts with vector databases for storing and retrieving embeddings.

    This node provides a unified interface for various vector databases including
    Pinecone, Weaviate, Milvus, Qdrant, and others. It handles vector operations,
    metadata management, and similarity search.

    Design Pattern:
    - Repository pattern for data access
    - Adapter pattern for different backends
    - Command pattern for operations

    Features:
    - Multiple vector database support
    - CRUD operations on vectors
    - Similarity search with filters
    - Hybrid search (vector + keyword)
    - Index management
    - Backup and restore

    Common Usage Patterns:
    - Storing document embeddings
    - Semantic search implementation
    - Recommendation systems
    - Content deduplication
    - Knowledge graph augmentation

    Upstream Dependencies:
    - EmbeddingNode (vector generation)
    - Data processing nodes
    - Document extraction nodes

    Downstream Consumers:
    - Search interface nodes
    - RAG pipeline nodes
    - Analytics nodes
    - Visualization nodes

    Configuration:
        provider (str): Vector database provider
        connection_string (str): Database connection details
        index_name (str): Name of the vector index
        dimension (int): Vector dimension size
        metric (str): Distance metric ("cosine", "euclidean", "dot")

    Inputs:
        operation (str): Operation to perform ("upsert", "query", "delete", "fetch")
        vectors (List[List[float]]): Vectors for upsert operations
        ids (List[str]): Vector IDs
        metadata (List[Dict]): Associated metadata
        query_vector (List[float]): Vector for similarity search
        k (int): Number of results to return
        filter (Dict): Metadata filter for search

    Outputs:
        results (List[Dict]): Operation results
        status (str): Operation status

    Error Handling:
    - Connection validation
    - Index existence checks
    - Dimension mismatch detection
    - Quota and limit management

    Example:
        >>> vector_db = VectorDatabaseNode()
        >>> vector_db.configure({
        ...     "provider": "pinecone",
        ...     "index_name": "my-knowledge-base",
        ...     "api_key": "your-api-key",
        ...     "dimension": 1536,
        ...     "metric": "cosine"
        ... })
        >>>
        >>> # Upsert vectors
        >>> result = vector_db.execute({
        ...     "operation": "upsert",
        ...     "vectors": [[0.1, 0.2, ...], [0.3, 0.4, ...]],
        ...     "ids": ["doc1", "doc2"],
        ...     "metadata": [{"title": "Document 1"}, {"title": "Document 2"}]
        ... })
        >>>
        >>> # Query similar vectors
        >>> search_result = vector_db.execute({
        ...     "operation": "query",
        ...     "query_vector": [0.15, 0.25, ...],
        ...     "k": 5,
        ...     "filter": {"category": "technical"}
        ... })
    """

    metadata = NodeMetadata(
        name="VectorDatabaseNode",
        description="Vector database operations",
        version="1.0.0",
        tags={"vector", "database", "storage"},
    )

    def __init__(self):
        """Initialize the vector database node.

        Sets up the node and prepares for database connection.
        The actual connection is established during configuration.
        """
        super().__init__()
        self._client = None
        self._index = None

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Define parameters for the vector database node."""
        return {
            "provider": NodeParameter(
                name="provider",
                type=str,
                description="Vector database provider",
                required=True,
            ),
            "connection_string": NodeParameter(
                name="connection_string",
                type=str,
                description="Database connection details",
                required=False,
            ),
            "index_name": NodeParameter(
                name="index_name",
                type=str,
                description="Vector index name",
                required=True,
            ),
            "api_key": NodeParameter(
                name="api_key",
                type=str,
                description="API key for cloud providers",
                required=False,
            ),
            "dimension": NodeParameter(
                name="dimension",
                type=int,
                description="Vector dimension size",
                required=True,
            ),
            "metric": NodeParameter(
                name="metric",
                type=str,
                description="Distance metric",
                required=False,
                default="cosine",
            ),
        }

    def configure(self, config: Dict[str, Any]) -> None:
        """Configure the vector database connection.

        Establishes connection to the vector database, validates the index,
        and prepares for vector operations.

        Args:
            config: Configuration with database settings

        Raises:
            NodeConfigurationError: If connection fails
        """
        super().configure(config)

        provider = self.config.get("provider")
        index_name = self.config.get("index_name")

        if not index_name:
            raise NodeConfigurationError("index_name is required")

        try:
            # Placeholder for actual database connection
            self._connect_to_database(provider)
        except Exception as e:
            raise NodeConfigurationError(f"Failed to connect to {provider}: {str(e)}")

    def _connect_to_database(self, provider: str) -> None:
        """Connect to the vector database.

        Establishes connection and prepares the index for operations.
        This is a placeholder for actual connection logic.

        Args:
            provider: Database provider name

        Raises:
            ValueError: If provider is not supported
        """
        if provider not in ["pinecone", "weaviate", "milvus", "qdrant", "chroma"]:
            raise ValueError(f"Unsupported provider: {provider}")

        # Placeholder for actual connection
        self._client = f"{provider}_client"
        self._index = self.config.get("index_name")

    def run(self, **kwargs) -> Dict[str, Any]:
        """Execute vector database operations.

        Implementation of the abstract run method from the base Node class.

        Args:
            **kwargs: Keyword arguments for the operation

        Returns:
            Operation results
        """
        return self.execute(kwargs)

    def execute(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Execute vector database operations.

        Performs the requested operation (upsert, query, delete, fetch)
        on the vector database.

        Args:
            inputs: Operation parameters

        Returns:
            Operation results

        Raises:
            NodeExecutionError: If operation fails
        """
        try:
            operation = inputs.get("operation", "query")

            if operation == "upsert":
                return self._upsert_vectors(inputs)
            elif operation == "query":
                return self._query_vectors(inputs)
            elif operation == "delete":
                return self._delete_vectors(inputs)
            elif operation == "fetch":
                return self._fetch_vectors(inputs)
            else:
                raise ValueError(f"Unknown operation: {operation}")
        except Exception as e:
            raise NodeExecutionError(f"Vector operation failed: {str(e)}")

    def _upsert_vectors(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Insert or update vectors in the database.

        Args:
            inputs: Vectors, IDs, and metadata

        Returns:
            Upsert status
        """
        vectors = inputs.get("vectors", [])
        ids = inputs.get("ids", [])
        metadata = inputs.get("metadata", [])

        if not vectors or not ids:
            raise ValueError("Vectors and IDs are required for upsert")

        if len(vectors) != len(ids):
            raise ValueError("Number of vectors must match number of IDs")

        # Placeholder for actual upsert
        return {
            "operation": "upsert",
            "status": "success",
            "count": len(vectors),
            "index": self._index,
        }

    def _query_vectors(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Query similar vectors from the database.

        Args:
            inputs: Query vector and parameters

        Returns:
            Search results
        """
        query_vector = inputs.get("query_vector")
        k = inputs.get("k", 10)
        filter_dict = inputs.get("filter", {})

        if not query_vector:
            raise ValueError("Query vector is required")

        # Placeholder for actual query
        return {
            "operation": "query",
            "status": "success",
            "results": [
                {
                    "id": f"doc_{i}",
                    "score": 0.95 - i * 0.05,
                    "metadata": {"title": f"Document {i}"},
                }
                for i in range(min(k, 5))
            ],
            "count": min(k, 5),
        }

    def _delete_vectors(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Delete vectors from the database.

        Args:
            inputs: Vector IDs to delete

        Returns:
            Deletion status
        """
        ids = inputs.get("ids", [])

        if not ids:
            raise ValueError("IDs are required for deletion")

        # Placeholder for actual deletion
        return {"operation": "delete", "status": "success", "count": len(ids)}

    def _fetch_vectors(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch specific vectors by ID.

        Args:
            inputs: Vector IDs to fetch

        Returns:
            Fetched vectors and metadata
        """
        ids = inputs.get("ids", [])

        if not ids:
            raise ValueError("IDs are required for fetch")

        # Placeholder for actual fetch
        return {
            "operation": "fetch",
            "status": "success",
            "vectors": {
                vec_id: {
                    "values": [0.1] * self.config.get("dimension", 768),
                    "metadata": {"id": vec_id},
                }
                for vec_id in ids
            },
        }
```
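Chained together, the two nodes above form the write and read paths of a small retrieval pipeline. A sketch of the round trip, with the same caveats as before: both backends are placeholders in this release (the query returns stubbed `doc_i` hits regardless of what was stored), and `"chroma"` is simply one of the providers `_connect_to_database` accepts:

```python
from kailash.nodes.data.vector_db import EmbeddingNode, VectorDatabaseNode

docs = ["Kailash is a workflow SDK.", "Vector search finds similar items."]

embedder = EmbeddingNode()
embedder.configure({"model": "openai", "model_name": "text-embedding-ada-002"})
embedded = embedder.execute({"texts": docs})

db = VectorDatabaseNode()
db.configure({"provider": "chroma", "index_name": "demo", "dimension": 1536})

# Write path: store one vector per document, keyed by ID.
db.execute(
    {
        "operation": "upsert",
        "vectors": embedded["embeddings"],
        "ids": [f"doc{i}" for i in range(len(docs))],
        "metadata": [{"text": t} for t in docs],
    }
)

# Read path: embed the query with the same model, then search.
query_vec = embedder.execute({"texts": ["What is Kailash?"]})["embeddings"][0]
hits = db.execute({"operation": "query", "query_vector": query_vec, "k": 2})
for hit in hits["results"]:
    print(hit["id"], hit["score"], hit["metadata"])
```

Embedding the query with the same model that embedded the documents is what keeps vector dimensions consistent in a real backend; mixing models is the dimension-mismatch error the class docstring calls out.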
```python
@register_node()
class TextSplitterNode(Node):
    """Splits text into chunks for embedding generation.

    This node provides various text splitting strategies optimized for
    embedding generation. It handles overlap, token counting, and
    semantic boundaries to create meaningful chunks.

    Design Pattern:
    - Strategy pattern for splitting algorithms
    - Chain of responsibility for preprocessing

    Features:
    - Multiple splitting strategies
    - Configurable chunk size and overlap
    - Token-aware splitting
    - Semantic boundary detection
    - Metadata preservation

    Common Usage Patterns:
    - Document chunking for RAG
    - Long text preprocessing
    - Context window management
    - Batch processing optimization

    Upstream Dependencies:
    - Document reader nodes
    - Text extraction nodes
    - PDF/DOCX processors

    Downstream Consumers:
    - EmbeddingNode
    - Text processing nodes
    - Storage nodes

    Configuration:
        strategy (str): Splitting strategy
        chunk_size (int): Maximum chunk size
        chunk_overlap (int): Overlap between chunks
        separator (str): Text separator
        preserve_sentences (bool): Keep sentence boundaries

    Inputs:
        text (str): Text to split
        metadata (Dict): Optional metadata to preserve

    Outputs:
        chunks (List[str]): Text chunks
        chunk_metadata (List[Dict]): Metadata for each chunk

    Example:
        >>> splitter = TextSplitterNode()
        >>> splitter.configure({
        ...     "strategy": "recursive",
        ...     "chunk_size": 1000,
        ...     "chunk_overlap": 200,
        ...     "preserve_sentences": True
        ... })
        >>> result = splitter.execute({
        ...     "text": "Long document text...",
        ...     "metadata": {"source": "document.pdf"}
        ... })
        >>> print(f"Created {len(result['chunks'])} chunks")
    """

    metadata = NodeMetadata(
        name="TextSplitterNode",
        description="Splits text into chunks",
        version="1.0.0",
        tags={"text", "processing", "nlp"},
    )

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Define parameters for the text splitter node."""
        return {
            "strategy": NodeParameter(
                name="strategy",
                type=str,
                description="Splitting strategy",
                required=False,
                default="recursive",
            ),
            "chunk_size": NodeParameter(
                name="chunk_size",
                type=int,
                description="Maximum chunk size",
                required=False,
                default=1000,
            ),
            "chunk_overlap": NodeParameter(
                name="chunk_overlap",
                type=int,
                description="Overlap between chunks",
                required=False,
                default=200,
            ),
            "separator": NodeParameter(
                name="separator",
                type=str,
                description="Text separator",
                required=False,
                default="\n",
            ),
            "preserve_sentences": NodeParameter(
                name="preserve_sentences",
                type=bool,
                description="Keep sentence boundaries",
                required=False,
                default=True,
            ),
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        """Split text into chunks using the configured strategy.

        Implementation of the abstract run method from the base Node class.

        Args:
            **kwargs: Keyword arguments containing text and metadata

        Returns:
            Text chunks and metadata
        """
        return self.execute(kwargs)

    def execute(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Split text into chunks using the configured strategy.

        Args:
            inputs: Text and optional metadata

        Returns:
            Text chunks and metadata

        Raises:
            NodeExecutionError: If splitting fails
        """
        try:
            text = inputs.get("text", "")
            metadata = inputs.get("metadata", {})

            if not text:
                return {"chunks": [], "chunk_metadata": []}

            strategy = self.config.get("strategy", "recursive")

            if strategy == "recursive":
                chunks = self._recursive_split(text)
            elif strategy == "character":
                chunks = self._character_split(text)
            elif strategy == "sentence":
                chunks = self._sentence_split(text)
            elif strategy == "token":
                chunks = self._token_split(text)
            else:
                raise ValueError(f"Unknown strategy: {strategy}")

            # Create metadata for each chunk
            chunk_metadata = []
            for i, chunk in enumerate(chunks):
                chunk_meta = metadata.copy()
                chunk_meta.update(
                    {
                        "chunk_index": i,
                        "chunk_size": len(chunk),
                        "total_chunks": len(chunks),
                    }
                )
                chunk_metadata.append(chunk_meta)

            return {
                "chunks": chunks,
                "chunk_metadata": chunk_metadata,
                "total_chunks": len(chunks),
            }
        except Exception as e:
            raise NodeExecutionError(f"Text splitting failed: {str(e)}")

    def _recursive_split(self, text: str) -> List[str]:
        """Split text recursively using multiple separators.

        Args:
            text: Text to split

        Returns:
            List of text chunks
        """
        # Placeholder implementation
        chunk_size = self.config.get("chunk_size", 1000)
        chunk_overlap = self.config.get("chunk_overlap", 200)

        # Guard against a non-positive stride, which would loop forever
        # when chunk_overlap >= chunk_size.
        step = max(1, chunk_size - chunk_overlap)

        chunks = []
        current_pos = 0

        while current_pos < len(text):
            end_pos = min(current_pos + chunk_size, len(text))
            chunk = text[current_pos:end_pos]
            chunks.append(chunk)
            current_pos += step

        return chunks

    def _character_split(self, text: str) -> List[str]:
        """Split text by character count.

        Args:
            text: Text to split

        Returns:
            List of text chunks
        """
        # Placeholder implementation
        chunk_size = self.config.get("chunk_size", 1000)
        separator = self.config.get("separator", "\n")

        parts = text.split(separator)
        chunks = []
        current_chunk = ""

        for part in parts:
            if len(current_chunk + part) > chunk_size:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = part
            else:
                current_chunk += separator + part if current_chunk else part

        if current_chunk:
            chunks.append(current_chunk)

        return chunks

    def _sentence_split(self, text: str) -> List[str]:
        """Split text by sentences.

        Args:
            text: Text to split

        Returns:
            List of text chunks
        """
        # Placeholder implementation - would use proper sentence tokenization
        sentences = text.split(". ")
        chunks = []
        current_chunk = ""
        chunk_size = self.config.get("chunk_size", 1000)

        for sentence in sentences:
            # Re-add the period stripped by the split, without doubling
            # it on the final sentence.
            if not sentence.endswith("."):
                sentence += "."
            if len(current_chunk) + len(sentence) > chunk_size:
                if current_chunk:
                    chunks.append(current_chunk.rstrip())
                current_chunk = sentence + " "
            else:
                current_chunk += sentence + " "

        if current_chunk:
            chunks.append(current_chunk.rstrip())

        return chunks

    def _token_split(self, text: str) -> List[str]:
        """Split text by token count.

        Args:
            text: Text to split

        Returns:
            List of text chunks
        """
        # Placeholder implementation - would use a tokenizer
        words = text.split()
        chunks = []
        current_chunk = []
        chunk_size = self.config.get("chunk_size", 1000) // 4  # Rough token estimate

        for word in words:
            if len(current_chunk) >= chunk_size:
                chunks.append(" ".join(current_chunk))
                current_chunk = [word]
            else:
                current_chunk.append(word)

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks
```
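The overlap arithmetic in `_recursive_split` is easiest to check with round numbers: the window is `chunk_size` characters wide and advances `chunk_size - chunk_overlap` characters per step. A worked example against the implementation above, under the same assumption that the inherited `configure` populates `self.config`:

```python
from kailash.nodes.data.vector_db import TextSplitterNode

splitter = TextSplitterNode()
splitter.configure({"strategy": "recursive", "chunk_size": 100, "chunk_overlap": 20})

result = splitter.execute({"text": "x" * 250, "metadata": {"source": "demo.txt"}})

# A 100-char window advancing by 100 - 20 = 80 chars starts chunks at
# offsets 0, 80, 160, and 240, so 250 characters yield four chunks.
print(result["total_chunks"])  # 4
print([m["chunk_size"] for m in result["chunk_metadata"]])  # [100, 100, 90, 10]
print(result["chunk_metadata"][0]["source"])  # metadata is copied onto each chunk
```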