kailash-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +31 -0
- kailash/__main__.py +11 -0
- kailash/cli/__init__.py +5 -0
- kailash/cli/commands.py +563 -0
- kailash/manifest.py +778 -0
- kailash/nodes/__init__.py +23 -0
- kailash/nodes/ai/__init__.py +26 -0
- kailash/nodes/ai/agents.py +417 -0
- kailash/nodes/ai/models.py +488 -0
- kailash/nodes/api/__init__.py +52 -0
- kailash/nodes/api/auth.py +567 -0
- kailash/nodes/api/graphql.py +480 -0
- kailash/nodes/api/http.py +598 -0
- kailash/nodes/api/rate_limiting.py +572 -0
- kailash/nodes/api/rest.py +665 -0
- kailash/nodes/base.py +1032 -0
- kailash/nodes/base_async.py +128 -0
- kailash/nodes/code/__init__.py +32 -0
- kailash/nodes/code/python.py +1021 -0
- kailash/nodes/data/__init__.py +125 -0
- kailash/nodes/data/readers.py +496 -0
- kailash/nodes/data/sharepoint_graph.py +623 -0
- kailash/nodes/data/sql.py +380 -0
- kailash/nodes/data/streaming.py +1168 -0
- kailash/nodes/data/vector_db.py +964 -0
- kailash/nodes/data/writers.py +529 -0
- kailash/nodes/logic/__init__.py +6 -0
- kailash/nodes/logic/async_operations.py +702 -0
- kailash/nodes/logic/operations.py +551 -0
- kailash/nodes/transform/__init__.py +5 -0
- kailash/nodes/transform/processors.py +379 -0
- kailash/runtime/__init__.py +6 -0
- kailash/runtime/async_local.py +356 -0
- kailash/runtime/docker.py +697 -0
- kailash/runtime/local.py +434 -0
- kailash/runtime/parallel.py +557 -0
- kailash/runtime/runner.py +110 -0
- kailash/runtime/testing.py +347 -0
- kailash/sdk_exceptions.py +307 -0
- kailash/tracking/__init__.py +7 -0
- kailash/tracking/manager.py +885 -0
- kailash/tracking/metrics_collector.py +342 -0
- kailash/tracking/models.py +535 -0
- kailash/tracking/storage/__init__.py +0 -0
- kailash/tracking/storage/base.py +113 -0
- kailash/tracking/storage/database.py +619 -0
- kailash/tracking/storage/filesystem.py +543 -0
- kailash/utils/__init__.py +0 -0
- kailash/utils/export.py +924 -0
- kailash/utils/templates.py +680 -0
- kailash/visualization/__init__.py +62 -0
- kailash/visualization/api.py +732 -0
- kailash/visualization/dashboard.py +951 -0
- kailash/visualization/performance.py +808 -0
- kailash/visualization/reports.py +1471 -0
- kailash/workflow/__init__.py +15 -0
- kailash/workflow/builder.py +245 -0
- kailash/workflow/graph.py +827 -0
- kailash/workflow/mermaid_visualizer.py +628 -0
- kailash/workflow/mock_registry.py +63 -0
- kailash/workflow/runner.py +302 -0
- kailash/workflow/state.py +238 -0
- kailash/workflow/visualization.py +588 -0
- kailash-0.1.0.dist-info/METADATA +710 -0
- kailash-0.1.0.dist-info/RECORD +69 -0
- kailash-0.1.0.dist-info/WHEEL +5 -0
- kailash-0.1.0.dist-info/entry_points.txt +2 -0
- kailash-0.1.0.dist-info/licenses/LICENSE +21 -0
- kailash-0.1.0.dist-info/top_level.txt +1 -0
kailash/nodes/data/__init__.py
@@ -0,0 +1,125 @@
"""Data processing nodes for the Kailash SDK.

This package provides comprehensive data input/output nodes that serve as the
primary interface between the Kailash workflow system and external data sources.
These nodes form the foundation of most workflows by enabling data ingestion,
persistence, and real-time processing.

Module Organization:
    - readers.py: Data source nodes for reading files
    - writers.py: Data sink nodes for writing files
    - sql.py: SQL database interaction nodes
    - vector_db.py: Vector database and embedding nodes
    - streaming.py: Real-time streaming data nodes

Design Philosophy:
    1. Consistent interfaces across data sources
    2. Type-safe parameter validation
    3. Memory-efficient processing
    4. Comprehensive error handling
    5. Format-specific optimizations
    6. Real-time and batch processing support

Node Categories:
    - Readers: Bring external data into workflows
    - Writers: Persist processed data to files
    - SQL: Interact with relational databases
    - Vector DB: Handle embeddings and similarity search
    - Streaming: Process real-time data streams

Usage Patterns:
    1. ETL pipelines: Read → Transform → Write
    2. Data processing: Read → Analyze → Export
    3. RAG pipelines: Text → Embed → Store → Search
    4. Real-time analytics: Stream → Process → Aggregate
    5. Database operations: Query → Transform → Insert

Integration Points:
    - Upstream: File systems, APIs, databases, streams
    - Downstream: Transform nodes, AI models, analytics
    - Parallel: Other data nodes in workflow

Advanced Features:
    - Connection pooling for databases
    - Batch processing for efficiency
    - Real-time streaming support
    - Vector similarity search
    - Event-driven architectures

Error Handling:
    All nodes provide detailed error messages for:
    - Connection failures
    - Authentication errors
    - Format/schema issues
    - Rate limiting
    - Resource constraints

Example Workflows:
    # Traditional ETL
    workflow = Workflow()
    workflow.add_node('read', CSVReader(file_path='input.csv'))
    workflow.add_node('transform', DataTransform())
    workflow.add_node('write', JSONWriter(file_path='output.json'))
    workflow.connect('read', 'transform')
    workflow.connect('transform', 'write')

    # RAG Pipeline
    workflow = Workflow()
    workflow.add_node('split', TextSplitterNode())
    workflow.add_node('embed', EmbeddingNode())
    workflow.add_node('store', VectorDatabaseNode())
    workflow.connect('split', 'embed')
    workflow.connect('embed', 'store')

    # Real-time Processing
    workflow = Workflow()
    workflow.add_node('consume', KafkaConsumerNode())
    workflow.add_node('process', StreamProcessor())
    workflow.add_node('publish', StreamPublisherNode())
    workflow.connect('consume', 'process')
    workflow.connect('process', 'publish')
"""

from kailash.nodes.data.readers import CSVReader, JSONReader, TextReader
from kailash.nodes.data.sharepoint_graph import (
    SharePointGraphReader,
    SharePointGraphWriter,
)
from kailash.nodes.data.sql import SQLDatabaseNode, SQLQueryBuilderNode
from kailash.nodes.data.streaming import (
    EventStreamNode,
    KafkaConsumerNode,
    StreamPublisherNode,
    WebSocketNode,
)
from kailash.nodes.data.vector_db import (
    EmbeddingNode,
    TextSplitterNode,
    VectorDatabaseNode,
)
from kailash.nodes.data.writers import CSVWriter, JSONWriter, TextWriter

__all__ = [
    # Readers
    "CSVReader",
    "JSONReader",
    "TextReader",
    "SharePointGraphReader",
    # Writers
    "CSVWriter",
    "JSONWriter",
    "TextWriter",
    "SharePointGraphWriter",
    # SQL
    "SQLDatabaseNode",
    "SQLQueryBuilderNode",
    # Vector DB
    "EmbeddingNode",
    "VectorDatabaseNode",
    "TextSplitterNode",
    # Streaming
    "KafkaConsumerNode",
    "StreamPublisherNode",
    "WebSocketNode",
    "EventStreamNode",
]
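The `__all__` list above lets downstream code import every data node from `kailash.nodes.data` directly instead of reaching into the individual modules. Below is a minimal sketch of the "Traditional ETL" pattern from the package docstring, restricted to the calls shown there (`Workflow`, `add_node`, `connect`); the `kailash.workflow` import path is an assumption for illustration, and the transform step from the docstring is left out to keep the sketch short.

```python
# Sketch only: mirrors the "Traditional ETL" example in the package docstring above.
# Assumption: Workflow is exposed by kailash.workflow (that package is added in this
# release, but its public name is not visible in this section).
from kailash.workflow import Workflow
from kailash.nodes.data import CSVReader, JSONWriter  # re-exported via __all__

workflow = Workflow()
workflow.add_node("read", CSVReader(file_path="input.csv"))
workflow.add_node("write", JSONWriter(file_path="output.json"))
workflow.connect("read", "write")  # a transform node would normally sit between these
```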
kailash/nodes/data/readers.py
@@ -0,0 +1,496 @@
"""Data reader nodes for the Kailash SDK.

This module provides node implementations for reading data from various file formats.
These nodes serve as data sources in workflows, bringing external data into the
Kailash processing pipeline.

Design Philosophy:
    1. Unified interface for different file formats
    2. Consistent output format (always returns {"data": ...})
    3. Robust error handling for file operations
    4. Memory-efficient processing where possible
    5. Type-safe parameter validation

Node Categories:
    - CSVReader: Tabular data from CSV files
    - JSONReader: Structured data from JSON files
    - TextReader: Raw text from any text file

Upstream Components:
    - FileSystem: Provides files to read
    - Workflow: Creates and configures reader nodes
    - User Input: Specifies file paths and options

Downstream Consumers:
    - Transform nodes: Process the loaded data
    - Writer nodes: Export data to different formats
    - Logic nodes: Make decisions based on data
    - AI nodes: Use data for model input
"""

import csv
import json
from typing import Any, Dict

from kailash.nodes.base import Node, NodeParameter, register_node


@register_node()
class CSVReader(Node):
    """Reads data from a CSV file.

    This node provides robust CSV file reading capabilities with support for
    various delimiters, header detection, and encoding options. It's designed
    to handle common CSV formats and edge cases.

    Design Features:
        1. Automatic header detection
        2. Configurable delimiters
        3. Memory-efficient line-by-line reading
        4. Consistent dictionary output format
        5. Unicode support through encoding parameter

    Data Flow:
        - Input: File path and configuration parameters
        - Processing: Reads CSV line by line, converting to dictionaries
        - Output: List of dictionaries (with headers) or list of lists

    Common Usage Patterns:
        1. Reading data exports from databases
        2. Processing spreadsheet data
        3. Loading configuration from CSV
        4. Ingesting sensor data logs

    Upstream Sources:
        - File system paths from user input
        - Output paths from previous nodes
        - Configuration management systems

    Downstream Consumers:
        - DataTransformer: Processes tabular data
        - Aggregator: Summarizes data
        - CSVWriter: Reformats and saves
        - Visualizer: Creates charts from data

    Error Handling:
        - FileNotFoundError: Invalid file path
        - PermissionError: Insufficient read permissions
        - UnicodeDecodeError: Encoding mismatch
        - csv.Error: Malformed CSV data

    Example:
        # Read customer data with headers
        reader = CSVReader(
            file_path='customers.csv',
            headers=True,
            delimiter=','
        )
        result = reader.execute()
        # result['data'] = [
        #     {'id': '1', 'name': 'John', 'age': '30'},
        #     {'id': '2', 'name': 'Jane', 'age': '25'}
        # ]
    """

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Define input parameters for CSV reading.

        This method specifies the configuration options for reading CSV files,
        providing flexibility while maintaining sensible defaults.

        Parameter Design:
            1. file_path: Required for locating the data source
            2. headers: Optional with smart default (True)
            3. delimiter: Optional with standard default (',')
            4. index_column: Optional column to use as dictionary key

        The parameters are designed to handle common CSV variants while
        keeping the interface simple for typical use cases.

        Returns:
            Dictionary of parameter definitions used by:
            - Input validation during execution
            - UI generation for configuration
            - Workflow validation for connections
            - Documentation and help systems
        """
        return {
            "file_path": NodeParameter(
                name="file_path",
                type=str,
                required=True,
                description="Path to the CSV file to read",
            ),
            "headers": NodeParameter(
                name="headers",
                type=bool,
                required=False,
                default=True,
                description="Whether the CSV has headers",
            ),
            "delimiter": NodeParameter(
                name="delimiter",
                type=str,
                required=False,
                default=",",
                description="CSV delimiter character",
            ),
            "index_column": NodeParameter(
                name="index_column",
                type=str,
                required=False,
                description="Column to use as index for creating a dictionary",
            ),
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        """Execute CSV reading operation.

        This method performs the actual file reading, handling both headerless
        and header-based CSV formats. It uses Python's csv module for robust
        parsing of various CSV dialects.

        Processing Steps:
            1. Opens file with UTF-8 encoding (standard)
            2. Creates csv.reader with specified delimiter
            3. Processes headers if present
            4. Converts rows to appropriate format
            5. Returns standardized output

        Memory Considerations:
            - Loads entire file into memory
            - Suitable for files up to ~100MB
            - For larger files, consider streaming approach

        Output Format:
            - With headers: List of dictionaries
            - Without headers: List of lists
            - With index_column: Also returns dictionary indexed by the column
            - Always wrapped in {"data": ...} for consistency

        Args:
            **kwargs: Validated parameters including:
                - file_path: Path to CSV file
                - headers: Whether to treat first row as headers
                - delimiter: Character separating values
                - index_column: Column to use as key for indexed dictionary

        Returns:
            Dictionary with:
            - 'data' key containing list of dicts or lists
            - 'data_indexed' key (if index_column provided) containing dict

        Raises:
            FileNotFoundError: If file doesn't exist
            PermissionError: If file can't be read
            UnicodeDecodeError: If encoding is wrong
            KeyError: If index_column doesn't exist in headers

        Downstream usage:
            - Transform nodes expect consistent data structure
            - Writers can directly output the data
            - Analyzers can process row-by-row
            - data_indexed is useful for lookups and joins
        """
        file_path = kwargs["file_path"]
        headers = kwargs.get("headers", True)
        delimiter = kwargs.get("delimiter", ",")
        index_column = kwargs.get("index_column")

        data = []
        data_indexed = {}

        with open(file_path, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter=delimiter)

            if headers:
                header_row = next(reader)

                # Verify index_column exists if specified
                if index_column and index_column not in header_row:
                    raise KeyError(
                        f"Index column '{index_column}' not found in headers: {header_row}"
                    )

                index_pos = header_row.index(index_column) if index_column else None

                for row in reader:
                    row_dict = dict(zip(header_row, row))
                    data.append(row_dict)

                    # If index column specified, add to indexed dictionary
                    if index_column and index_pos < len(row):
                        key = row[index_pos]
                        data_indexed[key] = row_dict
            else:
                for row in reader:
                    data.append(row)

        result = {"data": data}
        if index_column:
            result["data_indexed"] = data_indexed

        return result
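The `index_column` behaviour documented above is easiest to see with a concrete call. A small sketch with illustrative file contents, showing the shape of `data` versus `data_indexed`:

```python
# Sketch: CSVReader with index_column, as described in run() above.
# Assume customers.csv contains (illustrative data):
#   id,name,age
#   1,John,30
#   2,Jane,25
reader = CSVReader(file_path="customers.csv", index_column="id")
result = reader.execute()

# result["data"] is the usual list of row dicts (all values are strings):
#   [{'id': '1', 'name': 'John', 'age': '30'},
#    {'id': '2', 'name': 'Jane', 'age': '25'}]
# result["data_indexed"] keys each row dict by the id column, which is what
# makes lookups and joins cheap downstream:
#   result["data_indexed"]["2"]["name"] == "Jane"
```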


@register_node()
class JSONReader(Node):
    """Reads data from a JSON file.

    This node handles JSON file reading with support for complex nested
    structures, arrays, and objects. It preserves the original JSON
    structure while ensuring compatibility with downstream nodes.

    Design Features:
        1. Preserves JSON structure integrity
        2. Handles nested objects and arrays
        3. Unicode-safe reading
        4. Automatic type preservation
        5. Memory-efficient for reasonable file sizes

    Data Flow:
        - Input: JSON file path
        - Processing: Parse JSON maintaining structure
        - Output: Python objects matching JSON structure

    Common Usage Patterns:
        1. Loading configuration files
        2. Reading API response caches
        3. Processing structured data exports
        4. Loading machine learning datasets

    Upstream Sources:
        - API response saves
        - Configuration management
        - Data export systems
        - Previous JSONWriter outputs

    Downstream Consumers:
        - Transform nodes: Process structured data
        - Logic nodes: Navigate JSON structure
        - JSONWriter: Re-export with modifications
        - AI nodes: Use as structured input

    Error Handling:
        - FileNotFoundError: Missing file
        - json.JSONDecodeError: Invalid JSON syntax
        - PermissionError: Access denied
        - MemoryError: File too large

    Example:
        # Read API response data
        reader = JSONReader(file_path='api_response.json')
        result = reader.execute()
        # result['data'] = {
        #     'status': 'success',
        #     'items': [{'id': 1, 'name': 'Item1'}],
        #     'metadata': {'version': '1.0'}
        # }
    """

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Define input parameters for JSON reading.

        Simple parameter definition reflecting JSON's self-describing nature.
        Unlike CSV, JSON files don't require format configuration.

        Design Choice:
            - Single required parameter for simplicity
            - No encoding parameter (UTF-8 standard for JSON)
            - No structure hints needed (self-describing format)

        Returns:
            Dictionary with single file_path parameter
        """
        return {
            "file_path": NodeParameter(
                name="file_path",
                type=str,
                required=True,
                description="Path to the JSON file to read",
            )
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        """Execute JSON reading operation.

        Reads and parses JSON file, preserving the original structure
        and types. The json.load() function handles the parsing and
        type conversion automatically.

        Processing Steps:
            1. Opens file with UTF-8 encoding
            2. Parses JSON to Python objects
            3. Preserves structure (objects→dicts, arrays→lists)
            4. Returns wrapped in standard format

        Type Mappings:
            - JSON objects → Python dicts
            - JSON arrays → Python lists
            - JSON strings → Python strings
            - JSON numbers → Python int/float
            - JSON booleans → Python bool
            - JSON null → Python None

        Args:
            **kwargs: Validated parameters including:
                - file_path: Path to JSON file

        Returns:
            Dictionary with 'data' key containing the parsed JSON

        Raises:
            FileNotFoundError: If file doesn't exist
            json.JSONDecodeError: If JSON is malformed
            PermissionError: If file can't be read

        Downstream usage:
            - Structure can be directly navigated
            - Compatible with JSONWriter for round-trip
            - Transform nodes can process nested data
        """
        file_path = kwargs["file_path"]

        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        return {"data": data}
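The "round-trip" note above is the usual way this node is paired with JSONWriter: read, edit the parsed structure in place, write it back. A brief sketch; JSONWriter's construction and execute() signature are assumptions mirrored from the readers, since writers.py is not shown in this section.

```python
# Sketch: JSON read-modify-write round trip.
reader = JSONReader(file_path="api_response.json")
result = reader.execute()                # {'data': ...} with JSON types preserved

payload = result["data"]
payload["metadata"]["version"] = "1.1"   # nested dicts/lists can be edited directly

# Assumption: JSONWriter takes file_path at construction and the data to write
# at execute() time; its real parameters are defined in writers.py.
writer = JSONWriter(file_path="api_response_updated.json")
writer.execute(data=payload)
```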


@register_node()
class TextReader(Node):
    """Reads text from a file.

    This node provides simple text file reading with encoding support.
    It's designed for processing plain text files, logs, documents,
    and any text-based format not handled by specialized readers.

    Design Features:
        1. Flexible encoding support
        2. Reads entire file as single string
        3. Preserves line endings and whitespace
        4. Handles various text encodings
        5. Simple, predictable output format

    Data Flow:
        - Input: File path and encoding
        - Processing: Read entire file as text
        - Output: Single text string

    Common Usage Patterns:
        1. Reading log files
        2. Processing documentation
        3. Loading templates
        4. Reading configuration files
        5. Processing natural language data

    Upstream Sources:
        - Log file generators
        - Document management systems
        - Template repositories
        - Previous TextWriter outputs

    Downstream Consumers:
        - NLP processors: Analyze text content
        - Pattern matchers: Search for patterns
        - TextWriter: Save processed text
        - AI models: Process natural language

    Error Handling:
        - FileNotFoundError: Missing file
        - PermissionError: Access denied
        - UnicodeDecodeError: Wrong encoding
        - MemoryError: File too large

    Example:
        # Read a log file
        reader = TextReader(
            file_path='application.log',
            encoding='utf-8'
        )
        result = reader.execute()
        # result['text'] = "2024-01-01 INFO: Application started\\n..."
    """

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Define input parameters for text reading.

        Provides essential parameters for text file reading with
        encoding flexibility to handle international text.

        Parameter Design:
            1. file_path: Required for file location
            2. encoding: Optional with UTF-8 default

        The encoding parameter is crucial for:
            - International text support
            - Legacy system compatibility
            - Log file processing
            - Cross-platform text handling

        Returns:
            Dictionary of parameter definitions
        """
        return {
            "file_path": NodeParameter(
                name="file_path",
                type=str,
                required=True,
                description="Path to the text file to read",
            ),
            "encoding": NodeParameter(
                name="encoding",
                type=str,
                required=False,
                default="utf-8",
                description="File encoding",
            ),
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        """Execute text reading operation.

        Reads entire text file into memory as a single string,
        preserving all formatting, line endings, and whitespace.

        Processing Steps:
            1. Opens file with specified encoding
            2. Reads entire content as string
            3. Preserves original formatting
            4. Returns in standard format

        Memory Considerations:
            - Loads entire file into memory
            - Suitable for files up to ~10MB
            - Large files may need streaming approach

        Output Note:
            - Returns {"text": ...} not {"data": ...}
            - Different from CSV/JSON readers for clarity
            - Text is unprocessed, raw content

        Args:
            **kwargs: Validated parameters including:
                - file_path: Path to text file
                - encoding: Character encoding

        Returns:
            Dictionary with 'text' key containing file content

        Raises:
            FileNotFoundError: If file doesn't exist
            UnicodeDecodeError: If encoding is incorrect
            PermissionError: If file can't be read

        Downstream usage:
            - NLP nodes can tokenize/analyze
            - Pattern nodes can search content
            - Writers can save processed text
        """
        file_path = kwargs["file_path"]
        encoding = kwargs.get("encoding", "utf-8")

        with open(file_path, "r", encoding=encoding) as f:
            text = f.read()

        return {"text": text}
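Because TextReader returns the raw string under 'text' rather than 'data', downstream handling is plain string processing. A short sketch following the log-file example above:

```python
# Sketch: consuming TextReader's {'text': ...} output from the log example above.
reader = TextReader(file_path="application.log", encoding="utf-8")
result = reader.execute()

lines = result["text"].splitlines()              # line endings were preserved by the reader
errors = [ln for ln in lines if " ERROR" in ln]  # simple pattern scan over the raw text
print(f"{len(errors)} error lines out of {len(lines)} total")
```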