kailash 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +33 -1
- kailash/access_control/__init__.py +129 -0
- kailash/access_control/managers.py +461 -0
- kailash/access_control/rule_evaluators.py +467 -0
- kailash/access_control_abac.py +825 -0
- kailash/config/__init__.py +27 -0
- kailash/config/database_config.py +359 -0
- kailash/database/__init__.py +28 -0
- kailash/database/execution_pipeline.py +499 -0
- kailash/middleware/__init__.py +306 -0
- kailash/middleware/auth/__init__.py +33 -0
- kailash/middleware/auth/access_control.py +436 -0
- kailash/middleware/auth/auth_manager.py +422 -0
- kailash/middleware/auth/jwt_auth.py +477 -0
- kailash/middleware/auth/kailash_jwt_auth.py +616 -0
- kailash/middleware/communication/__init__.py +37 -0
- kailash/middleware/communication/ai_chat.py +989 -0
- kailash/middleware/communication/api_gateway.py +802 -0
- kailash/middleware/communication/events.py +470 -0
- kailash/middleware/communication/realtime.py +710 -0
- kailash/middleware/core/__init__.py +21 -0
- kailash/middleware/core/agent_ui.py +890 -0
- kailash/middleware/core/schema.py +643 -0
- kailash/middleware/core/workflows.py +396 -0
- kailash/middleware/database/__init__.py +63 -0
- kailash/middleware/database/base.py +113 -0
- kailash/middleware/database/base_models.py +525 -0
- kailash/middleware/database/enums.py +106 -0
- kailash/middleware/database/migrations.py +12 -0
- kailash/{api/database.py → middleware/database/models.py} +183 -291
- kailash/middleware/database/repositories.py +685 -0
- kailash/middleware/database/session_manager.py +19 -0
- kailash/middleware/mcp/__init__.py +38 -0
- kailash/middleware/mcp/client_integration.py +585 -0
- kailash/middleware/mcp/enhanced_server.py +576 -0
- kailash/nodes/__init__.py +25 -3
- kailash/nodes/admin/__init__.py +35 -0
- kailash/nodes/admin/audit_log.py +794 -0
- kailash/nodes/admin/permission_check.py +864 -0
- kailash/nodes/admin/role_management.py +823 -0
- kailash/nodes/admin/security_event.py +1519 -0
- kailash/nodes/admin/user_management.py +944 -0
- kailash/nodes/ai/a2a.py +24 -7
- kailash/nodes/ai/ai_providers.py +1 -0
- kailash/nodes/ai/embedding_generator.py +11 -11
- kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
- kailash/nodes/ai/llm_agent.py +407 -2
- kailash/nodes/ai/self_organizing.py +85 -10
- kailash/nodes/api/auth.py +287 -6
- kailash/nodes/api/rest.py +151 -0
- kailash/nodes/auth/__init__.py +17 -0
- kailash/nodes/auth/directory_integration.py +1228 -0
- kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
- kailash/nodes/auth/mfa.py +2338 -0
- kailash/nodes/auth/risk_assessment.py +872 -0
- kailash/nodes/auth/session_management.py +1093 -0
- kailash/nodes/auth/sso.py +1040 -0
- kailash/nodes/base.py +344 -13
- kailash/nodes/base_cycle_aware.py +4 -2
- kailash/nodes/base_with_acl.py +1 -1
- kailash/nodes/code/python.py +293 -12
- kailash/nodes/compliance/__init__.py +9 -0
- kailash/nodes/compliance/data_retention.py +1888 -0
- kailash/nodes/compliance/gdpr.py +2004 -0
- kailash/nodes/data/__init__.py +22 -2
- kailash/nodes/data/async_connection.py +469 -0
- kailash/nodes/data/async_sql.py +757 -0
- kailash/nodes/data/async_vector.py +598 -0
- kailash/nodes/data/readers.py +767 -0
- kailash/nodes/data/retrieval.py +360 -1
- kailash/nodes/data/sharepoint_graph.py +397 -21
- kailash/nodes/data/sql.py +94 -5
- kailash/nodes/data/streaming.py +68 -8
- kailash/nodes/data/vector_db.py +54 -4
- kailash/nodes/enterprise/__init__.py +13 -0
- kailash/nodes/enterprise/batch_processor.py +741 -0
- kailash/nodes/enterprise/data_lineage.py +497 -0
- kailash/nodes/logic/convergence.py +31 -9
- kailash/nodes/logic/operations.py +14 -3
- kailash/nodes/mixins/__init__.py +8 -0
- kailash/nodes/mixins/event_emitter.py +201 -0
- kailash/nodes/mixins/mcp.py +9 -4
- kailash/nodes/mixins/security.py +165 -0
- kailash/nodes/monitoring/__init__.py +7 -0
- kailash/nodes/monitoring/performance_benchmark.py +2497 -0
- kailash/nodes/rag/__init__.py +284 -0
- kailash/nodes/rag/advanced.py +1615 -0
- kailash/nodes/rag/agentic.py +773 -0
- kailash/nodes/rag/conversational.py +999 -0
- kailash/nodes/rag/evaluation.py +875 -0
- kailash/nodes/rag/federated.py +1188 -0
- kailash/nodes/rag/graph.py +721 -0
- kailash/nodes/rag/multimodal.py +671 -0
- kailash/nodes/rag/optimized.py +933 -0
- kailash/nodes/rag/privacy.py +1059 -0
- kailash/nodes/rag/query_processing.py +1335 -0
- kailash/nodes/rag/realtime.py +764 -0
- kailash/nodes/rag/registry.py +547 -0
- kailash/nodes/rag/router.py +837 -0
- kailash/nodes/rag/similarity.py +1854 -0
- kailash/nodes/rag/strategies.py +566 -0
- kailash/nodes/rag/workflows.py +575 -0
- kailash/nodes/security/__init__.py +19 -0
- kailash/nodes/security/abac_evaluator.py +1411 -0
- kailash/nodes/security/audit_log.py +91 -0
- kailash/nodes/security/behavior_analysis.py +1893 -0
- kailash/nodes/security/credential_manager.py +401 -0
- kailash/nodes/security/rotating_credentials.py +760 -0
- kailash/nodes/security/security_event.py +132 -0
- kailash/nodes/security/threat_detection.py +1103 -0
- kailash/nodes/testing/__init__.py +9 -0
- kailash/nodes/testing/credential_testing.py +499 -0
- kailash/nodes/transform/__init__.py +10 -2
- kailash/nodes/transform/chunkers.py +592 -1
- kailash/nodes/transform/processors.py +484 -14
- kailash/nodes/validation.py +321 -0
- kailash/runtime/access_controlled.py +1 -1
- kailash/runtime/async_local.py +41 -7
- kailash/runtime/docker.py +1 -1
- kailash/runtime/local.py +474 -55
- kailash/runtime/parallel.py +1 -1
- kailash/runtime/parallel_cyclic.py +1 -1
- kailash/runtime/testing.py +210 -2
- kailash/utils/migrations/__init__.py +25 -0
- kailash/utils/migrations/generator.py +433 -0
- kailash/utils/migrations/models.py +231 -0
- kailash/utils/migrations/runner.py +489 -0
- kailash/utils/secure_logging.py +342 -0
- kailash/workflow/__init__.py +16 -0
- kailash/workflow/cyclic_runner.py +3 -4
- kailash/workflow/graph.py +70 -2
- kailash/workflow/resilience.py +249 -0
- kailash/workflow/templates.py +726 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/METADATA +253 -20
- kailash-0.4.0.dist-info/RECORD +223 -0
- kailash/api/__init__.py +0 -17
- kailash/api/__main__.py +0 -6
- kailash/api/studio_secure.py +0 -893
- kailash/mcp/__main__.py +0 -13
- kailash/mcp/server_new.py +0 -336
- kailash/mcp/servers/__init__.py +0 -12
- kailash-0.3.1.dist-info/RECORD +0 -136
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/WHEEL +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/top_level.txt +0 -0
kailash/nodes/enterprise/batch_processor.py (new file)
@@ -0,0 +1,741 @@
"""Batch processing node for optimized operations with rate limiting and progress tracking.

This module provides intelligent batch processing capabilities that optimize
operations for APIs, databases, and data processing tasks. It includes
rate limiting, parallel processing, progress tracking, and automatic
error recovery.

Key Features:
- Intelligent batching strategies
- API rate limit awareness
- Parallel processing support
- Progress tracking and reporting
- Automatic retry and error recovery
- Memory management
- Configurable batch sizes
"""

import asyncio
import math
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta
from typing import Any, Callable, Dict, List, Optional, Union

from kailash.nodes.base import Node, NodeMetadata, NodeParameter, register_node
from kailash.sdk_exceptions import NodeConfigurationError, NodeExecutionError


@register_node()
class BatchProcessorNode(Node):
    """Node for intelligent batch processing with optimization and rate limiting.

    This node processes large datasets or operations in optimized batches,
    providing rate limiting, parallel processing, progress tracking, and
    automatic error recovery for enterprise-scale operations.

    Key capabilities:
    1. Intelligent batching strategies
    2. API rate limit awareness
    3. Parallel processing support
    4. Progress tracking and reporting
    5. Automatic retry and error recovery
    6. Memory management optimization

    Example:
        >>> processor = BatchProcessorNode()
        >>> result = processor.execute(
        ...     operation="process_data",
        ...     data_items=large_dataset,
        ...     batch_size=100,
        ...     processing_function="data_transformation",
        ...     rate_limit_per_second=10,
        ...     parallel_workers=4,
        ...     retry_failed_batches=True
        ... )
    """

    def get_metadata(self) -> NodeMetadata:
        """Get node metadata for discovery and orchestration."""
        return NodeMetadata(
            name="Batch Processor Node",
            description="Intelligent batch processing with optimization and rate limiting",
            tags={"enterprise", "batch", "processing", "optimization", "parallel"},
            version="1.0.0",
            author="Kailash SDK",
        )

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Define input parameters for batch processing operations."""
        return {
            "operation": NodeParameter(
                name="operation",
                type=str,
                required=False,
                default="process_data",
                description="Operation: process_data, api_batch_calls, database_batch_operations",
            ),
            "data_items": NodeParameter(
                name="data_items",
                type=list,
                required=False,
                description="List of data items to process in batches",
            ),
            "batch_size": NodeParameter(
                name="batch_size",
                type=int,
                required=False,
                default=100,
                description="Number of items to process per batch",
            ),
            "processing_function": NodeParameter(
                name="processing_function",
                type=str,
                required=False,
                description="Name of the processing function to apply to each batch",
            ),
            "processing_code": NodeParameter(
                name="processing_code",
                type=str,
                required=False,
                description="Python code to execute for each batch",
            ),
            "rate_limit_per_second": NodeParameter(
                name="rate_limit_per_second",
                type=float,
                required=False,
                default=10.0,
                description="Maximum operations per second (rate limiting)",
            ),
            "parallel_workers": NodeParameter(
                name="parallel_workers",
                type=int,
                required=False,
                default=1,
                description="Number of parallel workers for processing",
            ),
            "retry_failed_batches": NodeParameter(
                name="retry_failed_batches",
                type=bool,
                required=False,
                default=True,
                description="Whether to retry failed batches",
            ),
            "max_retries": NodeParameter(
                name="max_retries",
                type=int,
                required=False,
                default=3,
                description="Maximum number of retries for failed batches",
            ),
            "retry_delay": NodeParameter(
                name="retry_delay",
                type=float,
                required=False,
                default=1.0,
                description="Delay between retries in seconds",
            ),
            "progress_callback": NodeParameter(
                name="progress_callback",
                type=str,
                required=False,
                description="Function name to call for progress updates",
            ),
            "memory_limit_mb": NodeParameter(
                name="memory_limit_mb",
                type=int,
                required=False,
                default=1024,
                description="Memory limit in MB for batch processing",
            ),
            "adaptive_batch_size": NodeParameter(
                name="adaptive_batch_size",
                type=bool,
                required=False,
                default=True,
                description="Whether to adapt batch size based on performance",
            ),
            "api_endpoint": NodeParameter(
                name="api_endpoint",
                type=str,
                required=False,
                description="API endpoint for batch API calls",
            ),
            "api_headers": NodeParameter(
                name="api_headers",
                type=dict,
                required=False,
                default={},
                description="Headers for API requests",
            ),
            "database_config": NodeParameter(
                name="database_config",
                type=dict,
                required=False,
                description="Database configuration for batch database operations",
            ),
        }

    def __init__(self, **kwargs):
        """Initialize the BatchProcessorNode."""
        super().__init__(**kwargs)
        self._processing_stats = {
            "total_items": 0,
            "processed_items": 0,
            "failed_items": 0,
            "total_batches": 0,
            "successful_batches": 0,
            "failed_batches": 0,
            "start_time": None,
            "end_time": None,
            "processing_rate": 0.0,
        }

    def _create_batches(
        self, data_items: List[Any], batch_size: int
    ) -> List[List[Any]]:
        """Create batches from the data items."""
        batches = []
        for i in range(0, len(data_items), batch_size):
            batch = data_items[i : i + batch_size]
            batches.append(batch)
        return batches

    def _calculate_optimal_batch_size(
        self,
        total_items: int,
        rate_limit: float,
        parallel_workers: int,
        memory_limit_mb: int,
    ) -> int:
        """Calculate optimal batch size based on constraints."""
        # Estimate memory per item (rough heuristic: 1KB per item)
        estimated_memory_per_item = 1024  # bytes
        max_items_by_memory = (
            memory_limit_mb * 1024 * 1024
        ) // estimated_memory_per_item

        # Calculate based on rate limiting
        # If we have parallel workers, we can process more items per second
        effective_rate = rate_limit * parallel_workers

        # Target processing time per batch (1-10 seconds)
        target_batch_time = min(10.0, max(1.0, 60.0 / effective_rate))
        rate_based_batch_size = int(effective_rate * target_batch_time)

        # Use minimum of constraints, but ensure at least 1 item per batch
        optimal_size = max(
            1, min(max_items_by_memory, rate_based_batch_size, total_items)
        )

        return optimal_size
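
To make the sizing heuristic concrete, here is how it works out under the node's defaults (memory_limit_mb=1024, rate_limit_per_second=10.0, parallel_workers=1). This is editorial arithmetic derived from the method above, not part of the diff:

# Walk-through of _calculate_optimal_batch_size with the defaults
max_items_by_memory = (1024 * 1024 * 1024) // 1024      # 1,048,576 items fit in 1024 MB at ~1 KB each
effective_rate = 10.0 * 1                               # 10 items/s with one worker
target_batch_time = min(10.0, max(1.0, 60.0 / 10.0))    # 6.0 s per batch
rate_based_batch_size = int(10.0 * 6.0)                 # 60 items
# optimal = max(1, min(1_048_576, 60, total_items)) -> 60 for any large dataset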

    def _execute_processing_code(
        self, batch: List[Any], processing_code: str
    ) -> Dict[str, Any]:
        """Execute custom processing code on a batch."""
        try:
            # Create execution context
            exec_globals = {
                "__builtins__": __builtins__,
                "batch": batch,
                "len": len,
                "range": range,
                "enumerate": enumerate,
                "datetime": datetime,
            }

            # Execute the code
            exec(processing_code, exec_globals)

            # Get the result (expect 'result' variable to be set)
            if "result" in exec_globals:
                return {
                    "success": True,
                    "result": exec_globals["result"],
                    "processed_count": len(batch),
                }
            else:
                return {
                    "success": False,
                    "error": "Processing code must set 'result' variable",
                    "processed_count": 0,
                }

        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "processed_count": 0,
            }
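
The contract for processing_code is thus: the snippet sees the current batch bound to the name `batch` and must assign a `result` variable. A minimal hypothetical snippet that satisfies it (illustrative, not shipped in the package):

processing_code = """
result = [str(item).upper() for item in batch]
"""
# processor._execute_processing_code(["a", "b"], processing_code)
# -> {"success": True, "result": ["A", "B"], "processed_count": 2}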

    def _process_single_batch(
        self,
        batch: List[Any],
        batch_index: int,
        processing_code: Optional[str] = None,
        processing_function: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Process a single batch of data."""
        batch_start_time = time.time()

        try:
            if processing_code:
                result = self._execute_processing_code(batch, processing_code)
            elif processing_function:
                # For this example, we'll use a simple default processing
                # In a real implementation, this would look up the function
                result = {
                    "success": True,
                    "result": [f"processed_{item}" for item in batch],
                    "processed_count": len(batch),
                }
            else:
                # Default processing: just pass through with metadata
                result = {
                    "success": True,
                    "result": batch,
                    "processed_count": len(batch),
                }

            batch_end_time = time.time()
            processing_time = batch_end_time - batch_start_time

            return {
                "batch_index": batch_index,
                "success": result["success"],
                "result": result["result"],
                "processed_count": result["processed_count"],
                "processing_time": processing_time,
                "items_per_second": (
                    len(batch) / processing_time
                    if processing_time > 0
                    else float("inf")
                ),
                "error": result.get("error"),
            }

        except Exception as e:
            batch_end_time = time.time()
            processing_time = batch_end_time - batch_start_time

            return {
                "batch_index": batch_index,
                "success": False,
                "result": None,
                "processed_count": 0,
                "processing_time": processing_time,
                "items_per_second": 0.0,
                "error": str(e),
            }

    def _process_batch_with_retry(
        self,
        batch: List[Any],
        batch_index: int,
        max_retries: int,
        retry_delay: float,
        processing_code: Optional[str] = None,
        processing_function: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Process a batch with retry logic."""
        last_result = None

        for attempt in range(max_retries + 1):
            if attempt > 0:
                time.sleep(retry_delay * (2 ** (attempt - 1)))  # Exponential backoff

            result = self._process_single_batch(
                batch, batch_index, processing_code, processing_function
            )

            if result["success"]:
                if attempt > 0:
                    result["retry_attempts"] = attempt
                return result

            last_result = result

        # All retries failed
        last_result["retry_attempts"] = max_retries
        last_result["final_failure"] = True
        return last_result
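
The backoff doubles the delay before each retry. With the defaults (retry_delay=1.0, max_retries=3), the schedule worked out from the formula above looks like this (illustrative):

delays = [1.0 * (2 ** (attempt - 1)) for attempt in range(1, 4)]
# -> [1.0, 2.0, 4.0] seconds before attempts 2, 3 and 4; the first attempt runs immediately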

    def _update_progress(
        self,
        processed_batches: int,
        total_batches: int,
        processed_items: int,
        total_items: int,
        start_time: float,
        progress_callback: Optional[str] = None,
    ):
        """Update progress and call progress callback if provided."""
        current_time = time.time()
        elapsed_time = current_time - start_time

        batch_progress = processed_batches / total_batches if total_batches > 0 else 0
        item_progress = processed_items / total_items if total_items > 0 else 0

        # Calculate rates
        batches_per_second = processed_batches / elapsed_time if elapsed_time > 0 else 0
        items_per_second = processed_items / elapsed_time if elapsed_time > 0 else 0

        # Estimate time remaining
        if batches_per_second > 0:
            remaining_batches = total_batches - processed_batches
            estimated_time_remaining = remaining_batches / batches_per_second
        else:
            estimated_time_remaining = float("inf")

        progress_info = {
            "batch_progress": batch_progress,
            "item_progress": item_progress,
            "processed_batches": processed_batches,
            "total_batches": total_batches,
            "processed_items": processed_items,
            "total_items": total_items,
            "elapsed_time": elapsed_time,
            "batches_per_second": batches_per_second,
            "items_per_second": items_per_second,
            "estimated_time_remaining": estimated_time_remaining,
        }

        # Call progress callback if provided
        if progress_callback:
            try:
                callback_func = eval(progress_callback)
                callback_func(progress_info)
            except Exception:
                pass  # Ignore callback errors

        return progress_info
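
Because the callback is resolved by eval() of its name, it must name something visible in that scope; the builtin print works as a trivial sink. An illustrative usage (not from the package):

result = BatchProcessorNode().execute(
    operation="process_data",
    data_items=list(range(1_000)),
    progress_callback="print",  # eval("print") resolves to the builtin
)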

    def _process_data_batches(
        self,
        data_items: List[Any],
        batch_size: int,
        processing_code: Optional[str] = None,
        processing_function: Optional[str] = None,
        rate_limit_per_second: float = 10.0,
        parallel_workers: int = 1,
        retry_failed_batches: bool = True,
        max_retries: int = 3,
        retry_delay: float = 1.0,
        progress_callback: Optional[str] = None,
        adaptive_batch_size: bool = True,
        memory_limit_mb: int = 1024,
    ) -> Dict[str, Any]:
        """Process data items in batches."""
        start_time = time.time()

        # Calculate optimal batch size if adaptive
        if adaptive_batch_size:
            optimal_batch_size = self._calculate_optimal_batch_size(
                len(data_items),
                rate_limit_per_second,
                parallel_workers,
                memory_limit_mb,
            )
            batch_size = min(batch_size, optimal_batch_size)

        # Create batches
        batches = self._create_batches(data_items, batch_size)
        total_batches = len(batches)

        # Initialize tracking
        self._processing_stats.update(
            {
                "total_items": len(data_items),
                "total_batches": total_batches,
                "start_time": start_time,
            }
        )

        successful_results = []
        failed_results = []
        processed_items = 0

        # Calculate delay between batches for rate limiting
        batch_delay = (
            1.0 / (rate_limit_per_second / batch_size)
            if rate_limit_per_second > 0
            else 0
        )

        if parallel_workers == 1:
            # Sequential processing
            for i, batch in enumerate(batches):
                if i > 0 and batch_delay > 0:
                    time.sleep(batch_delay)

                if retry_failed_batches:
                    result = self._process_batch_with_retry(
                        batch,
                        i,
                        max_retries,
                        retry_delay,
                        processing_code,
                        processing_function,
                    )
                else:
                    result = self._process_single_batch(
                        batch, i, processing_code, processing_function
                    )

                if result["success"]:
                    successful_results.append(result)
                    processed_items += result["processed_count"]
                else:
                    failed_results.append(result)

                # Update progress
                self._update_progress(
                    i + 1,
                    total_batches,
                    processed_items,
                    len(data_items),
                    start_time,
                    progress_callback,
                )

        else:
            # Parallel processing
            with ThreadPoolExecutor(max_workers=parallel_workers) as executor:
                # Submit all batches
                future_to_batch = {}
                for i, batch in enumerate(batches):
                    if retry_failed_batches:
                        future = executor.submit(
                            self._process_batch_with_retry,
                            batch,
                            i,
                            max_retries,
                            retry_delay,
                            processing_code,
                            processing_function,
                        )
                    else:
                        future = executor.submit(
                            self._process_single_batch,
                            batch,
                            i,
                            processing_code,
                            processing_function,
                        )
                    future_to_batch[future] = (i, batch)

                # Process completed batches
                completed_batches = 0
                for future in as_completed(future_to_batch):
                    batch_index, batch = future_to_batch[future]

                    try:
                        result = future.result()

                        if result["success"]:
                            successful_results.append(result)
                            processed_items += result["processed_count"]
                        else:
                            failed_results.append(result)

                        completed_batches += 1

                        # Update progress
                        self._update_progress(
                            completed_batches,
                            total_batches,
                            processed_items,
                            len(data_items),
                            start_time,
                            progress_callback,
                        )

                        # Rate limiting for parallel processing
                        if batch_delay > 0:
                            time.sleep(batch_delay / parallel_workers)

                    except Exception as e:
                        failed_results.append(
                            {
                                "batch_index": batch_index,
                                "success": False,
                                "error": str(e),
                                "processed_count": 0,
                            }
                        )
                        completed_batches += 1

        end_time = time.time()
        total_processing_time = end_time - start_time

        # Update final stats
        self._processing_stats.update(
            {
                "processed_items": processed_items,
                "failed_items": len(data_items) - processed_items,
                "successful_batches": len(successful_results),
                "failed_batches": len(failed_results),
                "end_time": end_time,
                "processing_rate": (
                    processed_items / total_processing_time
                    if total_processing_time > 0
                    else 0
                ),
            }
        )

        # Compile all results
        all_successful_results = []
        for result in successful_results:
            if isinstance(result["result"], list):
                all_successful_results.extend(result["result"])
            else:
                all_successful_results.append(result["result"])

        return {
            "success": len(failed_results) == 0,
            "processed_items": processed_items,
            "failed_items": len(data_items) - processed_items,
            "total_batches": total_batches,
            "successful_batches": len(successful_results),
            "failed_batches": len(failed_results),
            "processing_time": total_processing_time,
            "processing_rate": (
                processed_items / total_processing_time
                if total_processing_time > 0
                else 0
            ),
            "batch_size_used": batch_size,
            "results": all_successful_results,
            "successful_batch_details": successful_results,
            "failed_batch_details": failed_results,
            "statistics": self._processing_stats,
        }
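
A caller typically inspects the summary dict returned above. For example (illustrative sketch, assuming a processor instance and an items list as in the class docstring):

summary = processor.execute(operation="process_data", data_items=items)
if not summary["success"]:
    for failed in summary["failed_batch_details"]:
        print(f"batch {failed['batch_index']} failed: {failed['error']}")
print(f"{summary['processed_items']} items at {summary['processing_rate']:.1f} items/s")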

    def run(self, **kwargs) -> Dict[str, Any]:
        """Execute batch processing operation."""
        operation = kwargs.get("operation", "process_data")

        if operation == "process_data":
            data_items = kwargs.get("data_items", [])
            if not data_items:
                raise NodeConfigurationError(
                    "data_items is required for process_data operation"
                )

            return self._process_data_batches(
                data_items=data_items,
                batch_size=kwargs.get("batch_size", 100),
                processing_code=kwargs.get("processing_code"),
                processing_function=kwargs.get("processing_function"),
                rate_limit_per_second=kwargs.get("rate_limit_per_second", 10.0),
                parallel_workers=kwargs.get("parallel_workers", 1),
                retry_failed_batches=kwargs.get("retry_failed_batches", True),
                max_retries=kwargs.get("max_retries", 3),
                retry_delay=kwargs.get("retry_delay", 1.0),
                progress_callback=kwargs.get("progress_callback"),
                adaptive_batch_size=kwargs.get("adaptive_batch_size", True),
                memory_limit_mb=kwargs.get("memory_limit_mb", 1024),
            )

        elif operation == "api_batch_calls":
            # For API batch calls, we'd implement specific API handling
            # This is a simplified version
            data_items = kwargs.get("data_items", [])
            api_endpoint = kwargs.get("api_endpoint")

            if not api_endpoint:
                raise NodeConfigurationError(
                    "api_endpoint is required for api_batch_calls operation"
                )

            # Create processing code for API calls
            api_processing_code = f"""
import requests
import json

results = []
for item in batch:
    try:
        response = requests.post('{api_endpoint}', json=item, headers={kwargs.get('api_headers', {})})
        if response.status_code == 200:
            results.append(response.json())
        else:
            results.append({{'error': f'HTTP {{response.status_code}}', 'item': item}})
    except Exception as e:
        results.append({{'error': str(e), 'item': item}})

result = results
"""

            return self._process_data_batches(
                data_items=data_items,
                batch_size=kwargs.get("batch_size", 10),  # Smaller batches for API calls
                processing_code=api_processing_code,
                rate_limit_per_second=kwargs.get("rate_limit_per_second", 5.0),  # More conservative for APIs
                parallel_workers=kwargs.get("parallel_workers", 2),
                retry_failed_batches=kwargs.get("retry_failed_batches", True),
                max_retries=kwargs.get("max_retries", 3),
                retry_delay=kwargs.get("retry_delay", 2.0),
                progress_callback=kwargs.get("progress_callback"),
                adaptive_batch_size=kwargs.get("adaptive_batch_size", True),
                memory_limit_mb=kwargs.get("memory_limit_mb", 512),
            )

        elif operation == "database_batch_operations":
            # For database batch operations
            data_items = kwargs.get("data_items", [])
            database_config = kwargs.get("database_config", {})

            if not database_config:
                raise NodeConfigurationError(
                    "database_config is required for database_batch_operations"
                )

            # Create processing code for database operations
            db_processing_code = """
# This would typically use actual database connections
# For this example, we'll simulate database operations

import time
results = []

for item in batch:
    # Simulate database operation
    time.sleep(0.01)  # Simulate processing time
    results.append({
        'id': item.get('id', 'unknown'),
        'status': 'processed',
        'timestamp': datetime.now().isoformat()
    })

result = results
"""

            return self._process_data_batches(
                data_items=data_items,
                batch_size=kwargs.get("batch_size", 1000),  # Larger batches for DB operations
                processing_code=db_processing_code,
                rate_limit_per_second=kwargs.get("rate_limit_per_second", 50.0),
                parallel_workers=kwargs.get("parallel_workers", 4),
                retry_failed_batches=kwargs.get("retry_failed_batches", True),
                max_retries=kwargs.get("max_retries", 2),
                retry_delay=kwargs.get("retry_delay", 1.0),
                progress_callback=kwargs.get("progress_callback"),
                adaptive_batch_size=kwargs.get("adaptive_batch_size", True),
                memory_limit_mb=kwargs.get("memory_limit_mb", 2048),
            )

        else:
            raise NodeConfigurationError(f"Invalid operation: {operation}")

    async def async_run(self, **kwargs) -> Dict[str, Any]:
        """Async execution method for enterprise integration."""
        return self.run(**kwargs)
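
Since async_run is a thin synchronous delegate, the node can be driven from async code directly. A minimal end-to-end sketch (editorial illustration, not from the diff; assumes the node constructs with defaults as in the class docstring example):

import asyncio

processor = BatchProcessorNode()
result = asyncio.run(processor.async_run(
    operation="process_data",
    data_items=[{"id": i} for i in range(250)],
    batch_size=50,
    parallel_workers=2,
))
print(result["total_batches"])  # 5 batches: 250 items at the requested batch size of 50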