kailash 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +1 -1
- kailash/api/custom_nodes_secure.py +2 -2
- kailash/api/studio_secure.py +1 -1
- kailash/mcp/client_new.py +1 -1
- kailash/nodes/ai/a2a.py +1 -1
- kailash/nodes/api/__init__.py +26 -0
- kailash/nodes/api/monitoring.py +463 -0
- kailash/nodes/api/security.py +822 -0
- kailash/nodes/base.py +3 -3
- kailash/nodes/code/python.py +6 -0
- kailash/nodes/data/__init__.py +9 -0
- kailash/nodes/data/directory.py +278 -0
- kailash/nodes/data/event_generation.py +297 -0
- kailash/nodes/data/file_discovery.py +601 -0
- kailash/nodes/data/sql.py +2 -2
- kailash/nodes/transform/processors.py +32 -1
- kailash/runtime/async_local.py +1 -1
- kailash/runtime/docker.py +4 -4
- kailash/runtime/local.py +41 -4
- kailash/runtime/parallel.py +2 -2
- kailash/runtime/parallel_cyclic.py +2 -2
- kailash/runtime/testing.py +2 -2
- kailash/utils/templates.py +6 -6
- kailash/visualization/performance.py +16 -3
- kailash/visualization/reports.py +5 -1
- kailash/workflow/convergence.py +1 -1
- kailash/workflow/cycle_analyzer.py +8 -1
- kailash/workflow/cyclic_runner.py +1 -1
- kailash/workflow/graph.py +33 -6
- kailash/workflow/visualization.py +10 -2
- kailash-0.3.0.dist-info/METADATA +428 -0
- {kailash-0.2.1.dist-info → kailash-0.3.0.dist-info}/RECORD +36 -31
- kailash-0.2.1.dist-info/METADATA +0 -1617
- {kailash-0.2.1.dist-info → kailash-0.3.0.dist-info}/WHEEL +0 -0
- {kailash-0.2.1.dist-info → kailash-0.3.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.2.1.dist-info → kailash-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.2.1.dist-info → kailash-0.3.0.dist-info}/top_level.txt +0 -0
kailash/nodes/data/file_discovery.py
ADDED
@@ -0,0 +1,601 @@
+"""File discovery and analysis nodes for file system operations."""
+
+import hashlib
+import mimetypes
+import os
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+from kailash.nodes.base import Node, NodeParameter, register_node
+
+
+@register_node()
+class FileDiscoveryNode(Node):
+    """
+    Discovers and analyzes files and directories in the file system.
+
+    This node provides comprehensive file discovery capabilities, replacing
+    DataTransformer with embedded Python code for file processing tasks.
+    It can scan directories, analyze file properties, detect file types,
+    and generate detailed file system reports.
+
+    Design Philosophy:
+        File system operations require robust discovery and analysis capabilities.
+        This node eliminates the need for custom file processing code in
+        DataTransformer nodes by providing dedicated, configurable file
+        discovery with filtering, analysis, and reporting features.
+
+    Upstream Dependencies:
+        - Path configuration nodes
+        - Filter criteria nodes
+        - Authentication/permission nodes
+        - Schedule/trigger nodes
+
+    Downstream Consumers:
+        - File processing nodes
+        - Content analysis nodes
+        - Backup and archival nodes
+        - Security scanning nodes
+        - Compliance reporting nodes
+
+    Configuration:
+        - Search paths and patterns
+        - File type filters
+        - Size and date criteria
+        - Analysis depth and options
+        - Output format preferences
+
+    Implementation Details:
+        - Recursive directory traversal
+        - File metadata extraction
+        - Content type detection
+        - Permission and ownership analysis
+        - Hash calculation for integrity
+
+    Error Handling:
+        - Permission denied gracefully handled
+        - Broken symlinks detected
+        - Invalid paths reported
+        - Partial results on errors
+
+    Side Effects:
+        - File system access (read-only by default)
+        - Temporary file creation for analysis
+        - Metadata caching for performance
+        - Logging of discovery activities
+
+    Examples:
+        >>> # Discover all Python files in a project
+        >>> discovery = FileDiscoveryNode(
+        ...     search_paths=['/path/to/project'],
+        ...     file_patterns=['*.py'],
+        ...     include_metadata=True,
+        ...     max_depth=5
+        ... )
+        >>> result = discovery.execute()
+        >>> assert 'discovered_files' in result
+        >>> assert all(f['name'].endswith('.py') for f in result['discovered_files'])
+        >>>
+        >>> # Find large files for cleanup
+        >>> discovery = FileDiscoveryNode(
+        ...     search_paths=['/var/log', '/tmp'],
+        ...     min_size_mb=100,
+        ...     older_than_days=30,
+        ...     include_checksums=True
+        ... )
+        >>> result = discovery.execute()
+        >>> large_files = result['discovered_files']
+    """
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        return {
+            "search_paths": NodeParameter(
+                name="search_paths",
+                type=list,
+                required=True,
+                description="List of paths to search for files",
+            ),
+            "file_patterns": NodeParameter(
+                name="file_patterns",
+                type=list,
+                required=False,
+                default=["*"],
+                description="File name patterns to match (glob-style)",
+            ),
+            "exclude_patterns": NodeParameter(
+                name="exclude_patterns",
+                type=list,
+                required=False,
+                default=[],
+                description="File name patterns to exclude",
+            ),
+            "max_depth": NodeParameter(
+                name="max_depth",
+                type=int,
+                required=False,
+                default=10,
+                description="Maximum directory depth to search",
+            ),
+            "include_metadata": NodeParameter(
+                name="include_metadata",
+                type=bool,
+                required=False,
+                default=True,
+                description="Include detailed file metadata",
+            ),
+            "include_checksums": NodeParameter(
+                name="include_checksums",
+                type=bool,
+                required=False,
+                default=False,
+                description="Calculate file checksums (slower but more thorough)",
+            ),
+            "min_size_mb": NodeParameter(
+                name="min_size_mb",
+                type=float,
+                required=False,
+                description="Minimum file size in megabytes",
+            ),
+            "max_size_mb": NodeParameter(
+                name="max_size_mb",
+                type=float,
+                required=False,
+                description="Maximum file size in megabytes",
+            ),
+            "older_than_days": NodeParameter(
+                name="older_than_days",
+                type=int,
+                required=False,
+                description="Only include files older than N days",
+            ),
+            "newer_than_days": NodeParameter(
+                name="newer_than_days",
+                type=int,
+                required=False,
+                description="Only include files newer than N days",
+            ),
+            "follow_symlinks": NodeParameter(
+                name="follow_symlinks",
+                type=bool,
+                required=False,
+                default=False,
+                description="Follow symbolic links during traversal",
+            ),
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        search_paths = kwargs["search_paths"]
+        file_patterns = kwargs.get("file_patterns", ["*"])
+        exclude_patterns = kwargs.get("exclude_patterns", [])
+        max_depth = kwargs.get("max_depth", 10)
+        include_metadata = kwargs.get("include_metadata", True)
+        include_checksums = kwargs.get("include_checksums", False)
+        min_size_mb = kwargs.get("min_size_mb")
+        max_size_mb = kwargs.get("max_size_mb")
+        older_than_days = kwargs.get("older_than_days")
+        newer_than_days = kwargs.get("newer_than_days")
+        follow_symlinks = kwargs.get("follow_symlinks", False)
+
+        start_time = time.time()
+        discovered_files = []
+        discovery_stats = {
+            "total_directories_scanned": 0,
+            "total_files_found": 0,
+            "total_files_matching": 0,
+            "access_errors": 0,
+            "broken_symlinks": 0,
+        }
+
+        for search_path in search_paths:
+            try:
+                path_files, path_stats = self._discover_files_in_path(
+                    search_path=search_path,
+                    file_patterns=file_patterns,
+                    exclude_patterns=exclude_patterns,
+                    max_depth=max_depth,
+                    include_metadata=include_metadata,
+                    include_checksums=include_checksums,
+                    min_size_mb=min_size_mb,
+                    max_size_mb=max_size_mb,
+                    older_than_days=older_than_days,
+                    newer_than_days=newer_than_days,
+                    follow_symlinks=follow_symlinks,
+                )
+
+                discovered_files.extend(path_files)
+
+                # Aggregate stats
+                for key, value in path_stats.items():
+                    discovery_stats[key] += value
+
+            except Exception as e:
+                discovery_stats["access_errors"] += 1
+                # Add error entry to results
+                discovered_files.append(
+                    {
+                        "type": "discovery_error",
+                        "path": search_path,
+                        "error": str(e),
+                        "timestamp": datetime.now(timezone.utc).isoformat() + "Z",
+                    }
+                )
+
+        execution_time = time.time() - start_time
+
+        # Generate summary
+        summary = self._generate_discovery_summary(
+            discovered_files, discovery_stats, execution_time
+        )
+
+        return {
+            "discovered_files": discovered_files,
+            "discovery_summary": summary,
+            "discovery_stats": discovery_stats,
+            "total_files": len(
+                [f for f in discovered_files if f.get("type") != "discovery_error"]
+            ),
+            "execution_time": execution_time,
+            "timestamp": datetime.now(timezone.utc).isoformat() + "Z",
+        }
+
+    def _discover_files_in_path(
+        self,
+        search_path: str,
+        file_patterns: List[str],
+        exclude_patterns: List[str],
+        max_depth: int,
+        include_metadata: bool,
+        include_checksums: bool,
+        min_size_mb: Optional[float],
+        max_size_mb: Optional[float],
+        older_than_days: Optional[int],
+        newer_than_days: Optional[int],
+        follow_symlinks: bool,
+    ) -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
+        """Discover files in a specific path."""
+
+        discovered_files = []
+        stats = {
+            "total_directories_scanned": 0,
+            "total_files_found": 0,
+            "total_files_matching": 0,
+            "access_errors": 0,
+            "broken_symlinks": 0,
+        }
+
+        try:
+            search_path_obj = Path(search_path)
+            if not search_path_obj.exists():
+                raise FileNotFoundError(f"Search path does not exist: {search_path}")
+
+            # Walk the directory tree
+            for root, dirs, files in os.walk(search_path, followlinks=follow_symlinks):
+                current_depth = len(Path(root).relative_to(search_path_obj).parts)
+
+                # Skip if max depth exceeded
+                if current_depth > max_depth:
+                    dirs[:] = []  # Don't descend further
+                    continue
+
+                stats["total_directories_scanned"] += 1
+
+                for file_name in files:
+                    file_path = os.path.join(root, file_name)
+                    stats["total_files_found"] += 1
+
+                    try:
+                        # Check if file matches patterns
+                        if not self._matches_patterns(
+                            file_name, file_patterns, exclude_patterns
+                        ):
+                            continue
+
+                        file_info = self._analyze_file(
+                            file_path=file_path,
+                            include_metadata=include_metadata,
+                            include_checksums=include_checksums,
+                        )
+
+                        # Apply size filters
+                        if min_size_mb is not None:
+                            if file_info.get("size_mb", 0) < min_size_mb:
+                                continue
+
+                        if max_size_mb is not None:
+                            if file_info.get("size_mb", 0) > max_size_mb:
+                                continue
+
+                        # Apply date filters
+                        if older_than_days is not None or newer_than_days is not None:
+                            if not self._matches_date_criteria(
+                                file_info, older_than_days, newer_than_days
+                            ):
+                                continue
+
+                        discovered_files.append(file_info)
+                        stats["total_files_matching"] += 1
+
+                    except (OSError, PermissionError) as e:
+                        stats["access_errors"] += 1
+                        # Add error info for this specific file
+                        discovered_files.append(
+                            {
+                                "type": "file_access_error",
+                                "path": file_path,
+                                "name": file_name,
+                                "error": str(e),
+                                "timestamp": datetime.now(timezone.utc).isoformat()
+                                + "Z",
+                            }
+                        )
+
+        except Exception:
+            stats["access_errors"] += 1
+            raise
+
+        return discovered_files, stats
+
+    def _matches_patterns(
+        self, file_name: str, include_patterns: List[str], exclude_patterns: List[str]
+    ) -> bool:
+        """Check if filename matches include patterns and doesn't match exclude patterns."""
+        import fnmatch
+
+        # Check exclude patterns first
+        for pattern in exclude_patterns:
+            if fnmatch.fnmatch(file_name, pattern):
+                return False
+
+        # Check include patterns
+        if not include_patterns or include_patterns == ["*"]:
+            return True
+
+        for pattern in include_patterns:
+            if fnmatch.fnmatch(file_name, pattern):
+                return True
+
+        return False
+
+    def _analyze_file(
+        self, file_path: str, include_metadata: bool, include_checksums: bool
+    ) -> Dict[str, Any]:
+        """Analyze a single file and return its information."""
+
+        file_path_obj = Path(file_path)
+        file_info = {
+            "type": "file",
+            "path": str(file_path),
+            "name": file_path_obj.name,
+            "directory": str(file_path_obj.parent),
+        }
+
+        try:
+            # Basic file stats
+            stat_info = file_path_obj.stat()
+
+            file_info.update(
+                {
+                    "size_bytes": stat_info.st_size,
+                    "size_mb": stat_info.st_size / (1024 * 1024),
+                    "created_timestamp": stat_info.st_ctime,
+                    "modified_timestamp": stat_info.st_mtime,
+                    "accessed_timestamp": stat_info.st_atime,
+                    "created_date": datetime.fromtimestamp(
+                        stat_info.st_ctime, timezone.utc
+                    ).isoformat()
+                    + "Z",
+                    "modified_date": datetime.fromtimestamp(
+                        stat_info.st_mtime, timezone.utc
+                    ).isoformat()
+                    + "Z",
+                    "accessed_date": datetime.fromtimestamp(
+                        stat_info.st_atime, timezone.utc
+                    ).isoformat()
+                    + "Z",
+                }
+            )
+
+            if include_metadata:
+                # File type detection
+                mime_type, encoding = mimetypes.guess_type(file_path)
+                file_info.update(
+                    {
+                        "mime_type": mime_type,
+                        "encoding": encoding,
+                        "extension": file_path_obj.suffix.lower(),
+                    }
+                )
+
+                # File permissions
+                file_info.update(
+                    {
+                        "permissions": oct(stat_info.st_mode)[-3:],
+                        "owner_uid": stat_info.st_uid,
+                        "group_gid": stat_info.st_gid,
+                        "is_readable": os.access(file_path, os.R_OK),
+                        "is_writable": os.access(file_path, os.W_OK),
+                        "is_executable": os.access(file_path, os.X_OK),
+                    }
+                )
+
+                # Symbolic link detection
+                if file_path_obj.is_symlink():
+                    try:
+                        link_target = os.readlink(file_path)
+                        file_info.update(
+                            {
+                                "is_symlink": True,
+                                "link_target": link_target,
+                                "link_target_exists": os.path.exists(link_target),
+                            }
+                        )
+                    except OSError:
+                        file_info.update(
+                            {
+                                "is_symlink": True,
+                                "link_target": None,
+                                "link_target_exists": False,
+                            }
+                        )
+                else:
+                    file_info["is_symlink"] = False
+
+                # Content analysis for text files
+                if mime_type and mime_type.startswith("text/"):
+                    try:
+                        with open(
+                            file_path, "r", encoding="utf-8", errors="ignore"
+                        ) as f:
+                            content_sample = f.read(1024)  # Read first 1KB
+                        file_info.update(
+                            {
+                                "line_count": len(content_sample.splitlines()),
+                                "character_count": len(content_sample),
+                                "content_sample": (
+                                    content_sample[:200] + "..."
+                                    if len(content_sample) > 200
+                                    else content_sample
+                                ),
+                            }
+                        )
+                    except (UnicodeDecodeError, PermissionError):
+                        pass
+
+            if include_checksums:
+                # Calculate file hashes
+                file_info.update(self._calculate_checksums(file_path))
+
+        except (OSError, PermissionError) as e:
+            file_info.update(
+                {
+                    "error": str(e),
+                    "accessible": False,
+                }
+            )
+
+        file_info["timestamp"] = datetime.now(timezone.utc).isoformat() + "Z"
+        return file_info
+
+    def _calculate_checksums(self, file_path: str) -> Dict[str, str]:
+        """Calculate MD5 and SHA256 checksums for a file."""
+        checksums = {}
+
+        try:
+            md5_hash = hashlib.md5()
+            sha256_hash = hashlib.sha256()
+
+            with open(file_path, "rb") as f:
+                # Read file in chunks to handle large files efficiently
+                for chunk in iter(lambda: f.read(4096), b""):
+                    md5_hash.update(chunk)
+                    sha256_hash.update(chunk)
+
+            checksums.update(
+                {
+                    "md5": md5_hash.hexdigest(),
+                    "sha256": sha256_hash.hexdigest(),
+                }
+            )
+        except (OSError, PermissionError) as e:
+            checksums.update(
+                {
+                    "checksum_error": str(e),
+                }
+            )
+
+        return checksums
+
+    def _matches_date_criteria(
+        self,
+        file_info: Dict[str, Any],
+        older_than_days: Optional[int],
+        newer_than_days: Optional[int],
+    ) -> bool:
+        """Check if file matches date criteria."""
+
+        modified_timestamp = file_info.get("modified_timestamp")
+        if modified_timestamp is None:
+            return True
+
+        now = time.time()
+        file_age_days = (now - modified_timestamp) / (24 * 3600)
+
+        if older_than_days is not None and file_age_days < older_than_days:
+            return False
+
+        if newer_than_days is not None and file_age_days > newer_than_days:
+            return False
+
+        return True
+
+    def _generate_discovery_summary(
+        self,
+        discovered_files: List[Dict],
+        discovery_stats: Dict[str, int],
+        execution_time: float,
+    ) -> Dict[str, Any]:
+        """Generate summary of file discovery results."""
+
+        # Count files by type/extension
+        extension_counts = {}
+        mime_type_counts = {}
+        size_distribution = {"small": 0, "medium": 0, "large": 0, "very_large": 0}
+
+        total_size_mb = 0
+        error_count = 0
+
+        for file_info in discovered_files:
+            if file_info.get("type") in ["discovery_error", "file_access_error"]:
+                error_count += 1
+                continue
+
+            # Extension analysis
+            extension = file_info.get("extension", "")
+            extension_counts[extension] = extension_counts.get(extension, 0) + 1
+
+            # MIME type analysis
+            mime_type = file_info.get("mime_type", "unknown")
+            mime_type_counts[mime_type] = mime_type_counts.get(mime_type, 0) + 1
+
+            # Size distribution
+            size_mb = file_info.get("size_mb", 0)
+            total_size_mb += size_mb
+
+            if size_mb < 1:
+                size_distribution["small"] += 1
+            elif size_mb < 50:
+                size_distribution["medium"] += 1
+            elif size_mb < 500:
+                size_distribution["large"] += 1
+            else:
+                size_distribution["very_large"] += 1
+
+        # Find largest files
+        file_sizes = [
+            (f.get("size_mb", 0), f.get("path", ""))
+            for f in discovered_files
+            if f.get("type") == "file"
+        ]
+        largest_files = sorted(file_sizes, reverse=True)[:10]
+
+        return {
+            "execution_time": execution_time,
+            "total_files_discovered": len(discovered_files) - error_count,
+            "total_errors": error_count,
+            "total_size_mb": total_size_mb,
+            "average_file_size_mb": total_size_mb
+            / max(1, len(discovered_files) - error_count),
+            "extension_distribution": dict(
+                sorted(extension_counts.items(), key=lambda x: x[1], reverse=True)
+            ),
+            "mime_type_distribution": dict(
+                sorted(mime_type_counts.items(), key=lambda x: x[1], reverse=True)
+            ),
+            "size_distribution": size_distribution,
+            "largest_files": [
+                {"size_mb": size, "path": path} for size, path in largest_files[:5]
+            ],
+            "discovery_stats": discovery_stats,
+        }
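Editor's note: a minimal sketch of how the new node's output might be consumed downstream, based on the docstring examples and the return structure of `run()` above. The search path, import path, and report formatting are illustrative assumptions, not part of the package's documented API.

```python
# Sketch: summarizing FileDiscoveryNode results (keys taken from run() above).
from kailash.nodes.data.file_discovery import FileDiscoveryNode  # path assumed from the new module location

discovery = FileDiscoveryNode(
    search_paths=["/var/log"],   # illustrative path
    file_patterns=["*.log"],
    min_size_mb=10,
    older_than_days=7,
)
result = discovery.execute()

summary = result["discovery_summary"]
print(f"{result['total_files']} files, {summary['total_size_mb']:.1f} MB total")
for entry in summary["largest_files"]:
    print(f"  {entry['size_mb']:.1f} MB  {entry['path']}")

# Unreadable paths are reported inline as dict entries rather than raised:
errors = [
    f for f in result["discovered_files"]
    if f.get("type") in ("discovery_error", "file_access_error")
]
print(f"{len(errors)} entries could not be read")
```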
kailash/nodes/data/sql.py
CHANGED
@@ -166,10 +166,10 @@ class SQLDatabaseNode(Node):
     Example:
         >>> # Initialize with project configuration
         >>> SQLDatabaseNode.initialize('kailash_project.yaml')
-        >>>
+        >>>
         >>> # Create node with database connection configuration
         >>> sql_node = SQLDatabaseNode(connection='customer_db')
-        >>>
+        >>>
         >>> # Execute multiple queries with the same node
        >>> result1 = sql_node.run(
         ...     query='SELECT * FROM customers WHERE active = ?',
@@ -311,7 +311,7 @@ class DataTransformer(Node):
         return {
             "data": NodeParameter(
                 name="data",
-                type=
+                type=Any,
                 required=False,
                 description="Primary input data to transform",
             ),
@@ -332,12 +332,35 @@ class DataTransformer(Node):
             },  # Support for up to 5 additional arguments
         }
 
+    def validate_inputs(self, **kwargs) -> Dict[str, Any]:
+        """Override validate_inputs to accept arbitrary parameters for transformations.
+
+        DataTransformer needs to accept any input parameters that might be mapped
+        from other nodes, not just the predefined parameters in get_parameters().
+        This enables flexible data flow in workflows.
+        """
+        # First, do the standard validation for defined parameters
+        validated = super().validate_inputs(**kwargs)
+
+        # Then, add any extra parameters that aren't in the schema
+        # These will be passed to the transformation context
+        defined_params = set(self.get_parameters().keys())
+        for key, value in kwargs.items():
+            if key not in defined_params:
+                validated[key] = value  # Accept arbitrary additional parameters
+
+        return validated
+
     def run(self, **kwargs) -> Dict[str, Any]:
         # Extract the transformation functions
         transformations = kwargs.get("transformations", [])
         if not transformations:
             return {"result": kwargs.get("data", [])}
 
+        # Debug: Check what kwargs we received
+        print(f"DATATRANSFORMER RUN DEBUG: kwargs keys = {list(kwargs.keys())}")
+        print(f"DATATRANSFORMER RUN DEBUG: kwargs = {kwargs}")
+
         # Get all input data
         input_data = {}
         for key, value in kwargs.items():
@@ -371,6 +394,14 @@ class DataTransformer(Node):
             local_vars = input_data.copy()
             local_vars["result"] = result
 
+            # Debug: Print available variables
+            print(
+                f"DataTransformer DEBUG - Available variables: {list(local_vars.keys())}"
+            )
+            print(
+                f"DataTransformer DEBUG - Input data keys: {list(input_data.keys())}"
+            )
+
             # Execute the code block
             exec(transform_str, safe_globals, local_vars)  # noqa: S102
 
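Editor's note: the `validate_inputs` override above is the substantive change here; it lets DataTransformer accept inputs that are not declared in `get_parameters()`. A standalone sketch of that pattern is shown below. The `Base` class is a stand-in for illustration only, not the actual kailash `Node` base class or its validation logic.

```python
# Sketch of the "validate declared params, pass extras through" pattern.
from typing import Any, Dict


class Base:
    def get_parameters(self) -> Dict[str, Any]:
        return {"data": None, "transformations": None}

    def validate_inputs(self, **kwargs) -> Dict[str, Any]:
        # Strict behaviour: keep only declared parameters.
        return {k: v for k, v in kwargs.items() if k in self.get_parameters()}


class Transformer(Base):
    def validate_inputs(self, **kwargs) -> Dict[str, Any]:
        # Permissive behaviour: validate declared parameters first, then pass
        # extras through so upstream nodes can map arbitrary variables in.
        validated = super().validate_inputs(**kwargs)
        declared = set(self.get_parameters())
        validated.update({k: v for k, v in kwargs.items() if k not in declared})
        return validated


print(Transformer().validate_inputs(data=[1, 2], customer_ids=[7, 9]))
# -> {'data': [1, 2], 'customer_ids': [7, 9]}
```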
kailash/runtime/async_local.py
CHANGED
@@ -227,7 +227,7 @@ class AsyncLocalRuntime:
                     outputs = await node_instance.execute_async(**inputs)
                 else:
                     # Fall back to synchronous execution
-                    outputs = node_instance.
+                    outputs = node_instance.run(**inputs)
 
                 execution_time = (
                     datetime.now(timezone.utc) - start_time