kailash 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,601 @@
+"""File discovery and analysis nodes for file system operations."""
+
+import hashlib
+import mimetypes
+import os
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+from kailash.nodes.base import Node, NodeParameter, register_node
+
+
+@register_node()
+class FileDiscoveryNode(Node):
+    """
+    Discovers and analyzes files and directories in the file system.
+
+    This node provides comprehensive file discovery capabilities, replacing
+    DataTransformer with embedded Python code for file processing tasks.
+    It can scan directories, analyze file properties, detect file types,
+    and generate detailed file system reports.
+
+    Design Philosophy:
+        File system operations require robust discovery and analysis capabilities.
+        This node eliminates the need for custom file processing code in
+        DataTransformer nodes by providing dedicated, configurable file
+        discovery with filtering, analysis, and reporting features.
+
+    Upstream Dependencies:
+        - Path configuration nodes
+        - Filter criteria nodes
+        - Authentication/permission nodes
+        - Schedule/trigger nodes
+
+    Downstream Consumers:
+        - File processing nodes
+        - Content analysis nodes
+        - Backup and archival nodes
+        - Security scanning nodes
+        - Compliance reporting nodes
+
+    Configuration:
+        - Search paths and patterns
+        - File type filters
+        - Size and date criteria
+        - Analysis depth and options
+        - Output format preferences
+
+    Implementation Details:
+        - Recursive directory traversal
+        - File metadata extraction
+        - Content type detection
+        - Permission and ownership analysis
+        - Hash calculation for integrity
+
+    Error Handling:
+        - Permission denied gracefully handled
+        - Broken symlinks detected
+        - Invalid paths reported
+        - Partial results on errors
+
+    Side Effects:
+        - File system access (read-only by default)
+        - Temporary file creation for analysis
+        - Metadata caching for performance
+        - Logging of discovery activities
+
+    Examples:
+        >>> # Discover all Python files in a project
+        >>> discovery = FileDiscoveryNode(
+        ...     search_paths=['/path/to/project'],
+        ...     file_patterns=['*.py'],
+        ...     include_metadata=True,
+        ...     max_depth=5
+        ... )
+        >>> result = discovery.execute()
+        >>> assert 'discovered_files' in result
+        >>> assert all(f['name'].endswith('.py') for f in result['discovered_files'])
+        >>>
+        >>> # Find large files for cleanup
+        >>> discovery = FileDiscoveryNode(
+        ...     search_paths=['/var/log', '/tmp'],
+        ...     min_size_mb=100,
+        ...     older_than_days=30,
+        ...     include_checksums=True
+        ... )
+        >>> result = discovery.execute()
+        >>> large_files = result['discovered_files']
+    """
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        return {
+            "search_paths": NodeParameter(
+                name="search_paths",
+                type=list,
+                required=True,
+                description="List of paths to search for files",
+            ),
+            "file_patterns": NodeParameter(
+                name="file_patterns",
+                type=list,
+                required=False,
+                default=["*"],
+                description="File name patterns to match (glob-style)",
+            ),
+            "exclude_patterns": NodeParameter(
+                name="exclude_patterns",
+                type=list,
+                required=False,
+                default=[],
+                description="File name patterns to exclude",
+            ),
+            "max_depth": NodeParameter(
+                name="max_depth",
+                type=int,
+                required=False,
+                default=10,
+                description="Maximum directory depth to search",
+            ),
+            "include_metadata": NodeParameter(
+                name="include_metadata",
+                type=bool,
+                required=False,
+                default=True,
+                description="Include detailed file metadata",
+            ),
+            "include_checksums": NodeParameter(
+                name="include_checksums",
+                type=bool,
+                required=False,
+                default=False,
+                description="Calculate file checksums (slower but more thorough)",
+            ),
+            "min_size_mb": NodeParameter(
+                name="min_size_mb",
+                type=float,
+                required=False,
+                description="Minimum file size in megabytes",
+            ),
+            "max_size_mb": NodeParameter(
+                name="max_size_mb",
+                type=float,
+                required=False,
+                description="Maximum file size in megabytes",
+            ),
+            "older_than_days": NodeParameter(
+                name="older_than_days",
+                type=int,
+                required=False,
+                description="Only include files older than N days",
+            ),
+            "newer_than_days": NodeParameter(
+                name="newer_than_days",
+                type=int,
+                required=False,
+                description="Only include files newer than N days",
+            ),
+            "follow_symlinks": NodeParameter(
+                name="follow_symlinks",
+                type=bool,
+                required=False,
+                default=False,
+                description="Follow symbolic links during traversal",
+            ),
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        search_paths = kwargs["search_paths"]
+        file_patterns = kwargs.get("file_patterns", ["*"])
+        exclude_patterns = kwargs.get("exclude_patterns", [])
+        max_depth = kwargs.get("max_depth", 10)
+        include_metadata = kwargs.get("include_metadata", True)
+        include_checksums = kwargs.get("include_checksums", False)
+        min_size_mb = kwargs.get("min_size_mb")
+        max_size_mb = kwargs.get("max_size_mb")
+        older_than_days = kwargs.get("older_than_days")
+        newer_than_days = kwargs.get("newer_than_days")
+        follow_symlinks = kwargs.get("follow_symlinks", False)
+
+        start_time = time.time()
+        discovered_files = []
+        discovery_stats = {
+            "total_directories_scanned": 0,
+            "total_files_found": 0,
+            "total_files_matching": 0,
+            "access_errors": 0,
+            "broken_symlinks": 0,
+        }
+
+        for search_path in search_paths:
+            try:
+                path_files, path_stats = self._discover_files_in_path(
+                    search_path=search_path,
+                    file_patterns=file_patterns,
+                    exclude_patterns=exclude_patterns,
+                    max_depth=max_depth,
+                    include_metadata=include_metadata,
+                    include_checksums=include_checksums,
+                    min_size_mb=min_size_mb,
+                    max_size_mb=max_size_mb,
+                    older_than_days=older_than_days,
+                    newer_than_days=newer_than_days,
+                    follow_symlinks=follow_symlinks,
+                )
+
+                discovered_files.extend(path_files)
+
+                # Aggregate stats
+                for key, value in path_stats.items():
+                    discovery_stats[key] += value
+
+            except Exception as e:
+                discovery_stats["access_errors"] += 1
+                # Add error entry to results
+                discovered_files.append(
+                    {
+                        "type": "discovery_error",
+                        "path": search_path,
+                        "error": str(e),
+                        "timestamp": datetime.now(timezone.utc).isoformat() + "Z",
+                    }
+                )
+
+        execution_time = time.time() - start_time
+
+        # Generate summary
+        summary = self._generate_discovery_summary(
+            discovered_files, discovery_stats, execution_time
+        )
+
+        return {
+            "discovered_files": discovered_files,
+            "discovery_summary": summary,
+            "discovery_stats": discovery_stats,
+            "total_files": len(
+                [f for f in discovered_files if f.get("type") != "discovery_error"]
+            ),
+            "execution_time": execution_time,
+            "timestamp": datetime.now(timezone.utc).isoformat() + "Z",
+        }
+
+    def _discover_files_in_path(
+        self,
+        search_path: str,
+        file_patterns: List[str],
+        exclude_patterns: List[str],
+        max_depth: int,
+        include_metadata: bool,
+        include_checksums: bool,
+        min_size_mb: Optional[float],
+        max_size_mb: Optional[float],
+        older_than_days: Optional[int],
+        newer_than_days: Optional[int],
+        follow_symlinks: bool,
+    ) -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
+        """Discover files in a specific path."""
+
+        discovered_files = []
+        stats = {
+            "total_directories_scanned": 0,
+            "total_files_found": 0,
+            "total_files_matching": 0,
+            "access_errors": 0,
+            "broken_symlinks": 0,
+        }
+
+        try:
+            search_path_obj = Path(search_path)
+            if not search_path_obj.exists():
+                raise FileNotFoundError(f"Search path does not exist: {search_path}")
+
+            # Walk the directory tree
+            for root, dirs, files in os.walk(search_path, followlinks=follow_symlinks):
+                current_depth = len(Path(root).relative_to(search_path_obj).parts)
+
+                # Skip if max depth exceeded
+                if current_depth > max_depth:
+                    dirs[:] = []  # Don't descend further
+                    continue
+
+                stats["total_directories_scanned"] += 1
+
+                for file_name in files:
+                    file_path = os.path.join(root, file_name)
+                    stats["total_files_found"] += 1
+
+                    try:
+                        # Check if file matches patterns
+                        if not self._matches_patterns(
+                            file_name, file_patterns, exclude_patterns
+                        ):
+                            continue
+
+                        file_info = self._analyze_file(
+                            file_path=file_path,
+                            include_metadata=include_metadata,
+                            include_checksums=include_checksums,
+                        )
+
+                        # Apply size filters
+                        if min_size_mb is not None:
+                            if file_info.get("size_mb", 0) < min_size_mb:
+                                continue
+
+                        if max_size_mb is not None:
+                            if file_info.get("size_mb", 0) > max_size_mb:
+                                continue
+
+                        # Apply date filters
+                        if older_than_days is not None or newer_than_days is not None:
+                            if not self._matches_date_criteria(
+                                file_info, older_than_days, newer_than_days
+                            ):
+                                continue
+
+                        discovered_files.append(file_info)
+                        stats["total_files_matching"] += 1
+
+                    except (OSError, PermissionError) as e:
+                        stats["access_errors"] += 1
+                        # Add error info for this specific file
+                        discovered_files.append(
+                            {
+                                "type": "file_access_error",
+                                "path": file_path,
+                                "name": file_name,
+                                "error": str(e),
+                                "timestamp": datetime.now(timezone.utc).isoformat()
+                                + "Z",
+                            }
+                        )
+
+        except Exception:
+            stats["access_errors"] += 1
+            raise
+
+        return discovered_files, stats
+
+    def _matches_patterns(
+        self, file_name: str, include_patterns: List[str], exclude_patterns: List[str]
+    ) -> bool:
+        """Check if filename matches include patterns and doesn't match exclude patterns."""
+        import fnmatch
+
+        # Check exclude patterns first
+        for pattern in exclude_patterns:
+            if fnmatch.fnmatch(file_name, pattern):
+                return False
+
+        # Check include patterns
+        if not include_patterns or include_patterns == ["*"]:
+            return True
+
+        for pattern in include_patterns:
+            if fnmatch.fnmatch(file_name, pattern):
+                return True
+
+        return False
+
+    def _analyze_file(
+        self, file_path: str, include_metadata: bool, include_checksums: bool
+    ) -> Dict[str, Any]:
+        """Analyze a single file and return its information."""
+
+        file_path_obj = Path(file_path)
+        file_info = {
+            "type": "file",
+            "path": str(file_path),
+            "name": file_path_obj.name,
+            "directory": str(file_path_obj.parent),
+        }
+
+        try:
+            # Basic file stats
+            stat_info = file_path_obj.stat()
+
+            file_info.update(
+                {
+                    "size_bytes": stat_info.st_size,
+                    "size_mb": stat_info.st_size / (1024 * 1024),
+                    "created_timestamp": stat_info.st_ctime,
+                    "modified_timestamp": stat_info.st_mtime,
+                    "accessed_timestamp": stat_info.st_atime,
+                    "created_date": datetime.fromtimestamp(
+                        stat_info.st_ctime, timezone.utc
+                    ).isoformat()
+                    + "Z",
+                    "modified_date": datetime.fromtimestamp(
+                        stat_info.st_mtime, timezone.utc
+                    ).isoformat()
+                    + "Z",
+                    "accessed_date": datetime.fromtimestamp(
+                        stat_info.st_atime, timezone.utc
+                    ).isoformat()
+                    + "Z",
+                }
+            )
+
+            if include_metadata:
+                # File type detection
+                mime_type, encoding = mimetypes.guess_type(file_path)
+                file_info.update(
+                    {
+                        "mime_type": mime_type,
+                        "encoding": encoding,
+                        "extension": file_path_obj.suffix.lower(),
+                    }
+                )
+
+                # File permissions
+                file_info.update(
+                    {
+                        "permissions": oct(stat_info.st_mode)[-3:],
+                        "owner_uid": stat_info.st_uid,
+                        "group_gid": stat_info.st_gid,
+                        "is_readable": os.access(file_path, os.R_OK),
+                        "is_writable": os.access(file_path, os.W_OK),
+                        "is_executable": os.access(file_path, os.X_OK),
+                    }
+                )
+
+                # Symbolic link detection
+                if file_path_obj.is_symlink():
+                    try:
+                        link_target = os.readlink(file_path)
+                        file_info.update(
+                            {
+                                "is_symlink": True,
+                                "link_target": link_target,
+                                "link_target_exists": os.path.exists(link_target),
+                            }
+                        )
+                    except OSError:
+                        file_info.update(
+                            {
+                                "is_symlink": True,
+                                "link_target": None,
+                                "link_target_exists": False,
+                            }
+                        )
+                else:
+                    file_info["is_symlink"] = False
+
+                # Content analysis for text files
+                if mime_type and mime_type.startswith("text/"):
+                    try:
+                        with open(
+                            file_path, "r", encoding="utf-8", errors="ignore"
+                        ) as f:
+                            content_sample = f.read(1024)  # Read first 1KB
+                        file_info.update(
+                            {
+                                "line_count": len(content_sample.splitlines()),
+                                "character_count": len(content_sample),
+                                "content_sample": (
+                                    content_sample[:200] + "..."
+                                    if len(content_sample) > 200
+                                    else content_sample
+                                ),
+                            }
+                        )
+                    except (UnicodeDecodeError, PermissionError):
+                        pass
+
+            if include_checksums:
+                # Calculate file hashes
+                file_info.update(self._calculate_checksums(file_path))
+
+        except (OSError, PermissionError) as e:
+            file_info.update(
+                {
+                    "error": str(e),
+                    "accessible": False,
+                }
+            )
+
+        file_info["timestamp"] = datetime.now(timezone.utc).isoformat() + "Z"
+        return file_info
+
+    def _calculate_checksums(self, file_path: str) -> Dict[str, str]:
+        """Calculate MD5 and SHA256 checksums for a file."""
+        checksums = {}
+
+        try:
+            md5_hash = hashlib.md5()
+            sha256_hash = hashlib.sha256()
+
+            with open(file_path, "rb") as f:
+                # Read file in chunks to handle large files efficiently
+                for chunk in iter(lambda: f.read(4096), b""):
+                    md5_hash.update(chunk)
+                    sha256_hash.update(chunk)
+
+            checksums.update(
+                {
+                    "md5": md5_hash.hexdigest(),
+                    "sha256": sha256_hash.hexdigest(),
+                }
+            )
+        except (OSError, PermissionError) as e:
+            checksums.update(
+                {
+                    "checksum_error": str(e),
+                }
+            )
+
+        return checksums
+
+    def _matches_date_criteria(
+        self,
+        file_info: Dict[str, Any],
+        older_than_days: Optional[int],
+        newer_than_days: Optional[int],
+    ) -> bool:
+        """Check if file matches date criteria."""
+
+        modified_timestamp = file_info.get("modified_timestamp")
+        if modified_timestamp is None:
+            return True
+
+        now = time.time()
+        file_age_days = (now - modified_timestamp) / (24 * 3600)
+
+        if older_than_days is not None and file_age_days < older_than_days:
+            return False
+
+        if newer_than_days is not None and file_age_days > newer_than_days:
+            return False
+
+        return True
+
+    def _generate_discovery_summary(
+        self,
+        discovered_files: List[Dict],
+        discovery_stats: Dict[str, int],
+        execution_time: float,
+    ) -> Dict[str, Any]:
+        """Generate summary of file discovery results."""
+
+        # Count files by type/extension
+        extension_counts = {}
+        mime_type_counts = {}
+        size_distribution = {"small": 0, "medium": 0, "large": 0, "very_large": 0}
+
+        total_size_mb = 0
+        error_count = 0
+
+        for file_info in discovered_files:
+            if file_info.get("type") in ["discovery_error", "file_access_error"]:
+                error_count += 1
+                continue
+
+            # Extension analysis
+            extension = file_info.get("extension", "")
+            extension_counts[extension] = extension_counts.get(extension, 0) + 1
+
+            # MIME type analysis
+            mime_type = file_info.get("mime_type", "unknown")
+            mime_type_counts[mime_type] = mime_type_counts.get(mime_type, 0) + 1
+
+            # Size distribution
+            size_mb = file_info.get("size_mb", 0)
+            total_size_mb += size_mb
+
+            if size_mb < 1:
+                size_distribution["small"] += 1
+            elif size_mb < 50:
+                size_distribution["medium"] += 1
+            elif size_mb < 500:
+                size_distribution["large"] += 1
+            else:
+                size_distribution["very_large"] += 1
+
+        # Find largest files
+        file_sizes = [
+            (f.get("size_mb", 0), f.get("path", ""))
+            for f in discovered_files
+            if f.get("type") == "file"
+        ]
+        largest_files = sorted(file_sizes, reverse=True)[:10]
+
+        return {
+            "execution_time": execution_time,
+            "total_files_discovered": len(discovered_files) - error_count,
+            "total_errors": error_count,
+            "total_size_mb": total_size_mb,
+            "average_file_size_mb": total_size_mb
+            / max(1, len(discovered_files) - error_count),
+            "extension_distribution": dict(
+                sorted(extension_counts.items(), key=lambda x: x[1], reverse=True)
+            ),
+            "mime_type_distribution": dict(
+                sorted(mime_type_counts.items(), key=lambda x: x[1], reverse=True)
+            ),
+            "size_distribution": size_distribution,
+            "largest_files": [
+                {"size_mb": size, "path": path} for size, path in largest_files[:5]
+            ],
+            "discovery_stats": discovery_stats,
+        }
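
As a quick orientation for reviewers, here is a hedged sketch (not part of the diff) that drives the new node the way its docstring suggests: configure it via the constructor, call execute(), then read the keys that run() returns. The search paths are placeholders, and where FileDiscoveryNode is imported from is an assumption, since this hunk does not show the module path.

# Hedged sketch, not part of the package diff. Assumes FileDiscoveryNode has
# been imported from wherever this new module lives in the kailash package.
discovery = FileDiscoveryNode(
    search_paths=["/var/log", "/tmp"],  # placeholder paths
    file_patterns=["*.log"],
    min_size_mb=100,          # only files of at least 100 MB
    older_than_days=30,       # not modified in the last 30 days
    include_checksums=False,  # skip MD5/SHA256 hashing for speed
)
result = discovery.execute()

# These keys mirror the dict returned by run() above.
for entry in result["discovered_files"]:
    if entry.get("type") == "file":
        print(f"{entry.get('size_mb', 0):.1f} MB  {entry['path']}")
print(result["discovery_summary"]["largest_files"])
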
@@ -311,7 +311,7 @@ class DataTransformer(Node):
         return {
             "data": NodeParameter(
                 name="data",
-                type=list,
+                type=Any,
                 required=False,
                 description="Primary input data to transform",
             ),
@@ -227,7 +227,7 @@ class AsyncLocalRuntime:
                     outputs = await node_instance.execute_async(**inputs)
                 else:
                     # Fall back to synchronous execution
-                    outputs = node_instance.execute(**inputs)
+                    outputs = node_instance.run(**inputs)

                 execution_time = (
                     datetime.now(timezone.utc) - start_time
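
The one-line change above swaps the synchronous fallback from execute() to run(). For context, a minimal illustrative sketch of that dispatch pattern (not the actual AsyncLocalRuntime source; the hasattr check is an assumption about how the branch above is guarded):

# Illustrative sketch of the fallback shown in this hunk, not the real runtime.
async def _execute_node(node_instance, inputs: dict) -> dict:
    if hasattr(node_instance, "execute_async"):
        return await node_instance.execute_async(**inputs)
    # Fall back to synchronous execution, as the changed line now does.
    return node_instance.run(**inputs)

Calling run() directly keeps the fallback on the event-loop thread; a runtime could offload it with asyncio.to_thread instead, but that is beyond what this hunk changes.
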
kailash/runtime/docker.py CHANGED
@@ -174,7 +174,7 @@ def main():
     logger.info(f"Loaded configuration for {node_data['class']} node")

     # Load runtime inputs if available
-    input_path = Path("/examples/data/input/inputs.json")
+    input_path = Path("/data/inputs/json/inputs.json")
     runtime_inputs = {}
     if input_path.exists():
         logger.info(f"Loading inputs from {input_path}")
@@ -206,7 +206,7 @@ def main():
     except Exception as e:
         logger.error(f"Node execution failed: {e}")
         # Save error information
-        with open("/examples/data/output/error.json", 'w') as f:
+        with open("/data/outputs/json/error.json", 'w') as f:
             json.dump({
                 "error": str(e),
                 "type": e.__class__.__name__
@@ -216,7 +216,7 @@ def main():
     # Save results
     logger.info("Saving execution results")
     try:
-        result_path = Path("/examples/data/output/result.json")
+        result_path = Path("/data/outputs/json/result.json")
         with open(result_path, 'w') as f:
             # Handle non-serializable objects with basic conversion
             try:
@@ -590,7 +590,7 @@ class DockerRuntime:

         try:
             # Validate workflow
-            workflow.validate()
+            workflow.validate(runtime_parameters=inputs)

             # Get execution order
             execution_order = workflow.get_execution_order()
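
Passing the runtime inputs into validate() lets parameters that are required but only supplied at execution time (for example FileDiscoveryNode's required search_paths) pass validation before the run starts. A hedged sketch of the call site, where the workflow object and the per-node shape of inputs are assumptions for illustration:

# Hedged sketch; "workflow" and the shape of "inputs" are assumed here, only
# the runtime_parameters keyword comes from the changed line above.
inputs = {"discover_files": {"search_paths": ["/var/log"]}}
workflow.validate(runtime_parameters=inputs)
execution_order = workflow.get_execution_order()
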