kailash 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. kailash/__init__.py +1 -1
  2. kailash/access_control.py +40 -39
  3. kailash/api/auth.py +26 -32
  4. kailash/api/custom_nodes.py +29 -29
  5. kailash/api/custom_nodes_secure.py +35 -35
  6. kailash/api/database.py +17 -17
  7. kailash/api/gateway.py +19 -19
  8. kailash/api/mcp_integration.py +24 -23
  9. kailash/api/studio.py +45 -45
  10. kailash/api/workflow_api.py +8 -8
  11. kailash/cli/commands.py +5 -8
  12. kailash/manifest.py +42 -42
  13. kailash/mcp/__init__.py +1 -1
  14. kailash/mcp/ai_registry_server.py +20 -20
  15. kailash/mcp/client.py +9 -11
  16. kailash/mcp/client_new.py +10 -10
  17. kailash/mcp/server.py +1 -2
  18. kailash/mcp/server_enhanced.py +449 -0
  19. kailash/mcp/servers/ai_registry.py +6 -6
  20. kailash/mcp/utils/__init__.py +31 -0
  21. kailash/mcp/utils/cache.py +267 -0
  22. kailash/mcp/utils/config.py +263 -0
  23. kailash/mcp/utils/formatters.py +293 -0
  24. kailash/mcp/utils/metrics.py +418 -0
  25. kailash/nodes/ai/agents.py +9 -9
  26. kailash/nodes/ai/ai_providers.py +33 -34
  27. kailash/nodes/ai/embedding_generator.py +31 -32
  28. kailash/nodes/ai/intelligent_agent_orchestrator.py +62 -66
  29. kailash/nodes/ai/iterative_llm_agent.py +48 -48
  30. kailash/nodes/ai/llm_agent.py +32 -33
  31. kailash/nodes/ai/models.py +13 -13
  32. kailash/nodes/ai/self_organizing.py +44 -44
  33. kailash/nodes/api/__init__.py +5 -0
  34. kailash/nodes/api/auth.py +11 -11
  35. kailash/nodes/api/graphql.py +13 -13
  36. kailash/nodes/api/http.py +19 -19
  37. kailash/nodes/api/monitoring.py +463 -0
  38. kailash/nodes/api/rate_limiting.py +9 -13
  39. kailash/nodes/api/rest.py +29 -29
  40. kailash/nodes/api/security.py +819 -0
  41. kailash/nodes/base.py +24 -26
  42. kailash/nodes/base_async.py +7 -7
  43. kailash/nodes/base_cycle_aware.py +12 -12
  44. kailash/nodes/base_with_acl.py +5 -5
  45. kailash/nodes/code/python.py +56 -55
  46. kailash/nodes/data/__init__.py +6 -0
  47. kailash/nodes/data/directory.py +6 -6
  48. kailash/nodes/data/event_generation.py +297 -0
  49. kailash/nodes/data/file_discovery.py +598 -0
  50. kailash/nodes/data/readers.py +8 -8
  51. kailash/nodes/data/retrieval.py +10 -10
  52. kailash/nodes/data/sharepoint_graph.py +17 -17
  53. kailash/nodes/data/sources.py +5 -5
  54. kailash/nodes/data/sql.py +13 -13
  55. kailash/nodes/data/streaming.py +25 -25
  56. kailash/nodes/data/vector_db.py +22 -22
  57. kailash/nodes/data/writers.py +7 -7
  58. kailash/nodes/logic/async_operations.py +17 -17
  59. kailash/nodes/logic/convergence.py +11 -11
  60. kailash/nodes/logic/loop.py +4 -4
  61. kailash/nodes/logic/operations.py +11 -11
  62. kailash/nodes/logic/workflow.py +8 -9
  63. kailash/nodes/mixins/mcp.py +17 -17
  64. kailash/nodes/mixins.py +8 -10
  65. kailash/nodes/transform/chunkers.py +3 -3
  66. kailash/nodes/transform/formatters.py +7 -7
  67. kailash/nodes/transform/processors.py +11 -11
  68. kailash/runtime/access_controlled.py +18 -18
  69. kailash/runtime/async_local.py +18 -20
  70. kailash/runtime/docker.py +24 -26
  71. kailash/runtime/local.py +55 -31
  72. kailash/runtime/parallel.py +25 -25
  73. kailash/runtime/parallel_cyclic.py +29 -29
  74. kailash/runtime/runner.py +6 -6
  75. kailash/runtime/testing.py +22 -22
  76. kailash/sdk_exceptions.py +0 -58
  77. kailash/security.py +14 -26
  78. kailash/tracking/manager.py +38 -38
  79. kailash/tracking/metrics_collector.py +15 -14
  80. kailash/tracking/models.py +53 -53
  81. kailash/tracking/storage/base.py +7 -17
  82. kailash/tracking/storage/database.py +22 -23
  83. kailash/tracking/storage/filesystem.py +38 -40
  84. kailash/utils/export.py +21 -21
  85. kailash/utils/templates.py +8 -9
  86. kailash/visualization/api.py +30 -34
  87. kailash/visualization/dashboard.py +17 -17
  88. kailash/visualization/performance.py +32 -19
  89. kailash/visualization/reports.py +30 -28
  90. kailash/workflow/builder.py +8 -8
  91. kailash/workflow/convergence.py +13 -12
  92. kailash/workflow/cycle_analyzer.py +38 -33
  93. kailash/workflow/cycle_builder.py +12 -12
  94. kailash/workflow/cycle_config.py +16 -15
  95. kailash/workflow/cycle_debugger.py +40 -40
  96. kailash/workflow/cycle_exceptions.py +29 -29
  97. kailash/workflow/cycle_profiler.py +21 -21
  98. kailash/workflow/cycle_state.py +20 -22
  99. kailash/workflow/cyclic_runner.py +45 -45
  100. kailash/workflow/graph.py +57 -45
  101. kailash/workflow/mermaid_visualizer.py +9 -11
  102. kailash/workflow/migration.py +22 -22
  103. kailash/workflow/mock_registry.py +6 -6
  104. kailash/workflow/runner.py +9 -9
  105. kailash/workflow/safety.py +12 -13
  106. kailash/workflow/state.py +8 -11
  107. kailash/workflow/templates.py +19 -19
  108. kailash/workflow/validation.py +14 -14
  109. kailash/workflow/visualization.py +32 -24
  110. kailash-0.3.1.dist-info/METADATA +476 -0
  111. kailash-0.3.1.dist-info/RECORD +136 -0
  112. kailash-0.2.2.dist-info/METADATA +0 -121
  113. kailash-0.2.2.dist-info/RECORD +0 -126
  114. {kailash-0.2.2.dist-info → kailash-0.3.1.dist-info}/WHEEL +0 -0
  115. {kailash-0.2.2.dist-info → kailash-0.3.1.dist-info}/entry_points.txt +0 -0
  116. {kailash-0.2.2.dist-info → kailash-0.3.1.dist-info}/licenses/LICENSE +0 -0
  117. {kailash-0.2.2.dist-info → kailash-0.3.1.dist-info}/top_level.txt +0 -0
kailash/nodes/data/file_discovery.py
@@ -0,0 +1,598 @@
+"""File discovery and analysis nodes for file system operations."""
+
+import hashlib
+import mimetypes
+import os
+import time
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from kailash.nodes.base import Node, NodeParameter, register_node
+
+
+@register_node()
+class FileDiscoveryNode(Node):
+    """
+    Discovers and analyzes files and directories in the file system.
+
+    This node provides comprehensive file discovery capabilities, replacing
+    DataTransformer with embedded Python code for file processing tasks.
+    It can scan directories, analyze file properties, detect file types,
+    and generate detailed file system reports.
+
+    Design Philosophy:
+        File system operations require robust discovery and analysis capabilities.
+        This node eliminates the need for custom file processing code in
+        DataTransformer nodes by providing dedicated, configurable file
+        discovery with filtering, analysis, and reporting features.
+
+    Upstream Dependencies:
+        - Path configuration nodes
+        - Filter criteria nodes
+        - Authentication/permission nodes
+        - Schedule/trigger nodes
+
+    Downstream Consumers:
+        - File processing nodes
+        - Content analysis nodes
+        - Backup and archival nodes
+        - Security scanning nodes
+        - Compliance reporting nodes
+
+    Configuration:
+        - Search paths and patterns
+        - File type filters
+        - Size and date criteria
+        - Analysis depth and options
+        - Output format preferences
+
+    Implementation Details:
+        - Recursive directory traversal
+        - File metadata extraction
+        - Content type detection
+        - Permission and ownership analysis
+        - Hash calculation for integrity
+
+    Error Handling:
+        - Permission denied gracefully handled
+        - Broken symlinks detected
+        - Invalid paths reported
+        - Partial results on errors
+
+    Side Effects:
+        - File system access (read-only by default)
+        - Temporary file creation for analysis
+        - Metadata caching for performance
+        - Logging of discovery activities
+
+    Examples:
+        >>> # Discover all Python files in a project
+        >>> discovery = FileDiscoveryNode(
+        ...     search_paths=['/path/to/project'],
+        ...     file_patterns=['*.py'],
+        ...     include_metadata=True,
+        ...     max_depth=5
+        ... )
+        >>> result = discovery.execute()
+        >>> assert 'discovered_files' in result
+        >>> assert all(f['name'].endswith('.py') for f in result['discovered_files'])
+        >>>
+        >>> # Find large files for cleanup
+        >>> discovery = FileDiscoveryNode(
+        ...     search_paths=['/var/log', '/tmp'],
+        ...     min_size_mb=100,
+        ...     older_than_days=30,
+        ...     include_checksums=True
+        ... )
+        >>> result = discovery.execute()
+        >>> large_files = result['discovered_files']
+    """
+
+    def get_parameters(self) -> dict[str, NodeParameter]:
+        return {
+            "search_paths": NodeParameter(
+                name="search_paths",
+                type=list,
+                required=True,
+                description="List of paths to search for files",
+            ),
+            "file_patterns": NodeParameter(
+                name="file_patterns",
+                type=list,
+                required=False,
+                default=["*"],
+                description="File name patterns to match (glob-style)",
+            ),
+            "exclude_patterns": NodeParameter(
+                name="exclude_patterns",
+                type=list,
+                required=False,
+                default=[],
+                description="File name patterns to exclude",
+            ),
+            "max_depth": NodeParameter(
+                name="max_depth",
+                type=int,
+                required=False,
+                default=10,
+                description="Maximum directory depth to search",
+            ),
+            "include_metadata": NodeParameter(
+                name="include_metadata",
+                type=bool,
+                required=False,
+                default=True,
+                description="Include detailed file metadata",
+            ),
+            "include_checksums": NodeParameter(
+                name="include_checksums",
+                type=bool,
+                required=False,
+                default=False,
+                description="Calculate file checksums (slower but more thorough)",
+            ),
+            "min_size_mb": NodeParameter(
+                name="min_size_mb",
+                type=float,
+                required=False,
+                description="Minimum file size in megabytes",
+            ),
+            "max_size_mb": NodeParameter(
+                name="max_size_mb",
+                type=float,
+                required=False,
+                description="Maximum file size in megabytes",
+            ),
+            "older_than_days": NodeParameter(
+                name="older_than_days",
+                type=int,
+                required=False,
+                description="Only include files older than N days",
+            ),
+            "newer_than_days": NodeParameter(
+                name="newer_than_days",
+                type=int,
+                required=False,
+                description="Only include files newer than N days",
+            ),
+            "follow_symlinks": NodeParameter(
+                name="follow_symlinks",
+                type=bool,
+                required=False,
+                default=False,
+                description="Follow symbolic links during traversal",
+            ),
+        }
+
+    def run(self, **kwargs) -> dict[str, Any]:
+        search_paths = kwargs["search_paths"]
+        file_patterns = kwargs.get("file_patterns", ["*"])
+        exclude_patterns = kwargs.get("exclude_patterns", [])
+        max_depth = kwargs.get("max_depth", 10)
+        include_metadata = kwargs.get("include_metadata", True)
+        include_checksums = kwargs.get("include_checksums", False)
+        min_size_mb = kwargs.get("min_size_mb")
+        max_size_mb = kwargs.get("max_size_mb")
+        older_than_days = kwargs.get("older_than_days")
+        newer_than_days = kwargs.get("newer_than_days")
+        follow_symlinks = kwargs.get("follow_symlinks", False)
+
+        start_time = time.time()
+        discovered_files = []
+        discovery_stats = {
+            "total_directories_scanned": 0,
+            "total_files_found": 0,
+            "total_files_matching": 0,
+            "access_errors": 0,
+            "broken_symlinks": 0,
+        }
+
+        for search_path in search_paths:
+            try:
+                path_files, path_stats = self._discover_files_in_path(
+                    search_path=search_path,
+                    file_patterns=file_patterns,
+                    exclude_patterns=exclude_patterns,
+                    max_depth=max_depth,
+                    include_metadata=include_metadata,
+                    include_checksums=include_checksums,
+                    min_size_mb=min_size_mb,
+                    max_size_mb=max_size_mb,
+                    older_than_days=older_than_days,
+                    newer_than_days=newer_than_days,
+                    follow_symlinks=follow_symlinks,
+                )
+
+                discovered_files.extend(path_files)
+
+                # Aggregate stats
+                for key, value in path_stats.items():
+                    discovery_stats[key] += value
+
+            except Exception as e:
+                discovery_stats["access_errors"] += 1
+                # Add error entry to results
+                discovered_files.append(
+                    {
+                        "type": "discovery_error",
+                        "path": search_path,
+                        "error": str(e),
+                        "timestamp": datetime.now(UTC).isoformat() + "Z",
+                    }
+                )
+
+        execution_time = time.time() - start_time
+
+        # Generate summary
+        summary = self._generate_discovery_summary(
+            discovered_files, discovery_stats, execution_time
+        )
+
+        return {
+            "discovered_files": discovered_files,
+            "discovery_summary": summary,
+            "discovery_stats": discovery_stats,
+            "total_files": len(
+                [f for f in discovered_files if f.get("type") != "discovery_error"]
+            ),
+            "execution_time": execution_time,
+            "timestamp": datetime.now(UTC).isoformat() + "Z",
+        }
+
+    def _discover_files_in_path(
+        self,
+        search_path: str,
+        file_patterns: list[str],
+        exclude_patterns: list[str],
+        max_depth: int,
+        include_metadata: bool,
+        include_checksums: bool,
+        min_size_mb: float | None,
+        max_size_mb: float | None,
+        older_than_days: int | None,
+        newer_than_days: int | None,
+        follow_symlinks: bool,
+    ) -> tuple[list[dict[str, Any]], dict[str, int]]:
+        """Discover files in a specific path."""
+
+        discovered_files = []
+        stats = {
+            "total_directories_scanned": 0,
+            "total_files_found": 0,
+            "total_files_matching": 0,
+            "access_errors": 0,
+            "broken_symlinks": 0,
+        }
+
+        try:
+            search_path_obj = Path(search_path)
+            if not search_path_obj.exists():
+                raise FileNotFoundError(f"Search path does not exist: {search_path}")
+
+            # Walk the directory tree
+            for root, dirs, files in os.walk(search_path, followlinks=follow_symlinks):
+                current_depth = len(Path(root).relative_to(search_path_obj).parts)
+
+                # Skip if max depth exceeded
+                if current_depth > max_depth:
+                    dirs[:] = []  # Don't descend further
+                    continue
+
+                stats["total_directories_scanned"] += 1
+
+                for file_name in files:
+                    file_path = os.path.join(root, file_name)
+                    stats["total_files_found"] += 1
+
+                    try:
+                        # Check if file matches patterns
+                        if not self._matches_patterns(
+                            file_name, file_patterns, exclude_patterns
+                        ):
+                            continue
+
+                        file_info = self._analyze_file(
+                            file_path=file_path,
+                            include_metadata=include_metadata,
+                            include_checksums=include_checksums,
+                        )
+
+                        # Apply size filters
+                        if min_size_mb is not None:
+                            if file_info.get("size_mb", 0) < min_size_mb:
+                                continue
+
+                        if max_size_mb is not None:
+                            if file_info.get("size_mb", 0) > max_size_mb:
+                                continue
+
+                        # Apply date filters
+                        if older_than_days is not None or newer_than_days is not None:
+                            if not self._matches_date_criteria(
+                                file_info, older_than_days, newer_than_days
+                            ):
+                                continue
+
+                        discovered_files.append(file_info)
+                        stats["total_files_matching"] += 1
+
+                    except (OSError, PermissionError) as e:
+                        stats["access_errors"] += 1
+                        # Add error info for this specific file
+                        discovered_files.append(
+                            {
+                                "type": "file_access_error",
+                                "path": file_path,
+                                "name": file_name,
+                                "error": str(e),
+                                "timestamp": datetime.now(UTC).isoformat() + "Z",
+                            }
+                        )
+
+        except Exception:
+            stats["access_errors"] += 1
+            raise
+
+        return discovered_files, stats
+
+    def _matches_patterns(
+        self, file_name: str, include_patterns: list[str], exclude_patterns: list[str]
+    ) -> bool:
+        """Check if filename matches include patterns and doesn't match exclude patterns."""
+        import fnmatch
+
+        # Check exclude patterns first
+        for pattern in exclude_patterns:
+            if fnmatch.fnmatch(file_name, pattern):
+                return False
+
+        # Check include patterns
+        if not include_patterns or include_patterns == ["*"]:
+            return True
+
+        for pattern in include_patterns:
+            if fnmatch.fnmatch(file_name, pattern):
+                return True
+
+        return False
+
+    def _analyze_file(
+        self, file_path: str, include_metadata: bool, include_checksums: bool
+    ) -> dict[str, Any]:
+        """Analyze a single file and return its information."""
+
+        file_path_obj = Path(file_path)
+        file_info = {
+            "type": "file",
+            "path": str(file_path),
+            "name": file_path_obj.name,
+            "directory": str(file_path_obj.parent),
+        }
+
+        try:
+            # Basic file stats
+            stat_info = file_path_obj.stat()
+
+            file_info.update(
+                {
+                    "size_bytes": stat_info.st_size,
+                    "size_mb": stat_info.st_size / (1024 * 1024),
+                    "created_timestamp": stat_info.st_ctime,
+                    "modified_timestamp": stat_info.st_mtime,
+                    "accessed_timestamp": stat_info.st_atime,
+                    "created_date": datetime.fromtimestamp(
+                        stat_info.st_ctime, UTC
+                    ).isoformat()
+                    + "Z",
+                    "modified_date": datetime.fromtimestamp(
+                        stat_info.st_mtime, UTC
+                    ).isoformat()
+                    + "Z",
+                    "accessed_date": datetime.fromtimestamp(
+                        stat_info.st_atime, UTC
+                    ).isoformat()
+                    + "Z",
+                }
+            )
+
+            if include_metadata:
+                # File type detection
+                mime_type, encoding = mimetypes.guess_type(file_path)
+                file_info.update(
+                    {
+                        "mime_type": mime_type,
+                        "encoding": encoding,
+                        "extension": file_path_obj.suffix.lower(),
+                    }
+                )
+
+                # File permissions
+                file_info.update(
+                    {
+                        "permissions": oct(stat_info.st_mode)[-3:],
+                        "owner_uid": stat_info.st_uid,
+                        "group_gid": stat_info.st_gid,
+                        "is_readable": os.access(file_path, os.R_OK),
+                        "is_writable": os.access(file_path, os.W_OK),
+                        "is_executable": os.access(file_path, os.X_OK),
+                    }
+                )
+
+                # Symbolic link detection
+                if file_path_obj.is_symlink():
+                    try:
+                        link_target = os.readlink(file_path)
+                        file_info.update(
+                            {
+                                "is_symlink": True,
+                                "link_target": link_target,
+                                "link_target_exists": os.path.exists(link_target),
+                            }
+                        )
+                    except OSError:
+                        file_info.update(
+                            {
+                                "is_symlink": True,
+                                "link_target": None,
+                                "link_target_exists": False,
+                            }
+                        )
+                else:
+                    file_info["is_symlink"] = False
+
+                # Content analysis for text files
+                if mime_type and mime_type.startswith("text/"):
+                    try:
+                        with open(file_path, encoding="utf-8", errors="ignore") as f:
+                            content_sample = f.read(1024)  # Read first 1KB
+                        file_info.update(
+                            {
+                                "line_count": len(content_sample.splitlines()),
+                                "character_count": len(content_sample),
+                                "content_sample": (
+                                    content_sample[:200] + "..."
+                                    if len(content_sample) > 200
+                                    else content_sample
+                                ),
+                            }
+                        )
+                    except (UnicodeDecodeError, PermissionError):
+                        pass
+
+            if include_checksums:
+                # Calculate file hashes
+                file_info.update(self._calculate_checksums(file_path))
+
+        except (OSError, PermissionError) as e:
+            file_info.update(
+                {
+                    "error": str(e),
+                    "accessible": False,
+                }
+            )
+
+        file_info["timestamp"] = datetime.now(UTC).isoformat() + "Z"
+        return file_info
+
+    def _calculate_checksums(self, file_path: str) -> dict[str, str]:
+        """Calculate MD5 and SHA256 checksums for a file."""
+        checksums = {}
+
+        try:
+            md5_hash = hashlib.md5()
+            sha256_hash = hashlib.sha256()
+
+            with open(file_path, "rb") as f:
+                # Read file in chunks to handle large files efficiently
+                for chunk in iter(lambda: f.read(4096), b""):
+                    md5_hash.update(chunk)
+                    sha256_hash.update(chunk)
+
+            checksums.update(
+                {
+                    "md5": md5_hash.hexdigest(),
+                    "sha256": sha256_hash.hexdigest(),
+                }
+            )
+        except (OSError, PermissionError) as e:
+            checksums.update(
+                {
+                    "checksum_error": str(e),
+                }
+            )
+
+        return checksums
+
+    def _matches_date_criteria(
+        self,
+        file_info: dict[str, Any],
+        older_than_days: int | None,
+        newer_than_days: int | None,
+    ) -> bool:
+        """Check if file matches date criteria."""
+
+        modified_timestamp = file_info.get("modified_timestamp")
+        if modified_timestamp is None:
+            return True
+
+        now = time.time()
+        file_age_days = (now - modified_timestamp) / (24 * 3600)
+
+        if older_than_days is not None and file_age_days < older_than_days:
+            return False
+
+        if newer_than_days is not None and file_age_days > newer_than_days:
+            return False
+
+        return True
+
+    def _generate_discovery_summary(
+        self,
+        discovered_files: list[dict],
+        discovery_stats: dict[str, int],
+        execution_time: float,
+    ) -> dict[str, Any]:
+        """Generate summary of file discovery results."""
+
+        # Count files by type/extension
+        extension_counts = {}
+        mime_type_counts = {}
+        size_distribution = {"small": 0, "medium": 0, "large": 0, "very_large": 0}
+
+        total_size_mb = 0
+        error_count = 0
+
+        for file_info in discovered_files:
+            if file_info.get("type") in ["discovery_error", "file_access_error"]:
+                error_count += 1
+                continue
+
+            # Extension analysis
+            extension = file_info.get("extension", "")
+            extension_counts[extension] = extension_counts.get(extension, 0) + 1
+
+            # MIME type analysis
+            mime_type = file_info.get("mime_type", "unknown")
+            mime_type_counts[mime_type] = mime_type_counts.get(mime_type, 0) + 1
+
+            # Size distribution
+            size_mb = file_info.get("size_mb", 0)
+            total_size_mb += size_mb
+
+            if size_mb < 1:
+                size_distribution["small"] += 1
+            elif size_mb < 50:
+                size_distribution["medium"] += 1
+            elif size_mb < 500:
+                size_distribution["large"] += 1
+            else:
+                size_distribution["very_large"] += 1
+
+        # Find largest files
+        file_sizes = [
+            (f.get("size_mb", 0), f.get("path", ""))
+            for f in discovered_files
+            if f.get("type") == "file"
+        ]
+        largest_files = sorted(file_sizes, reverse=True)[:10]
+
+        return {
+            "execution_time": execution_time,
+            "total_files_discovered": len(discovered_files) - error_count,
+            "total_errors": error_count,
+            "total_size_mb": total_size_mb,
+            "average_file_size_mb": total_size_mb
+            / max(1, len(discovered_files) - error_count),
+            "extension_distribution": dict(
+                sorted(extension_counts.items(), key=lambda x: x[1], reverse=True)
+            ),
+            "mime_type_distribution": dict(
+                sorted(mime_type_counts.items(), key=lambda x: x[1], reverse=True)
+            ),
+            "size_distribution": size_distribution,
+            "largest_files": [
+                {"size_mb": size, "path": path} for size, path in largest_files[:5]
+            ],
+            "discovery_stats": discovery_stats,
+        }
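
The docstring examples above translate into standalone usage along the following lines. This is a minimal, untested sketch based only on the constructor keywords, the execute() entry point, and the result keys visible in the new file; paths and thresholds are illustrative, not part of the package. Note that the module as written imports UTC from datetime and uses X | None annotations, so it appears to assume Python 3.11 or later.

    from kailash.nodes.data.file_discovery import FileDiscoveryNode

    # Hypothetical cleanup scan; adjust paths and limits to your environment.
    discovery = FileDiscoveryNode(
        search_paths=["/var/log", "/tmp"],
        min_size_mb=100,          # only files of at least 100 MB
        older_than_days=30,       # last modified more than 30 days ago
        include_checksums=True,   # also compute MD5/SHA256 digests
    )
    result = discovery.execute()

    for entry in result["discovered_files"]:
        if entry.get("type") == "file":
            print(entry["path"], round(entry.get("size_mb", 0), 1), "MB")

    # The summary includes aggregate stats such as the largest files found.
    print(result["discovery_summary"]["largest_files"])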
kailash/nodes/data/readers.py
@@ -30,7 +30,7 @@ Downstream Consumers:
 
 import csv
 import json
-from typing import Any, Dict
+from typing import Any
 
 from kailash.nodes.base import Node, NodeParameter, register_node
 from kailash.security import safe_open, validate_file_path
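
The remaining hunks in this file apply the same mechanical change: typing.Dict annotations become the built-in dict generic (PEP 585, Python 3.9+). Illustrative pattern only, not code from the package:

    from typing import Any

    class Example:
        # 0.2.2 style: def run(self, **kwargs) -> Dict[str, Any]
        # 0.3.1 style: built-in generic, no typing.Dict import needed
        def run(self, **kwargs) -> dict[str, Any]:
            return dict(kwargs)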
@@ -146,7 +146,7 @@ class CSVReaderNode(Node):
         ... )
     """
 
-    def get_parameters(self) -> Dict[str, NodeParameter]:
+    def get_parameters(self) -> dict[str, NodeParameter]:
         """Define input parameters for CSV reading.
 
         This method specifies the configuration options for reading CSV files,
@@ -197,7 +197,7 @@ class CSVReaderNode(Node):
             ),
         }
 
-    def run(self, **kwargs) -> Dict[str, Any]:
+    def run(self, **kwargs) -> dict[str, Any]:
         """Execute CSV reading operation.
 
         This method performs the actual file reading, handling both headerless
@@ -272,7 +272,7 @@ class CSVReaderNode(Node):
             index_pos = header_row.index(index_column) if index_column else None
 
             for row in reader:
-                row_dict = dict(zip(header_row, row))
+                row_dict = dict(zip(header_row, row, strict=False))
                 data.append(row_dict)
 
                 # If index column specified, add to indexed dictionary
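
The strict=False argument (available since Python 3.10) makes zip()'s long-standing truncation behavior explicit: a row with more or fewer cells than the header is silently truncated to the shorter input rather than raising. A small illustration of the difference, not package code:

    header = ["id", "name"]
    row = ["1", "Alice", "unexpected-extra-cell"]

    print(dict(zip(header, row, strict=False)))  # {'id': '1', 'name': 'Alice'}
    try:
        dict(zip(header, row, strict=True))      # length mismatch raises
    except ValueError as exc:
        print("strict zip rejected the row:", exc)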
@@ -345,7 +345,7 @@ class JSONReaderNode(Node):
         # }
     """
 
-    def get_parameters(self) -> Dict[str, NodeParameter]:
+    def get_parameters(self) -> dict[str, NodeParameter]:
         """Define input parameters for JSON reading.
 
         Simple parameter definition reflecting JSON's self-describing nature.
@@ -368,7 +368,7 @@ class JSONReaderNode(Node):
             )
         }
 
-    def run(self, **kwargs) -> Dict[str, Any]:
+    def run(self, **kwargs) -> dict[str, Any]:
         """Execute JSON reading operation.
 
         Reads and parses JSON file, preserving the original structure
@@ -472,7 +472,7 @@ class TextReaderNode(Node):
         >>> # result['text'] = "2024-01-01 INFO: Application started\\n..."
     """
 
-    def get_parameters(self) -> Dict[str, NodeParameter]:
+    def get_parameters(self) -> dict[str, NodeParameter]:
         """Define input parameters for text reading.
 
         Provides essential parameters for text file reading with
@@ -507,7 +507,7 @@ class TextReaderNode(Node):
             ),
         }
 
-    def run(self, **kwargs) -> Dict[str, Any]:
+    def run(self, **kwargs) -> dict[str, Any]:
         """Execute text reading operation.
 
         Reads entire text file into memory as a single string,