kailash 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +1 -1
- kailash/access_control.py +40 -39
- kailash/api/auth.py +26 -32
- kailash/api/custom_nodes.py +29 -29
- kailash/api/custom_nodes_secure.py +35 -35
- kailash/api/database.py +17 -17
- kailash/api/gateway.py +19 -19
- kailash/api/mcp_integration.py +24 -23
- kailash/api/studio.py +45 -45
- kailash/api/workflow_api.py +8 -8
- kailash/cli/commands.py +5 -8
- kailash/manifest.py +42 -42
- kailash/mcp/__init__.py +1 -1
- kailash/mcp/ai_registry_server.py +20 -20
- kailash/mcp/client.py +9 -11
- kailash/mcp/client_new.py +10 -10
- kailash/mcp/server.py +1 -2
- kailash/mcp/server_enhanced.py +449 -0
- kailash/mcp/servers/ai_registry.py +6 -6
- kailash/mcp/utils/__init__.py +31 -0
- kailash/mcp/utils/cache.py +267 -0
- kailash/mcp/utils/config.py +263 -0
- kailash/mcp/utils/formatters.py +293 -0
- kailash/mcp/utils/metrics.py +418 -0
- kailash/nodes/ai/agents.py +9 -9
- kailash/nodes/ai/ai_providers.py +33 -34
- kailash/nodes/ai/embedding_generator.py +31 -32
- kailash/nodes/ai/intelligent_agent_orchestrator.py +62 -66
- kailash/nodes/ai/iterative_llm_agent.py +48 -48
- kailash/nodes/ai/llm_agent.py +32 -33
- kailash/nodes/ai/models.py +13 -13
- kailash/nodes/ai/self_organizing.py +44 -44
- kailash/nodes/api/__init__.py +5 -0
- kailash/nodes/api/auth.py +11 -11
- kailash/nodes/api/graphql.py +13 -13
- kailash/nodes/api/http.py +19 -19
- kailash/nodes/api/monitoring.py +463 -0
- kailash/nodes/api/rate_limiting.py +9 -13
- kailash/nodes/api/rest.py +29 -29
- kailash/nodes/api/security.py +819 -0
- kailash/nodes/base.py +24 -26
- kailash/nodes/base_async.py +7 -7
- kailash/nodes/base_cycle_aware.py +12 -12
- kailash/nodes/base_with_acl.py +5 -5
- kailash/nodes/code/python.py +56 -55
- kailash/nodes/data/__init__.py +6 -0
- kailash/nodes/data/directory.py +6 -6
- kailash/nodes/data/event_generation.py +297 -0
- kailash/nodes/data/file_discovery.py +598 -0
- kailash/nodes/data/readers.py +8 -8
- kailash/nodes/data/retrieval.py +10 -10
- kailash/nodes/data/sharepoint_graph.py +17 -17
- kailash/nodes/data/sources.py +5 -5
- kailash/nodes/data/sql.py +13 -13
- kailash/nodes/data/streaming.py +25 -25
- kailash/nodes/data/vector_db.py +22 -22
- kailash/nodes/data/writers.py +7 -7
- kailash/nodes/logic/async_operations.py +17 -17
- kailash/nodes/logic/convergence.py +11 -11
- kailash/nodes/logic/loop.py +4 -4
- kailash/nodes/logic/operations.py +11 -11
- kailash/nodes/logic/workflow.py +8 -9
- kailash/nodes/mixins/mcp.py +17 -17
- kailash/nodes/mixins.py +8 -10
- kailash/nodes/transform/chunkers.py +3 -3
- kailash/nodes/transform/formatters.py +7 -7
- kailash/nodes/transform/processors.py +11 -11
- kailash/runtime/access_controlled.py +18 -18
- kailash/runtime/async_local.py +18 -20
- kailash/runtime/docker.py +24 -26
- kailash/runtime/local.py +55 -31
- kailash/runtime/parallel.py +25 -25
- kailash/runtime/parallel_cyclic.py +29 -29
- kailash/runtime/runner.py +6 -6
- kailash/runtime/testing.py +22 -22
- kailash/sdk_exceptions.py +0 -58
- kailash/security.py +14 -26
- kailash/tracking/manager.py +38 -38
- kailash/tracking/metrics_collector.py +15 -14
- kailash/tracking/models.py +53 -53
- kailash/tracking/storage/base.py +7 -17
- kailash/tracking/storage/database.py +22 -23
- kailash/tracking/storage/filesystem.py +38 -40
- kailash/utils/export.py +21 -21
- kailash/utils/templates.py +8 -9
- kailash/visualization/api.py +30 -34
- kailash/visualization/dashboard.py +17 -17
- kailash/visualization/performance.py +32 -19
- kailash/visualization/reports.py +30 -28
- kailash/workflow/builder.py +8 -8
- kailash/workflow/convergence.py +13 -12
- kailash/workflow/cycle_analyzer.py +38 -33
- kailash/workflow/cycle_builder.py +12 -12
- kailash/workflow/cycle_config.py +16 -15
- kailash/workflow/cycle_debugger.py +40 -40
- kailash/workflow/cycle_exceptions.py +29 -29
- kailash/workflow/cycle_profiler.py +21 -21
- kailash/workflow/cycle_state.py +20 -22
- kailash/workflow/cyclic_runner.py +45 -45
- kailash/workflow/graph.py +57 -45
- kailash/workflow/mermaid_visualizer.py +9 -11
- kailash/workflow/migration.py +22 -22
- kailash/workflow/mock_registry.py +6 -6
- kailash/workflow/runner.py +9 -9
- kailash/workflow/safety.py +12 -13
- kailash/workflow/state.py +8 -11
- kailash/workflow/templates.py +19 -19
- kailash/workflow/validation.py +14 -14
- kailash/workflow/visualization.py +32 -24
- kailash-0.3.1.dist-info/METADATA +476 -0
- kailash-0.3.1.dist-info/RECORD +136 -0
- kailash-0.2.2.dist-info/METADATA +0 -121
- kailash-0.2.2.dist-info/RECORD +0 -126
- {kailash-0.2.2.dist-info → kailash-0.3.1.dist-info}/WHEEL +0 -0
- {kailash-0.2.2.dist-info → kailash-0.3.1.dist-info}/entry_points.txt +0 -0
- {kailash-0.2.2.dist-info → kailash-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.2.2.dist-info → kailash-0.3.1.dist-info}/top_level.txt +0 -0
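The file-by-file views below reproduce only two of the modules listed above (the new kailash/nodes/data/file_discovery.py and the modified kailash/nodes/data/readers.py). To inspect any of the other entries, the same wheel-to-wheel comparison can be rebuilt locally; the following is a minimal sketch, not part of the package, that assumes both wheels have already been downloaded into the working directory (for example with "pip download kailash==0.2.2 --no-deps" and "pip download kailash==0.3.1 --no-deps").

import difflib
import zipfile

OLD = "kailash-0.2.2-py3-none-any.whl"
NEW = "kailash-0.3.1-py3-none-any.whl"

with zipfile.ZipFile(OLD) as old_whl, zipfile.ZipFile(NEW) as new_whl:
    old_names, new_names = set(old_whl.namelist()), set(new_whl.namelist())

    # Files that exist in only one of the two wheels
    print("added:  ", sorted(new_names - old_names))
    print("removed:", sorted(old_names - new_names))

    # Unified diff for one module that changed between the versions
    target = "kailash/nodes/data/readers.py"
    old_src = old_whl.read(target).decode("utf-8").splitlines(keepends=True)
    new_src = new_whl.read(target).decode("utf-8").splitlines(keepends=True)
    print("".join(difflib.unified_diff(old_src, new_src, OLD, NEW)))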
kailash/nodes/data/file_discovery.py
ADDED
@@ -0,0 +1,598 @@
"""File discovery and analysis nodes for file system operations."""

import hashlib
import mimetypes
import os
import time
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

from kailash.nodes.base import Node, NodeParameter, register_node


@register_node()
class FileDiscoveryNode(Node):
    """
    Discovers and analyzes files and directories in the file system.

    This node provides comprehensive file discovery capabilities, replacing
    DataTransformer with embedded Python code for file processing tasks.
    It can scan directories, analyze file properties, detect file types,
    and generate detailed file system reports.

    Design Philosophy:
        File system operations require robust discovery and analysis capabilities.
        This node eliminates the need for custom file processing code in
        DataTransformer nodes by providing dedicated, configurable file
        discovery with filtering, analysis, and reporting features.

    Upstream Dependencies:
        - Path configuration nodes
        - Filter criteria nodes
        - Authentication/permission nodes
        - Schedule/trigger nodes

    Downstream Consumers:
        - File processing nodes
        - Content analysis nodes
        - Backup and archival nodes
        - Security scanning nodes
        - Compliance reporting nodes

    Configuration:
        - Search paths and patterns
        - File type filters
        - Size and date criteria
        - Analysis depth and options
        - Output format preferences

    Implementation Details:
        - Recursive directory traversal
        - File metadata extraction
        - Content type detection
        - Permission and ownership analysis
        - Hash calculation for integrity

    Error Handling:
        - Permission denied gracefully handled
        - Broken symlinks detected
        - Invalid paths reported
        - Partial results on errors

    Side Effects:
        - File system access (read-only by default)
        - Temporary file creation for analysis
        - Metadata caching for performance
        - Logging of discovery activities

    Examples:
        >>> # Discover all Python files in a project
        >>> discovery = FileDiscoveryNode(
        ...     search_paths=['/path/to/project'],
        ...     file_patterns=['*.py'],
        ...     include_metadata=True,
        ...     max_depth=5
        ... )
        >>> result = discovery.execute()
        >>> assert 'discovered_files' in result
        >>> assert all(f['name'].endswith('.py') for f in result['discovered_files'])
        >>>
        >>> # Find large files for cleanup
        >>> discovery = FileDiscoveryNode(
        ...     search_paths=['/var/log', '/tmp'],
        ...     min_size_mb=100,
        ...     older_than_days=30,
        ...     include_checksums=True
        ... )
        >>> result = discovery.execute()
        >>> large_files = result['discovered_files']
    """

    def get_parameters(self) -> dict[str, NodeParameter]:
        return {
            "search_paths": NodeParameter(
                name="search_paths",
                type=list,
                required=True,
                description="List of paths to search for files",
            ),
            "file_patterns": NodeParameter(
                name="file_patterns",
                type=list,
                required=False,
                default=["*"],
                description="File name patterns to match (glob-style)",
            ),
            "exclude_patterns": NodeParameter(
                name="exclude_patterns",
                type=list,
                required=False,
                default=[],
                description="File name patterns to exclude",
            ),
            "max_depth": NodeParameter(
                name="max_depth",
                type=int,
                required=False,
                default=10,
                description="Maximum directory depth to search",
            ),
            "include_metadata": NodeParameter(
                name="include_metadata",
                type=bool,
                required=False,
                default=True,
                description="Include detailed file metadata",
            ),
            "include_checksums": NodeParameter(
                name="include_checksums",
                type=bool,
                required=False,
                default=False,
                description="Calculate file checksums (slower but more thorough)",
            ),
            "min_size_mb": NodeParameter(
                name="min_size_mb",
                type=float,
                required=False,
                description="Minimum file size in megabytes",
            ),
            "max_size_mb": NodeParameter(
                name="max_size_mb",
                type=float,
                required=False,
                description="Maximum file size in megabytes",
            ),
            "older_than_days": NodeParameter(
                name="older_than_days",
                type=int,
                required=False,
                description="Only include files older than N days",
            ),
            "newer_than_days": NodeParameter(
                name="newer_than_days",
                type=int,
                required=False,
                description="Only include files newer than N days",
            ),
            "follow_symlinks": NodeParameter(
                name="follow_symlinks",
                type=bool,
                required=False,
                default=False,
                description="Follow symbolic links during traversal",
            ),
        }

    def run(self, **kwargs) -> dict[str, Any]:
        search_paths = kwargs["search_paths"]
        file_patterns = kwargs.get("file_patterns", ["*"])
        exclude_patterns = kwargs.get("exclude_patterns", [])
        max_depth = kwargs.get("max_depth", 10)
        include_metadata = kwargs.get("include_metadata", True)
        include_checksums = kwargs.get("include_checksums", False)
        min_size_mb = kwargs.get("min_size_mb")
        max_size_mb = kwargs.get("max_size_mb")
        older_than_days = kwargs.get("older_than_days")
        newer_than_days = kwargs.get("newer_than_days")
        follow_symlinks = kwargs.get("follow_symlinks", False)

        start_time = time.time()
        discovered_files = []
        discovery_stats = {
            "total_directories_scanned": 0,
            "total_files_found": 0,
            "total_files_matching": 0,
            "access_errors": 0,
            "broken_symlinks": 0,
        }

        for search_path in search_paths:
            try:
                path_files, path_stats = self._discover_files_in_path(
                    search_path=search_path,
                    file_patterns=file_patterns,
                    exclude_patterns=exclude_patterns,
                    max_depth=max_depth,
                    include_metadata=include_metadata,
                    include_checksums=include_checksums,
                    min_size_mb=min_size_mb,
                    max_size_mb=max_size_mb,
                    older_than_days=older_than_days,
                    newer_than_days=newer_than_days,
                    follow_symlinks=follow_symlinks,
                )

                discovered_files.extend(path_files)

                # Aggregate stats
                for key, value in path_stats.items():
                    discovery_stats[key] += value

            except Exception as e:
                discovery_stats["access_errors"] += 1
                # Add error entry to results
                discovered_files.append(
                    {
                        "type": "discovery_error",
                        "path": search_path,
                        "error": str(e),
                        "timestamp": datetime.now(UTC).isoformat() + "Z",
                    }
                )

        execution_time = time.time() - start_time

        # Generate summary
        summary = self._generate_discovery_summary(
            discovered_files, discovery_stats, execution_time
        )

        return {
            "discovered_files": discovered_files,
            "discovery_summary": summary,
            "discovery_stats": discovery_stats,
            "total_files": len(
                [f for f in discovered_files if f.get("type") != "discovery_error"]
            ),
            "execution_time": execution_time,
            "timestamp": datetime.now(UTC).isoformat() + "Z",
        }

    def _discover_files_in_path(
        self,
        search_path: str,
        file_patterns: list[str],
        exclude_patterns: list[str],
        max_depth: int,
        include_metadata: bool,
        include_checksums: bool,
        min_size_mb: float | None,
        max_size_mb: float | None,
        older_than_days: int | None,
        newer_than_days: int | None,
        follow_symlinks: bool,
    ) -> tuple[list[dict[str, Any]], dict[str, int]]:
        """Discover files in a specific path."""

        discovered_files = []
        stats = {
            "total_directories_scanned": 0,
            "total_files_found": 0,
            "total_files_matching": 0,
            "access_errors": 0,
            "broken_symlinks": 0,
        }

        try:
            search_path_obj = Path(search_path)
            if not search_path_obj.exists():
                raise FileNotFoundError(f"Search path does not exist: {search_path}")

            # Walk the directory tree
            for root, dirs, files in os.walk(search_path, followlinks=follow_symlinks):
                current_depth = len(Path(root).relative_to(search_path_obj).parts)

                # Skip if max depth exceeded
                if current_depth > max_depth:
                    dirs[:] = []  # Don't descend further
                    continue

                stats["total_directories_scanned"] += 1

                for file_name in files:
                    file_path = os.path.join(root, file_name)
                    stats["total_files_found"] += 1

                    try:
                        # Check if file matches patterns
                        if not self._matches_patterns(
                            file_name, file_patterns, exclude_patterns
                        ):
                            continue

                        file_info = self._analyze_file(
                            file_path=file_path,
                            include_metadata=include_metadata,
                            include_checksums=include_checksums,
                        )

                        # Apply size filters
                        if min_size_mb is not None:
                            if file_info.get("size_mb", 0) < min_size_mb:
                                continue

                        if max_size_mb is not None:
                            if file_info.get("size_mb", 0) > max_size_mb:
                                continue

                        # Apply date filters
                        if older_than_days is not None or newer_than_days is not None:
                            if not self._matches_date_criteria(
                                file_info, older_than_days, newer_than_days
                            ):
                                continue

                        discovered_files.append(file_info)
                        stats["total_files_matching"] += 1

                    except (OSError, PermissionError) as e:
                        stats["access_errors"] += 1
                        # Add error info for this specific file
                        discovered_files.append(
                            {
                                "type": "file_access_error",
                                "path": file_path,
                                "name": file_name,
                                "error": str(e),
                                "timestamp": datetime.now(UTC).isoformat() + "Z",
                            }
                        )

        except Exception:
            stats["access_errors"] += 1
            raise

        return discovered_files, stats

    def _matches_patterns(
        self, file_name: str, include_patterns: list[str], exclude_patterns: list[str]
    ) -> bool:
        """Check if filename matches include patterns and doesn't match exclude patterns."""
        import fnmatch

        # Check exclude patterns first
        for pattern in exclude_patterns:
            if fnmatch.fnmatch(file_name, pattern):
                return False

        # Check include patterns
        if not include_patterns or include_patterns == ["*"]:
            return True

        for pattern in include_patterns:
            if fnmatch.fnmatch(file_name, pattern):
                return True

        return False

    def _analyze_file(
        self, file_path: str, include_metadata: bool, include_checksums: bool
    ) -> dict[str, Any]:
        """Analyze a single file and return its information."""

        file_path_obj = Path(file_path)
        file_info = {
            "type": "file",
            "path": str(file_path),
            "name": file_path_obj.name,
            "directory": str(file_path_obj.parent),
        }

        try:
            # Basic file stats
            stat_info = file_path_obj.stat()

            file_info.update(
                {
                    "size_bytes": stat_info.st_size,
                    "size_mb": stat_info.st_size / (1024 * 1024),
                    "created_timestamp": stat_info.st_ctime,
                    "modified_timestamp": stat_info.st_mtime,
                    "accessed_timestamp": stat_info.st_atime,
                    "created_date": datetime.fromtimestamp(
                        stat_info.st_ctime, UTC
                    ).isoformat()
                    + "Z",
                    "modified_date": datetime.fromtimestamp(
                        stat_info.st_mtime, UTC
                    ).isoformat()
                    + "Z",
                    "accessed_date": datetime.fromtimestamp(
                        stat_info.st_atime, UTC
                    ).isoformat()
                    + "Z",
                }
            )

            if include_metadata:
                # File type detection
                mime_type, encoding = mimetypes.guess_type(file_path)
                file_info.update(
                    {
                        "mime_type": mime_type,
                        "encoding": encoding,
                        "extension": file_path_obj.suffix.lower(),
                    }
                )

                # File permissions
                file_info.update(
                    {
                        "permissions": oct(stat_info.st_mode)[-3:],
                        "owner_uid": stat_info.st_uid,
                        "group_gid": stat_info.st_gid,
                        "is_readable": os.access(file_path, os.R_OK),
                        "is_writable": os.access(file_path, os.W_OK),
                        "is_executable": os.access(file_path, os.X_OK),
                    }
                )

                # Symbolic link detection
                if file_path_obj.is_symlink():
                    try:
                        link_target = os.readlink(file_path)
                        file_info.update(
                            {
                                "is_symlink": True,
                                "link_target": link_target,
                                "link_target_exists": os.path.exists(link_target),
                            }
                        )
                    except OSError:
                        file_info.update(
                            {
                                "is_symlink": True,
                                "link_target": None,
                                "link_target_exists": False,
                            }
                        )
                else:
                    file_info["is_symlink"] = False

                # Content analysis for text files
                if mime_type and mime_type.startswith("text/"):
                    try:
                        with open(file_path, encoding="utf-8", errors="ignore") as f:
                            content_sample = f.read(1024)  # Read first 1KB
                            file_info.update(
                                {
                                    "line_count": len(content_sample.splitlines()),
                                    "character_count": len(content_sample),
                                    "content_sample": (
                                        content_sample[:200] + "..."
                                        if len(content_sample) > 200
                                        else content_sample
                                    ),
                                }
                            )
                    except (UnicodeDecodeError, PermissionError):
                        pass

            if include_checksums:
                # Calculate file hashes
                file_info.update(self._calculate_checksums(file_path))

        except (OSError, PermissionError) as e:
            file_info.update(
                {
                    "error": str(e),
                    "accessible": False,
                }
            )

        file_info["timestamp"] = datetime.now(UTC).isoformat() + "Z"
        return file_info

    def _calculate_checksums(self, file_path: str) -> dict[str, str]:
        """Calculate MD5 and SHA256 checksums for a file."""
        checksums = {}

        try:
            md5_hash = hashlib.md5()
            sha256_hash = hashlib.sha256()

            with open(file_path, "rb") as f:
                # Read file in chunks to handle large files efficiently
                for chunk in iter(lambda: f.read(4096), b""):
                    md5_hash.update(chunk)
                    sha256_hash.update(chunk)

            checksums.update(
                {
                    "md5": md5_hash.hexdigest(),
                    "sha256": sha256_hash.hexdigest(),
                }
            )
        except (OSError, PermissionError) as e:
            checksums.update(
                {
                    "checksum_error": str(e),
                }
            )

        return checksums

    def _matches_date_criteria(
        self,
        file_info: dict[str, Any],
        older_than_days: int | None,
        newer_than_days: int | None,
    ) -> bool:
        """Check if file matches date criteria."""

        modified_timestamp = file_info.get("modified_timestamp")
        if modified_timestamp is None:
            return True

        now = time.time()
        file_age_days = (now - modified_timestamp) / (24 * 3600)

        if older_than_days is not None and file_age_days < older_than_days:
            return False

        if newer_than_days is not None and file_age_days > newer_than_days:
            return False

        return True

    def _generate_discovery_summary(
        self,
        discovered_files: list[dict],
        discovery_stats: dict[str, int],
        execution_time: float,
    ) -> dict[str, Any]:
        """Generate summary of file discovery results."""

        # Count files by type/extension
        extension_counts = {}
        mime_type_counts = {}
        size_distribution = {"small": 0, "medium": 0, "large": 0, "very_large": 0}

        total_size_mb = 0
        error_count = 0

        for file_info in discovered_files:
            if file_info.get("type") in ["discovery_error", "file_access_error"]:
                error_count += 1
                continue

            # Extension analysis
            extension = file_info.get("extension", "")
            extension_counts[extension] = extension_counts.get(extension, 0) + 1

            # MIME type analysis
            mime_type = file_info.get("mime_type", "unknown")
            mime_type_counts[mime_type] = mime_type_counts.get(mime_type, 0) + 1

            # Size distribution
            size_mb = file_info.get("size_mb", 0)
            total_size_mb += size_mb

            if size_mb < 1:
                size_distribution["small"] += 1
            elif size_mb < 50:
                size_distribution["medium"] += 1
            elif size_mb < 500:
                size_distribution["large"] += 1
            else:
                size_distribution["very_large"] += 1

        # Find largest files
        file_sizes = [
            (f.get("size_mb", 0), f.get("path", ""))
            for f in discovered_files
            if f.get("type") == "file"
        ]
        largest_files = sorted(file_sizes, reverse=True)[:10]

        return {
            "execution_time": execution_time,
            "total_files_discovered": len(discovered_files) - error_count,
            "total_errors": error_count,
            "total_size_mb": total_size_mb,
            "average_file_size_mb": total_size_mb
            / max(1, len(discovered_files) - error_count),
            "extension_distribution": dict(
                sorted(extension_counts.items(), key=lambda x: x[1], reverse=True)
            ),
            "mime_type_distribution": dict(
                sorted(mime_type_counts.items(), key=lambda x: x[1], reverse=True)
            ),
            "size_distribution": size_distribution,
            "largest_files": [
                {"size_mb": size, "path": path} for size, path in largest_files[:5]
            ],
            "discovery_stats": discovery_stats,
        }
kailash/nodes/data/readers.py
CHANGED
@@ -30,7 +30,7 @@ Downstream Consumers:
 
 import csv
 import json
-from typing import Any
+from typing import Any
 
 from kailash.nodes.base import Node, NodeParameter, register_node
 from kailash.security import safe_open, validate_file_path
@@ -146,7 +146,7 @@ class CSVReaderNode(Node):
         ... )
     """
 
-    def get_parameters(self) ->
+    def get_parameters(self) -> dict[str, NodeParameter]:
         """Define input parameters for CSV reading.
 
         This method specifies the configuration options for reading CSV files,
@@ -197,7 +197,7 @@ class CSVReaderNode(Node):
             ),
         }
 
-    def run(self, **kwargs) ->
+    def run(self, **kwargs) -> dict[str, Any]:
         """Execute CSV reading operation.
 
         This method performs the actual file reading, handling both headerless
@@ -272,7 +272,7 @@ class CSVReaderNode(Node):
                 index_pos = header_row.index(index_column) if index_column else None
 
                 for row in reader:
-                    row_dict = dict(zip(header_row, row))
+                    row_dict = dict(zip(header_row, row, strict=False))
                     data.append(row_dict)
 
                     # If index column specified, add to indexed dictionary
@@ -345,7 +345,7 @@ class JSONReaderNode(Node):
         # }
     """
 
-    def get_parameters(self) ->
+    def get_parameters(self) -> dict[str, NodeParameter]:
         """Define input parameters for JSON reading.
 
         Simple parameter definition reflecting JSON's self-describing nature.
@@ -368,7 +368,7 @@ class JSONReaderNode(Node):
             )
         }
 
-    def run(self, **kwargs) ->
+    def run(self, **kwargs) -> dict[str, Any]:
         """Execute JSON reading operation.
 
         Reads and parses JSON file, preserving the original structure
@@ -472,7 +472,7 @@ class TextReaderNode(Node):
         >>> # result['text'] = "2024-01-01 INFO: Application started\\n..."
     """
 
-    def get_parameters(self) ->
+    def get_parameters(self) -> dict[str, NodeParameter]:
         """Define input parameters for text reading.
 
         Provides essential parameters for text file reading with
@@ -507,7 +507,7 @@ class TextReaderNode(Node):
             ),
         }
 
-    def run(self, **kwargs) ->
+    def run(self, **kwargs) -> dict[str, Any]:
         """Execute text reading operation.
 
         Reads entire text file into memory as a single string,
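Beyond the signature updates (the new signatures spell out built-in generic return types, dict[str, NodeParameter] and dict[str, Any]), the one behavioral detail worth noting in these hunks is the explicit strict=False passed to zip() in CSV row parsing: it keeps zip's long-standing truncation semantics for ragged rows visible at the call site rather than raising, using the strict flag introduced in Python 3.10. A small illustration, not taken from the package:

header_row = ["id", "name", "email"]
row = ["1", "Alice"]  # ragged CSV row: one trailing cell missing

# strict=False (the behavior readers.py opts into): truncate to the shorter input.
print(dict(zip(header_row, row, strict=False)))  # {'id': '1', 'name': 'Alice'}

# strict=True would instead raise ValueError for mismatched lengths.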
|