synapse-sdk 1.0.0b22__py3-none-any.whl → 1.0.0b23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synapse-sdk might be problematic.

synapse_sdk/plugins/categories/upload/actions/upload/__init__.py
@@ -0,0 +1,20 @@
+from .action import UploadAction
+from .enums import LOG_MESSAGES, LogCode, UploadStatus
+from .exceptions import ExcelParsingError, ExcelSecurityError
+from .models import UploadParams
+from .run import UploadRun
+from .utils import ExcelMetadataUtils, ExcelSecurityConfig, PathAwareJSONEncoder
+
+__all__ = [
+    'UploadAction',
+    'UploadRun',
+    'UploadParams',
+    'UploadStatus',
+    'LogCode',
+    'LOG_MESSAGES',
+    'ExcelSecurityError',
+    'ExcelParsingError',
+    'PathAwareJSONEncoder',
+    'ExcelSecurityConfig',
+    'ExcelMetadataUtils',
+]
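
Review note: the hunk above is the new package __init__, re-exporting the plugin's public surface. A minimal consumer sketch; the import path is inferred from the absolute import in action.py below, and the UploadParams field names are taken from the docstring example and the params.get() calls, so treat them as illustrative rather than a confirmed constructor signature:

    from synapse_sdk.plugins.categories.upload.actions.upload import UploadAction, UploadParams

    # Hypothetical usage: validate parameters up front, then run the action.
    params = UploadParams(path='/data/files', storage=1, data_collection=5)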
synapse_sdk/plugins/categories/upload/actions/upload/action.py
@@ -0,0 +1,623 @@
+import asyncio
+import os
+import shutil
+from datetime import datetime
+from io import BytesIO
+from pathlib import Path
+from typing import Any, Awaitable, Dict, List, Optional, TypeVar
+
+from openpyxl import load_workbook
+from openpyxl.utils.exceptions import InvalidFileException
+
+from synapse_sdk.clients.exceptions import ClientError
+from synapse_sdk.clients.utils import get_batched_list
+from synapse_sdk.clients.validators.collections import FileSpecificationValidator
+from synapse_sdk.plugins.categories.base import Action
+from synapse_sdk.plugins.categories.decorators import register_action
+from synapse_sdk.plugins.categories.upload.actions.upload.models import UploadParams
+from synapse_sdk.plugins.enums import PluginCategory, RunMethod
+from synapse_sdk.plugins.exceptions import ActionError
+from synapse_sdk.utils.storage import get_pathlib
+
+from .enums import LogCode, UploadStatus
+from .exceptions import ExcelParsingError, ExcelSecurityError
+from .run import UploadRun
+from .utils import ExcelMetadataUtils, ExcelSecurityConfig
+
+T = TypeVar('T')
+
+
+@register_action
+class UploadAction(Action):
+    """Main upload action for processing and uploading files to storage.
+
+    Handles file upload operations including validation, file discovery,
+    Excel metadata processing, and data unit generation. Supports both
+    synchronous and asynchronous upload processing with comprehensive
+    progress tracking and error handling.
+
+    Features:
+        - Recursive directory scanning
+        - Excel metadata validation and processing
+        - Batch processing for large file sets
+        - Type-based file organization
+        - Security validation for Excel files
+        - Progress tracking with detailed metrics
+        - Comprehensive error logging
+
+    Class Attributes:
+        name (str): Action identifier ('upload')
+        category (PluginCategory): UPLOAD category
+        method (RunMethod): JOB execution method
+        run_class (type): UploadRun for specialized logging
+        params_model (type): UploadParams for parameter validation
+        progress_categories (dict): Progress tracking configuration
+        metrics_categories (dict): Metrics collection configuration
+
+    Example:
+        >>> action = UploadAction(
+        ...     params={
+        ...         'name': 'Data Upload',
+        ...         'path': '/data/files',
+        ...         'storage': 1,
+        ...         'collection': 5
+        ...     },
+        ...     plugin_config=config
+        ... )
+        >>> result = action.run_action()
+    """
+
+    name = 'upload'
+    category = PluginCategory.UPLOAD
+    method = RunMethod.JOB
+    run_class = UploadRun
+    params_model = UploadParams
+    progress_categories = {
+        'analyze_collection': {
+            'proportion': 2,
+        },
+        'upload_data_files': {
+            'proportion': 38,
+        },
+        'generate_data_units': {
+            'proportion': 60,
+        },
+    }
+    metrics_categories = {
+        'data_files': {
+            'stand_by': 0,
+            'failed': 0,
+            'success': 0,
+        },
+        'data_units': {
+            'stand_by': 0,
+            'failed': 0,
+            'success': 0,
+        },
+    }
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.excel_config = ExcelSecurityConfig()
+        self.excel_utils = ExcelMetadataUtils(self.excel_config)
+
+    def get_uploader(self, path, file_specification, organized_files, params: Dict = {}):
+        """Get uploader from entrypoint."""
+        return self.entrypoint(
+            self.run, path, file_specification, organized_files, extra_params=params.get('extra_params')
+        )
+
+    def _discover_files_recursive(self, dir_path: Path) -> List[Path]:
+        return [file_path for file_path in dir_path.rglob('*') if file_path.is_file()]
+
+    def _discover_files_non_recursive(self, dir_path: Path) -> List[Path]:
+        return [file_path for file_path in dir_path.glob('*') if file_path.is_file()]
+
+    def _validate_excel_security(self, excel_path: Path) -> None:
+        file_size = excel_path.stat().st_size
+        if file_size > self.excel_config.MAX_FILE_SIZE_BYTES:
+            raise ExcelSecurityError(
+                f'Excel file too large: {file_size} bytes (max: {self.excel_config.MAX_FILE_SIZE_BYTES})'
+            )
+
+        estimated_memory = file_size * 3
+        if estimated_memory > self.excel_config.MAX_MEMORY_USAGE_BYTES:
+            raise ExcelSecurityError(
+                f'Excel file may consume too much memory: ~{estimated_memory} bytes '
+                f'(max: {self.excel_config.MAX_MEMORY_USAGE_BYTES})'
+            )
+
+    def _prepare_excel_file(self, excel_path: Path) -> BytesIO:
+        self._validate_excel_security(excel_path)
+        excel_bytes = excel_path.read_bytes()
+        return BytesIO(excel_bytes)
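
Review note: the two checks above cap the workbook by on-disk size and by a rough in-memory estimate (3x file size, presumably headroom for decompressed XLSX content). The limits live on ExcelSecurityConfig in .utils, which this diff does not include. A sketch of the shape the call sites assume; attribute names are taken from this file, but the values are purely illustrative:

    class ExcelSecurityConfig:
        # Names from the call sites in action.py; values are guesses, not the real defaults.
        MAX_FILE_SIZE_BYTES = 10 * 1024 * 1024
        MAX_MEMORY_USAGE_BYTES = 30 * 1024 * 1024
        MAX_COLUMNS = 100
        MAX_ROWS = 100_000
        MAX_COLUMN_NAME_LENGTH = 255
        MAX_METADATA_VALUE_LENGTH = 1000
        VALIDATION_CHECK_INTERVAL = 1000  # data rows between periodic re-validation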
+
+    def _process_excel_headers(self, headers: tuple) -> tuple:
+        if len(headers) < 2:
+            raise ExcelParsingError('Excel file must have at least 2 columns (file name and metadata)')
+        self._validate_excel_content(headers, 0)
+        return headers
+
+    def _process_excel_data_row(self, row: tuple, headers: tuple) -> Optional[Dict[str, Any]]:
+        if not row[0] or str(row[0]).strip() == '':
+            return None
+
+        file_name = str(row[0]).strip()
+        if not self.excel_utils.is_valid_filename_length(file_name):
+            self.run.log_message_with_code(LogCode.FILENAME_TOO_LONG, file_name[:50])
+            return None
+
+        file_metadata: Dict[str, Any] = {}
+        for i, value in enumerate(row[1:], start=1):
+            if value is not None and i < len(headers):
+                header_value = headers[i]
+                column_name = str(header_value).strip() if header_value is not None else f'column_{i}'
+
+                column_name = self.excel_utils.validate_and_truncate_string(
+                    column_name, self.excel_config.MAX_COLUMN_NAME_LENGTH
+                )
+                str_value = self.excel_utils.validate_and_truncate_string(
+                    str(value), self.excel_config.MAX_METADATA_VALUE_LENGTH
+                )
+                file_metadata[column_name] = str_value
+
+        return {file_name: file_metadata} if file_metadata else None
+
+    def _process_excel_worksheet(self, worksheet) -> Dict[str, Dict[str, Any]]:
+        if worksheet is None:
+            raise ExcelParsingError('Excel file has no active worksheet')
+
+        metadata_dict: Dict[str, Dict[str, Any]] = {}
+        headers: Optional[tuple] = None
+        data_row_count = 0
+        validation_interval = getattr(self.excel_config, 'VALIDATION_CHECK_INTERVAL', 1000)
+
+        for row_idx, row in enumerate(worksheet.iter_rows(values_only=True)):
+            if not row or all(cell is None or str(cell).strip() == '' for cell in row):
+                continue
+
+            if row_idx == 0:
+                headers = self._process_excel_headers(row)
+                continue
+
+            if headers is None:
+                raise ExcelParsingError('Excel file missing header row')
+
+            data_row_count += 1
+
+            if data_row_count % validation_interval == 0:
+                self._validate_excel_content(headers, data_row_count)
+
+            row_result = self._process_excel_data_row(row, headers)
+            if row_result:
+                metadata_dict.update(row_result)
+
+        self._validate_excel_content(headers or (), data_row_count)
+
+        return metadata_dict
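
Review note: the parser above treats the first non-empty row as headers, column A as the file stem, and every other non-empty cell as string metadata. A hypothetical meta.xlsx that would parse cleanly:

    file_name | camera | location
    img_0001  | front  | seoul
    img_0002  | rear   | busan

_process_excel_worksheet would return {'img_0001': {'camera': 'front', 'location': 'seoul'}, 'img_0002': {'camera': 'rear', 'location': 'busan'}}, keyed by stem so _organize_files can join it against discovered files.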
+
+    def _validate_excel_content(self, headers: tuple, row_count: int) -> None:
+        if len(headers) > self.excel_config.MAX_COLUMNS:
+            raise ExcelParsingError(f'Too many columns: {len(headers)} (max: {self.excel_config.MAX_COLUMNS})')
+
+        if row_count > self.excel_config.MAX_ROWS:
+            raise ExcelParsingError(f'Too many rows: {row_count} (max: {self.excel_config.MAX_ROWS})')
+
+    def _find_excel_metadata_file(self, pathlib_cwd: Path) -> Optional[Path]:
+        for extension in ['.xlsx', '.xls']:
+            excel_path = pathlib_cwd / f'meta{extension}'
+            if excel_path.exists() and excel_path.is_file():
+                return excel_path
+        return None
+
+    def _read_excel_metadata(self, pathlib_cwd: Path) -> Dict[str, Dict[str, Any]]:
+        excel_path = None
+
+        excel_metadata_path = self.params.get('excel_metadata_path')
+        if excel_metadata_path:
+            excel_path = pathlib_cwd / excel_metadata_path
+            if not excel_path.exists():
+                self.run.log_message_with_code(LogCode.EXCEL_FILE_NOT_FOUND_PATH)
+                return {}
+        else:
+            excel_path = self._find_excel_metadata_file(pathlib_cwd)
+            if not excel_path:
+                return {}
+
+        try:
+            self.run.log_message_with_code(LogCode.EXCEL_FILE_VALIDATION_STARTED)
+
+            excel_stream = self._prepare_excel_file(excel_path)
+
+            workbook = load_workbook(excel_stream, read_only=True, data_only=True)
+            try:
+                self.run.log_message_with_code(LogCode.EXCEL_WORKBOOK_LOADED)
+                return self._process_excel_worksheet(workbook.active)
+            finally:
+                workbook.close()
+
+        except ExcelSecurityError as e:
+            self.run.log_message_with_code(LogCode.EXCEL_SECURITY_VALIDATION_FAILED, str(e))
+            raise
+        except ExcelParsingError as e:
+            self.run.log_message_with_code(LogCode.EXCEL_PARSING_FAILED, str(e))
+            raise
+        except InvalidFileException as e:
+            self.run.log_message_with_code(LogCode.EXCEL_INVALID_FILE_FORMAT, str(e))
+            raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
+        except MemoryError:
+            self.run.log_message_with_code(LogCode.EXCEL_FILE_TOO_LARGE)
+            raise ExcelSecurityError('Excel file exceeds memory limits')
+        except (OSError, IOError) as e:
+            self.run.log_message_with_code(LogCode.EXCEL_FILE_ACCESS_ERROR, str(e))
+            raise ExcelParsingError(f'File access error: {str(e)}')
+        except Exception as e:
+            self.run.log_message_with_code(LogCode.EXCEL_UNEXPECTED_ERROR, str(e))
+            raise ExcelParsingError(f'Unexpected error: {str(e)}')
+
+    def start(self) -> Dict[str, Any]:
+        result: Dict[str, Any] = {}
+
+        storage_id = self.params.get('storage')
+        if storage_id is None:
+            raise ActionError('Storage parameter is required')
+        storage = self.client.get_storage(storage_id)
+
+        path = self.params.get('path')
+        if path is None:
+            raise ActionError('Path parameter is required')
+        pathlib_cwd = get_pathlib(storage, path)
+
+        excel_metadata: Dict[str, Dict[str, Any]] = {}
+        try:
+            excel_metadata = self._read_excel_metadata(pathlib_cwd)
+            if excel_metadata:
+                self.run.log_message_with_code(LogCode.EXCEL_METADATA_LOADED, len(excel_metadata))
+        except ExcelSecurityError as e:
+            self.run.log_message_with_code(LogCode.EXCEL_SECURITY_VIOLATION, str(e))
+            return result
+        except ExcelParsingError as e:
+            if self.params.get('excel_metadata_path'):
+                self.run.log_message_with_code(LogCode.EXCEL_PARSING_ERROR, str(e))
+                return result
+            else:
+                self.run.log_message_with_code(LogCode.EXCEL_PARSING_ERROR, str(e))
+                excel_metadata = {}
+
+        file_specification_template = self._analyze_collection()
+        organized_files = self._organize_files(pathlib_cwd, file_specification_template, excel_metadata)
+
+        uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files)
+
+        organized_files = uploader.handle_upload_files()
+
+        if not self._validate_organized_files(organized_files, file_specification_template):
+            self.run.log_message_with_code(LogCode.VALIDATION_FAILED)
+            raise ActionError('Upload is aborted due to validation errors.')
+
+        if not organized_files:
+            self.run.log_message_with_code(LogCode.NO_FILES_FOUND)
+            raise ActionError('Upload is aborted due to missing files.')
+
+        if self.params.get('use_async_upload', True):
+            uploaded_files = self.run_async(self._upload_files_async(organized_files, 10))
+        else:
+            uploaded_files = self._upload_files(organized_files)
+        result['uploaded_files_count'] = len(uploaded_files)
+
+        if not uploaded_files:
+            self.run.log_message_with_code(LogCode.NO_FILES_UPLOADED)
+            raise ActionError('Upload is aborted due to no uploaded files.')
+        generated_data_units = self._generate_data_units(
+            uploaded_files, self.params.get('creating_data_unit_batch_size', 1)
+        )
+        result['generated_data_units_count'] = len(generated_data_units)
+
+        if not generated_data_units:
+            self.run.log_message_with_code(LogCode.NO_DATA_UNITS_GENERATED)
+            raise ActionError('Upload is aborted due to no generated data units.')
+
+        self._cleanup_temp_directory()
+
+        self.run.log_message_with_code(LogCode.IMPORT_COMPLETED)
+        return result
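
Review note: start() is the pipeline driver: resolve storage and path, read optional Excel metadata, analyze the collection's file specification, organize and validate files, upload (async by default), create data units in batches, then clean up. One inconsistency worth fixing upstream: the class docstring example passes 'collection', but every params.get() call in this file reads 'data_collection'. For illustration, a params dict matching the keys actually read here (values hypothetical):

    params = {
        'storage': 1,                          # required
        'path': 'incoming/batch_01',           # required
        'data_collection': 5,                  # required for analysis and uploads
        'excel_metadata_path': 'meta.xlsx',    # optional; falls back to meta.xlsx/meta.xls discovery
        'use_async_upload': True,              # default True
        'creating_data_unit_batch_size': 100,  # default 1
        'is_recursive': True,                  # default True
        'max_file_size_mb': 50,                # chunked-upload threshold, default 50
    }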
+
+    def _analyze_collection(self) -> Dict[str, Any]:
+        self.run.set_progress(0, 2, category='analyze_collection')
+
+        collection_id = self.params.get('data_collection')
+        if collection_id is None:
+            raise ActionError('Data collection parameter is required')
+        self.run.set_progress(1, 2, category='analyze_collection')
+
+        collection = self.run.client.get_data_collection(collection_id)
+        self.run.set_progress(2, 2, category='analyze_collection')
+
+        return collection['file_specifications']
+
+    def _upload_files(self, organized_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        organized_files_count = len(organized_files)
+        self.run.set_progress(0, organized_files_count, category='upload_data_files')
+        self.run.log_message_with_code(LogCode.UPLOADING_DATA_FILES)
+
+        client = self.run.client
+        collection_id = self.params.get('data_collection')
+        if collection_id is None:
+            raise ActionError('Data collection parameter is required')
+        upload_result = []
+        current_progress = 0
+        success_count = 0
+        failed_count = 0
+
+        self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
+
+        for organized_file in organized_files:
+            try:
+                use_chunked_upload = self._requires_chunked_upload(organized_file)
+                uploaded_data_file = client.upload_data_file(organized_file, collection_id, use_chunked_upload)
+                self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
+                success_count += 1
+                upload_result.append(uploaded_data_file)
+            except Exception as e:
+                self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, str(e))
+                failed_count += 1
+
+            current_progress += 1
+            self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
+            self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')
+
+        self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
+
+        return upload_result
+
+    def run_async(self, coro: Awaitable[T]) -> T:
+        import concurrent.futures
+
+        def _run_in_thread():
+            return asyncio.run(coro)
+
+        try:
+            asyncio.get_running_loop()
+            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                future = executor.submit(_run_in_thread)
+                return future.result()
+        except RuntimeError:
+            return asyncio.run(coro)
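
Review note: run_async lets a synchronous caller block on a coroutine whether or not an event loop is already running; with a live loop, asyncio.run() would raise, so the coroutine is handed to a fresh loop on a worker thread. One subtlety: the except RuntimeError wraps the executor branch too, so a RuntimeError escaping the threaded run would retry asyncio.run() on an already-consumed coroutine. A narrower sketch of the same pattern, with only the loop detection guarded:

    import asyncio
    import concurrent.futures

    def run_coro_blocking(coro):
        try:
            asyncio.get_running_loop()  # raises RuntimeError if no loop is running
        except RuntimeError:
            return asyncio.run(coro)    # no loop: run directly on this thread
        # A loop is already running: block on a fresh loop in a worker thread.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()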
+
+    async def _upload_files_async(
+        self, organized_files: List[Dict[str, Any]], max_concurrent: int = 10
+    ) -> List[Dict[str, Any]]:
+        organized_files_count = len(organized_files)
+        self.run.set_progress(0, organized_files_count, category='upload_data_files')
+        self.run.log_message_with_code(LogCode.UPLOADING_DATA_FILES)
+
+        client = self.run.client
+        collection_id = self.params.get('data_collection')
+        if collection_id is None:
+            raise ActionError('Data collection parameter is required')
+        upload_result = []
+        success_count = 0
+        failed_count = 0
+
+        self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
+
+        semaphore = asyncio.Semaphore(max_concurrent)
+
+        async def upload_single_file(organized_file):
+            async with semaphore:
+                loop = asyncio.get_event_loop()
+                try:
+                    use_chunked_upload = self._requires_chunked_upload(organized_file)
+                    uploaded_data_file = await loop.run_in_executor(
+                        None, lambda: client.upload_data_file(organized_file, collection_id, use_chunked_upload)
+                    )
+                    self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
+                    return {'status': 'success', 'result': uploaded_data_file}
+                except ClientError as e:
+                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                    self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Client error: {str(e)}')
+                    return {'status': 'failed', 'error': str(e), 'error_type': 'client_error', 'retryable': True}
+                except (OSError, IOError) as e:
+                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                    self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'File system error: {str(e)}')
+                    return {'status': 'failed', 'error': str(e), 'error_type': 'file_error', 'retryable': False}
+                except MemoryError as e:
+                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                    self.run.log_message_with_code(
+                        LogCode.FILE_UPLOAD_FAILED, f'Memory error (file too large): {str(e)}'
+                    )
+                    return {'status': 'failed', 'error': str(e), 'error_type': 'memory_error', 'retryable': False}
+                except asyncio.TimeoutError as e:
+                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                    self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Upload timeout: {str(e)}')
+                    return {'status': 'failed', 'error': str(e), 'error_type': 'timeout_error', 'retryable': True}
+                except ValueError as e:
+                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                    self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Data validation error: {str(e)}')
+                    return {'status': 'failed', 'error': str(e), 'error_type': 'validation_error', 'retryable': False}
+                except Exception as e:
+                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                    self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Unexpected error: {str(e)}')
+                    return {'status': 'failed', 'error': str(e), 'error_type': 'unknown_error', 'retryable': False}
+
+        tasks = [upload_single_file(organized_file) for organized_file in organized_files]
+
+        current_progress = 0
+        for completed_task in asyncio.as_completed(tasks):
+            result = await completed_task
+            current_progress += 1
+
+            if result['status'] == 'success':
+                success_count += 1
+                upload_result.append(result['result'])
+            else:
+                failed_count += 1
+
+            self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
+            self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')
+
+        self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
+
+        return upload_result
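
Review note: concurrency here is bounded by an asyncio.Semaphore, the blocking client call is off-loaded to the default thread pool, and asyncio.as_completed drives progress in completion order. Two observations: asyncio.get_running_loop() is the recommended call inside a coroutine (get_event_loop() is on a deprecation path), and the per-error 'retryable' flags are returned but nothing consumes them yet. The core pattern, distilled into a standalone sketch (blocking_upload is a placeholder for the client call):

    import asyncio

    async def upload_all(items, blocking_upload, max_concurrent=10):
        sem = asyncio.Semaphore(max_concurrent)  # at most N uploads in flight

        async def one(item):
            async with sem:
                loop = asyncio.get_running_loop()
                # Run the blocking call on the default executor thread pool.
                return await loop.run_in_executor(None, blocking_upload, item)

        results = []
        for fut in asyncio.as_completed([one(i) for i in items]):
            results.append(await fut)  # completion order, not submission order
        return results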
+
+    def _generate_data_units(self, uploaded_files: List[Dict[str, Any]], batch_size: int) -> List[Dict[str, Any]]:
+        upload_result_count = len(uploaded_files)
+        self.run.set_progress(0, upload_result_count, category='generate_data_units')
+        self.run.log_message_with_code(LogCode.GENERATING_DATA_UNITS)
+
+        client = self.run.client
+        generated_data_units = []
+        current_progress = 0
+        success_count = 0
+        failed_count = 0
+
+        batches = get_batched_list(uploaded_files, batch_size)
+        batches_count = len(batches)
+
+        self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')
+
+        for batch in batches:
+            try:
+                created_data_units = client.create_data_units(batch)
+                success_count += len(created_data_units)
+                generated_data_units.append(created_data_units)
+                for created_data_unit in created_data_units:
+                    self.run.log_data_unit(
+                        created_data_unit['id'], UploadStatus.SUCCESS, data_unit_meta=created_data_unit.get('meta')
+                    )
+            except Exception as e:
+                failed_count += len(batch)
+                self.run.log_message_with_code(LogCode.DATA_UNIT_BATCH_FAILED, str(e))
+                for _ in batch:
+                    self.run.log_data_unit(None, UploadStatus.FAILED, data_unit_meta=None)
+
+            current_progress += 1
+            self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')
+            self.run.set_progress(current_progress, batches_count, category='generate_data_units')
+
+        self.run.set_progress(upload_result_count, upload_result_count, category='generate_data_units')
+
+        return sum(generated_data_units, [])
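
Review note: with batch_size from creating_data_unit_batch_size (default 1), 250 uploaded files at batch_size=100 become three create_data_units calls of 100, 100, and 50. Two small points: the in-loop progress reports against batches_count while the initial and final calls use upload_result_count, so the denominator flips mid-run; and sum(list_of_lists, []) re-copies the accumulator on every batch, which grows quadratic for many batches. An equivalent linear-time flatten:

    import itertools

    flat = list(itertools.chain.from_iterable(generated_data_units))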
+
+    def _validate_organized_files(
+        self, organized_files: List[Dict[str, Any]], file_specification_template: Dict[str, Any]
+    ) -> bool:
+        validator = FileSpecificationValidator(file_specification_template, organized_files)
+        return validator.validate()
+
+    def _organize_files(
+        self,
+        directory: Path,
+        file_specification: List[Dict[str, Any]],
+        excel_metadata: Optional[Dict[str, Dict[str, Any]]] = None,
+    ) -> List[Dict[str, Any]]:
+        organized_files: List[Dict[str, Any]] = []
+
+        type_dirs: Dict[str, Path] = {}
+
+        for spec in file_specification:
+            spec_name = spec['name']
+            spec_dir = directory / spec_name
+            if spec_dir.exists() and spec_dir.is_dir():
+                type_dirs[spec_name] = spec_dir
+
+        if type_dirs:
+            self.run.log_message_with_code(LogCode.TYPE_DIRECTORIES_FOUND, list(type_dirs.keys()))
+
+        if not type_dirs:
+            self.run.log_message_with_code(LogCode.NO_TYPE_DIRECTORIES)
+            return organized_files
+
+        self.run.log_message_with_code(LogCode.TYPE_STRUCTURE_DETECTED)
+        self.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)
+
+        dataset_files = {}
+        required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]
+
+        is_recursive = self.params.get('is_recursive', True)
+
+        for spec_name, dir_path in type_dirs.items():
+            if is_recursive:
+                files_list = self._discover_files_recursive(dir_path)
+            else:
+                files_list = self._discover_files_non_recursive(dir_path)
+
+            for file_path in files_list:
+                file_name = file_path.stem
+
+                if file_name not in dataset_files:
+                    dataset_files[file_name] = {}
+
+                if spec_name not in dataset_files[file_name]:
+                    dataset_files[file_name][spec_name] = file_path
+                else:
+                    existing_file = dataset_files[file_name][spec_name]
+                    if file_path.stat().st_mtime > existing_file.stat().st_mtime:
+                        dataset_files[file_name][spec_name] = file_path
+
+        if not dataset_files:
+            self.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
+            return organized_files
+
+        self.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(dataset_files))
+
+        for file_name, files_dict in sorted(dataset_files.items()):
+            if all(req in files_dict for req in required_specs):
+                file_extensions = {}
+                for file_path in files_dict.values():
+                    ext = file_path.suffix.lower()
+                    if ext:
+                        file_extensions[ext] = file_extensions.get(ext, 0) + 1
+
+                origin_file_extension = max(file_extensions.items(), key=lambda x: x[1])[0] if file_extensions else ''
+
+                meta_data: Dict[str, Any] = {
+                    'origin_file_stem': file_name,
+                    'origin_file_extension': origin_file_extension,
+                    'created_at': datetime.now().isoformat(),
+                }
+
+                if excel_metadata and file_name in excel_metadata:
+                    meta_data.update(excel_metadata[file_name])
+
+                organized_files.append({'files': files_dict, 'meta': meta_data})
+            else:
+                missing = [req for req in required_specs if req not in files_dict]
+                self.run.log_message_with_code(LogCode.MISSING_REQUIRED_FILES, file_name, ', '.join(missing))
+
+        return organized_files
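
Review note: _organize_files expects one subdirectory per file-specification name and pairs files across them by stem, keeping the newest file on stem collisions. A hypothetical layout for a collection with 'image' (required) and 'label' specs:

    /data/files/
        image/img_0001.jpg
        image/img_0002.jpg
        label/img_0001.json
        label/img_0002.json

Each stem then yields roughly this entry (illustrative values):

    {
        'files': {'image': Path('image/img_0001.jpg'), 'label': Path('label/img_0001.json')},
        'meta': {
            'origin_file_stem': 'img_0001',
            'origin_file_extension': '.jpg',  # most frequent extension, ties broken by first seen
            'created_at': '...',              # datetime.now().isoformat()
            # plus any Excel metadata recorded under 'img_0001'
        },
    }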
+
+    def _get_file_size_mb(self, file_path: Path) -> float:
+        return file_path.stat().st_size / (1024 * 1024)
+
+    def _requires_chunked_upload(self, organized_file: Dict[str, Any]) -> bool:
+        max_file_size_mb = self.params.get('max_file_size_mb', 50)
+        for file_path in organized_file.get('files', {}).values():
+            if isinstance(file_path, Path) and self._get_file_size_mb(file_path) > max_file_size_mb:
+                return True
+        return False
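
Review note: a single oversized file switches the whole data unit to chunked mode. A worked example against the default threshold (file names hypothetical):

    # With the default max_file_size_mb = 50:
    #   image/img_0001.jpg   (2 MB)   -> regular upload
    #   video/clip_0001.mp4  (120 MB) -> chunked upload for this data unit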
+
+    def _cleanup_temp_directory(self, temp_path: Optional[Path] = None) -> None:
+        if temp_path is None:
+            try:
+                temp_path = Path(os.getcwd()) / 'temp'
+            except (FileNotFoundError, OSError):
+                return
+
+        if not temp_path.exists():
+            return
+
+        shutil.rmtree(temp_path, ignore_errors=True)
+        self.run.log_message(f'Cleaned up temporary directory: {temp_path}')
+
+    def _update_metrics(self, total_count: int, success_count: int, failed_count: int, category: str):
+        if not self.run:
+            raise ValueError('Run instance not properly initialized')
+
+        assert isinstance(self.run, UploadRun)
+
+        metrics = self.run.MetricsRecord(
+            stand_by=total_count - success_count - failed_count, failed=failed_count, success=success_count
+        )
+        self.run.log_metrics(metrics, category)
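
Review note: _update_metrics derives the stand_by count rather than tracking it, matching the stand_by/failed/success keys declared in metrics_categories. For example, 10 files with 6 successes and 1 failure so far gives stand_by = 10 - 6 - 1 = 3, i.e. MetricsRecord(stand_by=3, failed=1, success=6).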