synapse-sdk 1.0.0b22-py3-none-any.whl → 1.0.0b23-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registry.

Potentially problematic release.


This version of synapse-sdk might be problematic.

@@ -1,1368 +0,0 @@
1
- import asyncio
2
- import json
3
- import os
4
- import shutil
5
- from datetime import datetime
6
- from enum import Enum
7
- from io import BytesIO
8
- from pathlib import Path
9
- from typing import Annotated, Any, Awaitable, Dict, List, Optional, TypeVar
10
-
11
- from openpyxl import load_workbook
12
- from openpyxl.utils.exceptions import InvalidFileException
13
- from pydantic import AfterValidator, BaseModel, field_validator
14
- from pydantic_core import PydanticCustomError
15
-
16
- from synapse_sdk.clients.exceptions import ClientError
17
- from synapse_sdk.clients.utils import get_batched_list
18
- from synapse_sdk.clients.validators.collections import FileSpecificationValidator
19
- from synapse_sdk.i18n import gettext as _
20
- from synapse_sdk.plugins.categories.base import Action
21
- from synapse_sdk.plugins.categories.decorators import register_action
22
- from synapse_sdk.plugins.enums import PluginCategory, RunMethod
23
- from synapse_sdk.plugins.exceptions import ActionError
24
- from synapse_sdk.plugins.models import Run
25
- from synapse_sdk.shared.enums import Context
26
- from synapse_sdk.utils.pydantic.validators import non_blank
27
- from synapse_sdk.utils.storage import get_pathlib
28
-
29
- # Type variable for generic async return type
30
- T = TypeVar('T')
31
-
32
-
33
- class PathAwareJSONEncoder(json.JSONEncoder):
34
- """Custom JSON encoder that handles Path-like objects."""
35
-
36
- def default(self, obj):
37
- if hasattr(obj, '__fspath__') or hasattr(obj, 'as_posix'):
38
- # Handle Path-like objects (including UPath, SFTPPath, pathlib.Path, etc.)
39
- return str(obj)
40
- elif hasattr(obj, 'isoformat'):
41
- # Handle datetime objects
42
- return obj.isoformat()
43
- # Let the base class handle other types
44
- return super().default(obj)
45
-
46
-
47
- class UploadStatus(str, Enum):
48
- SUCCESS = 'success'
49
- FAILED = 'failed'
50
-
51
-
52
- class UploadRun(Run):
53
- class UploadEventLog(BaseModel):
54
- """Upload event log model."""
55
-
56
- info: Optional[str] = None
57
- status: Context
58
- created: str
59
-
60
- class DataFileLog(BaseModel):
61
- """Data file log model."""
62
-
63
- data_file_info: str | None
64
- status: UploadStatus
65
- created: str
66
-
67
- class DataUnitLog(BaseModel):
68
- """Data unit log model."""
69
-
70
- data_unit_id: int | None
71
- status: UploadStatus
72
- created: str
73
- data_unit_meta: dict | None
74
-
75
- class TaskLog(BaseModel):
76
- """Task log model."""
77
-
78
- task_id: int | None
79
- status: UploadStatus
80
- created: str
81
-
82
- class MetricsRecord(BaseModel):
83
- """Metrics record model."""
84
-
85
- stand_by: int
86
- failed: int
87
- success: int
88
-
89
- LOG_MESSAGES = {
90
- # Validation errors - show in both log_message and EventLog
91
- 'STORAGE_VALIDATION_FAILED': {
92
- 'message': 'Storage validation failed.',
93
- 'level': Context.DANGER,
94
- },
95
- 'COLLECTION_VALIDATION_FAILED': {
96
- 'message': 'Collection validation failed.',
97
- 'level': Context.DANGER,
98
- },
99
- 'PROJECT_VALIDATION_FAILED': {
100
- 'message': 'Project validation failed.',
101
- 'level': Context.DANGER,
102
- },
103
- 'VALIDATION_FAILED': {
104
- 'message': 'Validation failed.',
105
- 'level': Context.DANGER,
106
- },
107
- 'NO_FILES_FOUND': {
108
- 'message': 'Files not found on the path.',
109
- 'level': Context.WARNING,
110
- },
111
- 'NO_FILES_UPLOADED': {
112
- 'message': 'No files were uploaded.',
113
- 'level': Context.WARNING,
114
- },
115
- 'NO_DATA_UNITS_GENERATED': {
116
- 'message': 'No data units were generated.',
117
- 'level': Context.WARNING,
118
- },
119
- 'NO_TYPE_DIRECTORIES': {
120
- 'message': 'No type-based directory structure found.',
121
- 'level': Context.INFO,
122
- },
123
- 'EXCEL_SECURITY_VIOLATION': {
124
- 'message': 'Excel security validation failed: {}',
125
- 'level': Context.DANGER,
126
- },
127
- 'EXCEL_PARSING_ERROR': {
128
- 'message': 'Excel parsing failed: {}',
129
- 'level': Context.DANGER,
130
- },
131
- 'EXCEL_METADATA_LOADED': {
132
- 'message': 'Excel metadata loaded for {} files',
133
- 'level': None,
134
- },
135
- 'UPLOADING_DATA_FILES': {
136
- 'message': 'Uploading data files...',
137
- 'level': None,
138
- },
139
- 'GENERATING_DATA_UNITS': {
140
- 'message': 'Generating data units...',
141
- 'level': None,
142
- },
143
- 'IMPORT_COMPLETED': {
144
- 'message': 'Import completed.',
145
- 'level': None,
146
- },
147
- 'TYPE_DIRECTORIES_FOUND': {
148
- 'message': 'Found type directories: {}',
149
- 'level': None,
150
- },
151
- 'TYPE_STRUCTURE_DETECTED': {
152
- 'message': 'Detected type-based directory structure',
153
- 'level': None,
154
- },
155
- 'FILES_DISCOVERED': {
156
- 'message': 'Discovered {} files',
157
- 'level': None,
158
- },
159
- 'NO_FILES_FOUND_WARNING': {
160
- 'message': 'No files found.',
161
- 'level': Context.WARNING,
162
- },
163
- 'FILE_UPLOAD_FAILED': {
164
- 'message': 'Failed to upload file: {}',
165
- 'level': Context.DANGER,
166
- },
167
- 'DATA_UNIT_BATCH_FAILED': {
168
- 'message': 'Failed to create data units batch: {}',
169
- 'level': Context.DANGER,
170
- },
171
- 'FILENAME_TOO_LONG': {
172
- 'message': 'Skipping file with overly long name: {}...',
173
- 'level': Context.WARNING,
174
- },
175
- 'MISSING_REQUIRED_FILES': {
176
- 'message': '{} missing required files: {}',
177
- 'level': Context.WARNING,
178
- },
179
- 'EXCEL_FILE_NOT_FOUND': {
180
- 'message': 'Excel metadata file not found: {}',
181
- 'level': Context.WARNING,
182
- },
183
- # Debug information - only for EventLog
184
- 'EXCEL_FILE_VALIDATION_STARTED': {
185
- 'message': 'Excel file validation started',
186
- 'level': Context.INFO,
187
- },
188
- 'EXCEL_WORKBOOK_LOADED': {
189
- 'message': 'Excel workbook loaded successfully',
190
- 'level': Context.INFO,
191
- },
192
- 'FILE_ORGANIZATION_STARTED': {
193
- 'message': 'File organization started',
194
- 'level': Context.INFO,
195
- },
196
- 'BATCH_PROCESSING_STARTED': {
197
- 'message': 'Batch processing started: {} batches of {} items each',
198
- 'level': Context.INFO,
199
- },
200
- 'EXCEL_SECURITY_VALIDATION_STARTED': {
201
- 'message': 'Excel security validation started for file size: {} bytes',
202
- 'level': Context.INFO,
203
- },
204
- 'EXCEL_MEMORY_ESTIMATION': {
205
- 'message': 'Excel memory estimation: {} bytes (file) * 3 = {} bytes (estimated)',
206
- 'level': Context.INFO,
207
- },
208
- 'EXCEL_FILE_NOT_FOUND_PATH': {
209
- 'message': 'Excel metadata file not found',
210
- 'level': Context.WARNING,
211
- },
212
- 'EXCEL_SECURITY_VALIDATION_FAILED': {
213
- 'message': 'Excel security validation failed: {}',
214
- 'level': Context.DANGER,
215
- },
216
- 'EXCEL_PARSING_FAILED': {
217
- 'message': 'Excel parsing failed: {}',
218
- 'level': Context.DANGER,
219
- },
220
- 'EXCEL_INVALID_FILE_FORMAT': {
221
- 'message': 'Invalid Excel file format: {}',
222
- 'level': Context.DANGER,
223
- },
224
- 'EXCEL_FILE_TOO_LARGE': {
225
- 'message': 'Excel file too large to process (memory limit exceeded)',
226
- 'level': Context.DANGER,
227
- },
228
- 'EXCEL_FILE_ACCESS_ERROR': {
229
- 'message': 'File access error reading excel metadata: {}',
230
- 'level': Context.DANGER,
231
- },
232
- 'EXCEL_UNEXPECTED_ERROR': {
233
- 'message': 'Unexpected error reading excel metadata: {}',
234
- 'level': Context.DANGER,
235
- },
236
- }
237
-
238
- def log_message_with_code(self, code: str, *args, level: Optional[Context] = None):
239
- """Unified logging method that handles both log_message and EventLog based on configuration."""
240
- if code not in self.LOG_MESSAGES:
241
- self.log_message(f'Unknown log code: {code}')
242
- self.log_upload_event('UNKNOWN_LOG_CODE', code)
243
- return
244
-
245
- log_config = self.LOG_MESSAGES[code]
246
- message = log_config['message'].format(*args) if args else log_config['message']
247
- log_level = level or log_config['level'] or Context.INFO
248
-
249
- # Log to message if configured
250
- if log_level == Context.INFO.value:
251
- self.log_message(message, context=log_level.value)
252
- else:
253
- self.log_upload_event(code, *args, level)
254
-
255
- def log_upload_event(self, code: str, *args, level: Optional[Context] = None):
256
- """Log upload event using predefined code."""
257
- if code not in self.LOG_MESSAGES:
258
- now = datetime.now().isoformat()
259
- self.log(
260
- 'upload_event',
261
- self.UploadEventLog(info=f'Unknown log code: {code}', status=Context.DANGER, created=now).model_dump(),
262
- )
263
- return
264
-
265
- log_config = self.LOG_MESSAGES[code]
266
- message = log_config['message'].format(*args) if args else log_config['message']
267
- log_level = level or log_config['level'] or Context.INFO
268
-
269
- now = datetime.now().isoformat()
270
- self.log(
271
- 'upload_event',
272
- self.UploadEventLog(info=message, status=log_level, created=now).model_dump(),
273
- )
274
-
275
- def log_data_file(self, data_file_info: dict, status: UploadStatus):
276
- """Upload data_file log.
277
-
278
- Args:
279
- data_file_info (dict): The json info of the data file.
280
- status (UploadStatus): The status of the data file.
281
- """
282
- now = datetime.now().isoformat()
283
- # Use custom JSON encoder to handle Path-like objects
284
- data_file_info_str = json.dumps(data_file_info, ensure_ascii=False, cls=PathAwareJSONEncoder)
285
- self.log(
286
- 'upload_data_file',
287
- self.DataFileLog(data_file_info=data_file_info_str, status=status.value, created=now).model_dump(),
288
- )
289
-
290
- def log_data_unit(self, data_unit_id: int, status: UploadStatus, data_unit_meta: dict | None = None):
291
- """Upload data_unit log.
292
-
293
- Args:
294
- data_unit_id (int): The ID of the data unit.
295
- status (UploadStatus): The status of the data unit.
296
- data_unit_meta (dict | None): The metadata of the data unit.
297
- """
298
- now = datetime.now().isoformat()
299
- self.log(
300
- 'upload_data_unit',
301
- self.DataUnitLog(
302
- data_unit_id=data_unit_id, status=status.value, created=now, data_unit_meta=data_unit_meta
303
- ).model_dump(),
304
- )
305
-
306
- def log_task(self, task_id: int, status: UploadStatus):
307
- """Upload task log.
308
-
309
- Args:
310
- task_id (int): The ID of the task.
311
- status (UploadStatus): The status of the task.
312
- """
313
- now = datetime.now().isoformat()
314
- self.log('upload_task', self.TaskLog(task_id=task_id, status=status.value, created=now).model_dump())
315
-
316
- def log_metrics(self, record: MetricsRecord, category: str):
317
- """Log upload metrics.
318
- Args:
319
- record (MetricsRecord): The metrics record to log.
320
- category (str): The category of the metrics.
321
- """
322
- record = self.MetricsRecord.model_validate(record)
323
- self.set_metrics(value=record.model_dump(), category=category)
324
-
325
-
326
- class ExcelSecurityError(Exception):
327
- """Custom exception for Excel security validation errors."""
328
-
329
- pass
330
-
331
-
332
- class ExcelParsingError(Exception):
333
- """Custom exception for Excel parsing errors."""
334
-
335
- pass
336
-
337
-
338
- class ExcelSecurityConfig:
339
- """Configuration class for Excel security settings."""
340
-
341
- def __init__(self):
342
- # File size limits
343
- self.MAX_FILE_SIZE_MB = int(os.getenv('EXCEL_MAX_FILE_SIZE_MB', '10'))
344
- self.MAX_FILE_SIZE_BYTES = self.MAX_FILE_SIZE_MB * 1024 * 1024
345
-
346
- # Memory limits
347
- self.MAX_MEMORY_USAGE_MB = int(os.getenv('EXCEL_MAX_MEMORY_MB', '30'))
348
- self.MAX_MEMORY_USAGE_BYTES = self.MAX_MEMORY_USAGE_MB * 1024 * 1024
349
-
350
- # Content limits
351
- self.MAX_ROWS = int(os.getenv('EXCEL_MAX_ROWS', '10000'))
352
- self.MAX_COLUMNS = int(os.getenv('EXCEL_MAX_COLUMNS', '50'))
353
-
354
- # String length limits
355
- self.MAX_FILENAME_LENGTH = int(os.getenv('EXCEL_MAX_FILENAME_LENGTH', '255'))
356
- self.MAX_COLUMN_NAME_LENGTH = int(os.getenv('EXCEL_MAX_COLUMN_NAME_LENGTH', '100'))
357
- self.MAX_METADATA_VALUE_LENGTH = int(os.getenv('EXCEL_MAX_METADATA_VALUE_LENGTH', '1000'))
358
-
359
-
360
- class ExcelMetadataUtils:
361
- """Utility class for Excel metadata processing with shared validation logic."""
362
-
363
- def __init__(self, config: ExcelSecurityConfig):
364
- self.config = config
365
-
366
- def validate_and_truncate_string(self, value: str, max_length: int) -> str:
367
- """Validate and truncate string to specified maximum length.
368
-
369
- Args:
370
- value: String value to validate and truncate
371
- max_length: Maximum allowed length
372
-
373
- Returns:
374
- str: Validated and potentially truncated string
375
- """
376
- if not isinstance(value, str):
377
- value = str(value)
378
-
379
- value = value.strip()
380
-
381
- if len(value) > max_length:
382
- return value[:max_length]
383
-
384
- return value
385
-
386
- def is_valid_filename_length(self, filename: str) -> bool:
387
- """Check if filename length is within acceptable limits.
388
-
389
- Args:
390
- filename: Filename to check
391
-
392
- Returns:
393
- bool: True if filename length is acceptable
394
- """
395
- return len(filename.strip()) <= self.config.MAX_FILENAME_LENGTH
396
-
397
-
398
- class UploadParams(BaseModel):
399
- """Upload action parameters.
400
-
401
- This class defines all configuration parameters for the upload action, including
402
- advanced asynchronous upload capabilities for improved performance.
403
-
404
- Args:
405
- name (str): The name of the action.
406
- description (str | None): The description of the action.
407
- checkpoint (int | None): The checkpoint of the action.
408
- path (str): The path of the action.
409
- storage (int): The storage of the action.
410
- collection (int): The collection of the action.
411
- project (int | None): The project of the action.
412
- excel_metadata_path (str | None): Path to excel file containing metadata.
413
- Defaults to 'meta.xlsx' or 'meta.xls' in the path directory.
414
- is_recursive (bool): Enable recursive file discovery in subdirectories. Defaults to False.
415
- use_async_upload (bool): Enable asynchronous upload data file processing for improved performance.
416
- Defaults to True.
417
- max_file_size_mb (int): The maximum file size not using chunked_upload in MB. Defaults to 50MB.
418
- creating_data_unit_batch_size (int): The batch size for creating data units. Defaults to 100.
419
- extra_params (dict | None): Extra parameters for the action.
420
- Example: {"include_metadata": True, "compression": "gzip"}
421
-
422
- Note:
423
- Async upload requires plugin developers to implement handle_upload_files_async()
424
- method for maximum benefit. Default implementation provides compatibility
425
- but limited performance improvement.
426
- """
427
-
428
- name: Annotated[str, AfterValidator(non_blank)]
429
- description: str | None
430
- path: str
431
- storage: int
432
- collection: int
433
- project: int | None
434
- excel_metadata_path: str | None = None
435
- is_recursive: bool = True
436
- max_file_size_mb: int = 50
437
- creating_data_unit_batch_size: int = 1
438
- use_async_upload: bool = True
439
- extra_params: dict | None = None
440
-
441
- @field_validator('storage', mode='before')
442
- @classmethod
443
- def check_storage_exists(cls, value: str, info) -> str:
444
- """Validate synapse-backend storage exists.
445
-
446
- TODO: Need to define validation method naming convention.
447
- TODO: Need to make validation method reusable.
448
- """
449
- action = info.context['action']
450
- client = action.client
451
- try:
452
- client.get_storage(value)
453
- except ClientError:
454
- raise PydanticCustomError('client_error', _('Error occurred while checking storage exists.'))
455
- return value
456
-
457
- @field_validator('collection', mode='before')
458
- @classmethod
459
- def check_collection_exists(cls, value: str, info) -> str:
460
- """Validate synapse-backend collection exists."""
461
- action = info.context['action']
462
- client = action.client
463
- try:
464
- client.get_data_collection(value)
465
- except ClientError:
466
- raise PydanticCustomError('client_error', _('Error occurred while checking collection exists.'))
467
- return value
468
-
469
- @field_validator('project', mode='before')
470
- @classmethod
471
- def check_project_exists(cls, value: str, info) -> str:
472
- """Validate synapse-backend project exists."""
473
- if not value:
474
- return value
475
-
476
- action = info.context['action']
477
- client = action.client
478
- try:
479
- client.get_project(value)
480
- except ClientError:
481
- raise PydanticCustomError('client_error', _('Error occurred while checking project exists.'))
482
- return value
483
-
484
- @field_validator('excel_metadata_path', mode='before')
485
- @classmethod
486
- def check_excel_metadata_path(cls, value: str, info) -> str:
487
- """Validate excel metadata file exists and is secure if provided.
488
-
489
- This validator performs comprehensive security checks including:
490
- - File existence and format validation
491
- - File size limits (max 10MB)
492
- - Basic security checks for file content
493
-
494
- Args:
495
- value: The excel file path to validate
496
- info: Validation context information
497
-
498
- Returns:
499
- str: The validated file path
500
-
501
- Raises:
502
- PydanticCustomError: If validation fails
503
- """
504
- if not value:
505
- return value
506
-
507
- excel_path = Path(value)
508
-
509
- # Check file existence
510
- if not excel_path.exists():
511
- raise PydanticCustomError('file_not_found', _('Excel metadata file not found.'))
512
-
513
- # Check file extension
514
- if excel_path.suffix.lower() not in ['.xlsx', '.xls']:
515
- raise PydanticCustomError('invalid_file_type', _('Excel metadata file must be .xlsx or .xls format.'))
516
-
517
- # Security check: file size limit
518
- file_size = excel_path.stat().st_size
519
- excel_config = ExcelSecurityConfig()
520
- if file_size > excel_config.MAX_FILE_SIZE_BYTES:
521
- raise PydanticCustomError(
522
- 'file_too_large',
523
- _('Excel metadata file is too large. Maximum size is {}MB.').format(excel_config.MAX_FILE_SIZE_MB),
524
- )
525
-
526
- # Basic security check: ensure file is readable and not corrupted
527
- try:
528
- with open(excel_path, 'rb') as f:
529
- # Read first few bytes to check if it's a valid Excel file
530
- header = f.read(8)
531
- if not header:
532
- raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be empty.'))
533
-
534
- # Check for valid Excel file signatures
535
- if excel_path.suffix.lower() == '.xlsx':
536
- # XLSX files start with PK (ZIP signature)
537
- if not header.startswith(b'PK'):
538
- raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be corrupted.'))
539
- elif excel_path.suffix.lower() == '.xls':
540
- # XLS files have specific OLE signatures
541
- if not (header.startswith(b'\xd0\xcf\x11\xe0') or header.startswith(b'\x09\x08')):
542
- raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be corrupted.'))
543
-
544
- except (OSError, IOError):
545
- raise PydanticCustomError('file_access_error', _('Cannot access Excel metadata file.'))
546
-
547
- return value
548
-
549
-
550
- @register_action
551
- class UploadAction(Action):
552
- """Upload action class.
553
-
554
- Attrs:
555
- name (str): The name of the action.
556
- category (PluginCategory): The category of the action.
557
- method (RunMethod): The method to run of the action.
558
-
559
- Progress Categories:
560
- analyze_collection: The progress category for the analyze collection process.
561
- data_file_upload: The progress category for the upload process.
562
- generate_data_units: The progress category for the generate data units process.
563
-
564
- Metrics Categories:
565
- data_file: The metrics category for the data file.
566
- data_unit: The metrics category for the data unit.
567
- """
568
-
569
- name = 'upload'
570
- category = PluginCategory.UPLOAD
571
- method = RunMethod.JOB
572
- run_class = UploadRun
573
- progress_categories = {
574
- 'analyze_collection': {
575
- 'proportion': 2,
576
- },
577
- 'upload_data_files': {
578
- 'proportion': 38,
579
- },
580
- 'generate_data_units': {
581
- 'proportion': 60,
582
- },
583
- }
584
- metrics_categories = {
585
- 'data_files': {
586
- 'stand_by': 0,
587
- 'failed': 0,
588
- 'success': 0,
589
- },
590
- 'data_units': {
591
- 'stand_by': 0,
592
- 'failed': 0,
593
- 'success': 0,
594
- },
595
- }
596
-
597
- def __init__(self, *args, **kwargs):
598
- super().__init__(*args, **kwargs)
599
- self.excel_config = ExcelSecurityConfig()
600
- self.excel_utils = ExcelMetadataUtils(self.excel_config)
601
-
602
- def get_uploader(self, path, file_specification, organized_files, params: Dict = None):
603
- """Get uploader from entrypoint."""
604
- return self.entrypoint(
605
- self.run, path, file_specification, organized_files, extra_params=params.get('extra_params')
606
- )
607
-
608
- def _discover_files_recursive(self, dir_path: Path) -> List[Path]:
609
- """Discover files recursively in a directory."""
610
- return [file_path for file_path in dir_path.rglob('*') if file_path.is_file()]
611
-
612
- def _discover_files_non_recursive(self, dir_path: Path) -> List[Path]:
613
- """Discover files in a directory (non-recursive)."""
614
- return [file_path for file_path in dir_path.glob('*') if file_path.is_file()]
615
-
616
- def _validate_excel_security(self, excel_path: Path) -> None:
617
- """Validate Excel file security constraints.
618
-
619
- Performs comprehensive security validation including:
620
- - File size limits
621
- - Memory usage constraints
622
- - Basic malicious content detection
623
-
624
- Args:
625
- excel_path: Path to the Excel file to validate
626
-
627
- Raises:
628
- ExcelSecurityError: If security validation fails
629
- """
630
- # File size check (already done in validator, but double-check)
631
- file_size = excel_path.stat().st_size
632
- if file_size > self.excel_config.MAX_FILE_SIZE_BYTES:
633
- raise ExcelSecurityError(
634
- f'Excel file too large: {file_size} bytes (max: {self.excel_config.MAX_FILE_SIZE_BYTES})'
635
- )
636
-
637
- # Memory usage estimation (rough estimate: file_size * 3 for processing)
638
- estimated_memory = file_size * 3
639
- if estimated_memory > self.excel_config.MAX_MEMORY_USAGE_BYTES:
640
- raise ExcelSecurityError(
641
- f'Excel file may consume too much memory: ~{estimated_memory} bytes '
642
- f'(max: {self.excel_config.MAX_MEMORY_USAGE_BYTES})'
643
- )
644
-
645
- def _prepare_excel_file(self, excel_path: Path) -> BytesIO:
646
- """Prepare Excel file for reading with security validation.
647
-
648
- Args:
649
- excel_path: Path to the Excel file
650
-
651
- Returns:
652
- BytesIO: Excel file stream ready for reading
653
-
654
- Raises:
655
- ExcelSecurityError: If security validation fails
656
- """
657
- self._validate_excel_security(excel_path)
658
- excel_bytes = excel_path.read_bytes()
659
- return BytesIO(excel_bytes)
660
-
661
- def _process_excel_headers(self, headers: tuple) -> tuple:
662
- """Process and validate Excel headers.
663
-
664
- Args:
665
- headers: Raw header tuple from Excel
666
-
667
- Returns:
668
- tuple: Validated headers
669
-
670
- Raises:
671
- ExcelParsingError: If headers are invalid
672
- """
673
- if len(headers) < 2:
674
- raise ExcelParsingError('Excel file must have at least 2 columns (file name and metadata)')
675
- self._validate_excel_content(headers, 0) # Validate column count
676
- return headers
677
-
678
- def _process_excel_data_row(self, row: tuple, headers: tuple) -> Optional[Dict[str, Any]]:
679
- """Process a single Excel data row.
680
-
681
- Args:
682
- row: Raw row data from Excel
683
- headers: Excel headers
684
-
685
- Returns:
686
- Optional[Dict[str, Any]]: Processed row data or None if row should be skipped
687
- """
688
- # Skip empty rows
689
- if not row[0] or str(row[0]).strip() == '':
690
- return None
691
-
692
- file_name = str(row[0]).strip()
693
- if not self.excel_utils.is_valid_filename_length(file_name):
694
- self.run.log_message_with_code('FILENAME_TOO_LONG', file_name[:50])
695
- return None
696
-
697
- # Create metadata dictionary from remaining columns
698
- file_metadata: Dict[str, Any] = {}
699
- for i, value in enumerate(row[1:], start=1):
700
- if value is not None and i < len(headers):
701
- header_value = headers[i]
702
- column_name = str(header_value).strip() if header_value is not None else f'column_{i}'
703
-
704
- # Validate and truncate column name and value
705
- column_name = self.excel_utils.validate_and_truncate_string(
706
- column_name, self.excel_config.MAX_COLUMN_NAME_LENGTH
707
- )
708
- str_value = self.excel_utils.validate_and_truncate_string(
709
- str(value), self.excel_config.MAX_METADATA_VALUE_LENGTH
710
- )
711
- file_metadata[column_name] = str_value
712
-
713
- return {file_name: file_metadata} if file_metadata else None
714
-
715
- def _process_excel_worksheet(self, worksheet) -> Dict[str, Dict[str, Any]]:
716
- """Process Excel worksheet and extract metadata.
717
-
718
- Args:
719
- worksheet: openpyxl worksheet object
720
-
721
- Returns:
722
- Dict[str, Dict[str, Any]]: Extracted metadata dictionary
723
-
724
- Raises:
725
- ExcelParsingError: If worksheet processing fails
726
- """
727
- if worksheet is None:
728
- raise ExcelParsingError('Excel file has no active worksheet')
729
-
730
- metadata_dict: Dict[str, Dict[str, Any]] = {}
731
- headers: Optional[tuple] = None
732
- data_row_count = 0
733
- validation_interval = getattr(self.excel_config, 'VALIDATION_CHECK_INTERVAL', 1000)
734
-
735
- # Process rows one by one for memory efficiency
736
- for row_idx, row in enumerate(worksheet.iter_rows(values_only=True)):
737
- if not row or all(cell is None or str(cell).strip() == '' for cell in row):
738
- continue # Skip completely empty rows
739
-
740
- if row_idx == 0: # Header row
741
- headers = self._process_excel_headers(row)
742
- continue
743
-
744
- # Data rows
745
- if headers is None:
746
- raise ExcelParsingError('Excel file missing header row')
747
-
748
- data_row_count += 1
749
-
750
- # Validate row count periodically
751
- if data_row_count % validation_interval == 0:
752
- self._validate_excel_content(headers, data_row_count)
753
-
754
- # Process individual row
755
- row_result = self._process_excel_data_row(row, headers)
756
- if row_result:
757
- metadata_dict.update(row_result)
758
-
759
- # Final validation
760
- self._validate_excel_content(headers or (), data_row_count)
761
-
762
- return metadata_dict
763
-
764
- def _validate_excel_content(self, headers: tuple, row_count: int) -> None:
765
- """Validate Excel content constraints.
766
-
767
- Args:
768
- headers: Tuple of header values from the first row
769
- row_count: Total number of data rows processed
770
-
771
- Raises:
772
- ExcelParsingError: If content validation fails
773
- """
774
- # Limit number of columns to prevent memory exhaustion
775
- if len(headers) > self.excel_config.MAX_COLUMNS:
776
- raise ExcelParsingError(f'Too many columns: {len(headers)} (max: {self.excel_config.MAX_COLUMNS})')
777
-
778
- # Limit number of rows to prevent excessive processing
779
- if row_count > self.excel_config.MAX_ROWS:
780
- raise ExcelParsingError(f'Too many rows: {row_count} (max: {self.excel_config.MAX_ROWS})')
781
-
782
- def _find_excel_metadata_file(self, pathlib_cwd: Path) -> Optional[Path]:
783
- """Find Excel metadata file in the directory.
784
-
785
- Checks for meta.xlsx and meta.xls in the given directory.
786
-
787
- Args:
788
- pathlib_cwd (Path): The pathlib object representing the current working directory.
789
-
790
- Returns:
791
- Optional[Path]: Path to the Excel metadata file if found, None otherwise.
792
- """
793
- # Check for xlsx first, then xls
794
- for extension in ['.xlsx', '.xls']:
795
- excel_path = pathlib_cwd / f'meta{extension}'
796
- if excel_path.exists() and excel_path.is_file():
797
- return excel_path
798
- return None
799
-
800
- def _read_excel_metadata(self, pathlib_cwd: Path) -> Dict[str, Dict[str, Any]]:
801
- """Read metadata from excel file with comprehensive security validation.
802
-
803
- This method orchestrates the Excel metadata reading process by delegating
804
- to specialized methods for each step.
805
-
806
- Args:
807
- pathlib_cwd (Path): The pathlib object representing the current working directory.
808
-
809
- Returns:
810
- Dict[str, Dict[str, Any]]: Dictionary mapping file names to their metadata key-value pairs.
811
- Empty dict if no metadata is configured or if reading fails.
812
-
813
- Raises:
814
- ExcelSecurityError: If security validation fails
815
- ExcelParsingError: If Excel content is invalid or exceeds limits
816
- """
817
- excel_path = None
818
-
819
- # Check if user provided a specific excel_metadata_path
820
- excel_metadata_path = self.params.get('excel_metadata_path')
821
- if excel_metadata_path:
822
- excel_path = pathlib_cwd / excel_metadata_path
823
- if not excel_path.exists():
824
- self.run.log_message_with_code('EXCEL_FILE_NOT_FOUND_PATH')
825
- return {}
826
- else:
827
- # Look for default meta.xlsx or meta.xls
828
- excel_path = self._find_excel_metadata_file(pathlib_cwd)
829
- if not excel_path:
830
- # No Excel metadata file found, return empty dict (not an error)
831
- return {}
832
-
833
- try:
834
- self.run.log_message_with_code('EXCEL_FILE_VALIDATION_STARTED')
835
-
836
- # Prepare Excel file with security validation
837
- excel_stream = self._prepare_excel_file(excel_path)
838
-
839
- # Load and process workbook
840
- workbook = load_workbook(excel_stream, read_only=True, data_only=True)
841
- try:
842
- self.run.log_message_with_code('EXCEL_WORKBOOK_LOADED')
843
- return self._process_excel_worksheet(workbook.active)
844
- finally:
845
- workbook.close()
846
-
847
- except ExcelSecurityError as e:
848
- self.run.log_message_with_code('EXCEL_SECURITY_VALIDATION_FAILED', str(e))
849
- raise
850
- except ExcelParsingError as e:
851
- self.run.log_message_with_code('EXCEL_PARSING_FAILED', str(e))
852
- raise
853
- except InvalidFileException as e:
854
- self.run.log_message_with_code('EXCEL_INVALID_FILE_FORMAT', str(e))
855
- raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
856
- except MemoryError:
857
- self.run.log_message_with_code('EXCEL_FILE_TOO_LARGE')
858
- raise ExcelSecurityError('Excel file exceeds memory limits')
859
- except (OSError, IOError) as e:
860
- self.run.log_message_with_code('EXCEL_FILE_ACCESS_ERROR', str(e))
861
- raise ExcelParsingError(f'File access error: {str(e)}')
862
- except Exception as e:
863
- self.run.log_message_with_code('EXCEL_UNEXPECTED_ERROR', str(e))
864
- raise ExcelParsingError(f'Unexpected error: {str(e)}')
865
-
866
- def start(self) -> Dict[str, Any]:
867
- """Start upload process.
868
-
869
- Returns:
870
- Dict: The result of the upload process.
871
- """
872
- # Setup result dict early for error handling
873
- result: Dict[str, Any] = {}
874
-
875
- # Setup path object with path and storage.
876
- storage_id = self.params.get('storage')
877
- if storage_id is None:
878
- raise ActionError('Storage parameter is required')
879
- storage = self.client.get_storage(storage_id)
880
-
881
- path = self.params.get('path')
882
- if path is None:
883
- raise ActionError('Path parameter is required')
884
- pathlib_cwd = get_pathlib(storage, path)
885
-
886
- # Read excel metadata if configured or default file exists
887
- excel_metadata: Dict[str, Dict[str, Any]] = {}
888
- try:
889
- excel_metadata = self._read_excel_metadata(pathlib_cwd)
890
- if excel_metadata:
891
- self.run.log_message_with_code('EXCEL_METADATA_LOADED', len(excel_metadata))
892
- except ExcelSecurityError as e:
893
- # Security violations should stop the process entirely
894
- self.run.log_message_with_code('EXCEL_SECURITY_VIOLATION', str(e))
895
- return result
896
- except ExcelParsingError as e:
897
- # Parsing errors can be non-critical if user didn't explicitly provide Excel file
898
- if self.params.get('excel_metadata_path'):
899
- # User explicitly provided Excel file, treat as error
900
- self.run.log_message_with_code('EXCEL_PARSING_ERROR', str(e))
901
- return result
902
- else:
903
- # Default Excel file found but failed to parse, treat as warning and continue
904
- self.run.log_message_with_code('EXCEL_PARSING_ERROR', str(e))
905
- excel_metadata = {}
906
-
907
- # Analyze Collection file specifications to determine the data structure for upload.
908
- file_specification_template = self._analyze_collection()
909
- organized_files = self._organize_files(pathlib_cwd, file_specification_template, excel_metadata)
910
-
911
- # Initialize uploader.
912
- uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files, self.params)
913
-
914
- # Get organized files from the uploader (plugin developer's custom implementation)
915
- # or use the default organization method if uploader doesn't provide valid files
916
- organized_files = uploader.handle_upload_files()
917
-
918
- # Validate the organized files
919
- if not self._validate_organized_files(organized_files, file_specification_template):
920
- self.run.log_message_with_code('VALIDATION_FAILED')
921
- raise ActionError('Upload is aborted due to validation errors.')
922
-
923
- # Upload files to synapse-backend.
924
- if not organized_files:
925
- self.run.log_message_with_code('NO_FILES_FOUND')
926
- raise ActionError('Upload is aborted due to missing files.')
927
- # Choose upload method based on async parameter
928
- if self.params.get('use_async_upload', True):
929
- uploaded_files = self.run_async(self._upload_files_async(organized_files, 10))
930
- else:
931
- uploaded_files = self._upload_files(organized_files)
932
- result['uploaded_files_count'] = len(uploaded_files)
933
-
934
- # Generate data units for the uploaded data.
935
- if not uploaded_files:
936
- self.run.log_message_with_code('NO_FILES_UPLOADED')
937
- raise ActionError('Upload is aborted due to no uploaded files.')
938
- generated_data_units = self._generate_data_units(
939
- uploaded_files, self.params.get('creating_data_unit_batch_size', 1)
940
- )
941
- result['generated_data_units_count'] = len(generated_data_units)
942
-
943
- # Setup task with uploaded synapse-backend data units.
944
- if not generated_data_units:
945
- self.run.log_message_with_code('NO_DATA_UNITS_GENERATED')
946
- raise ActionError('Upload is aborted due to no generated data units.')
947
-
948
- # Clean up if temp dir exists
949
- self._cleanup_temp_directory()
950
-
951
- self.run.log_message_with_code('IMPORT_COMPLETED')
952
- return result
953
-
954
- def _analyze_collection(self) -> Dict[str, Any]:
955
- """Analyze Synapse Collection Specifications.
956
-
957
- Returns:
958
- Dict: The file specifications of the collection.
959
- """
960
-
961
- # Initialize progress
962
- self.run.set_progress(0, 2, category='analyze_collection')
963
-
964
- collection_id = self.params.get('data_collection')
965
- if collection_id is None:
966
- raise ActionError('Data collection parameter is required')
967
- self.run.set_progress(1, 2, category='analyze_collection')
968
-
969
- collection = self.run.client.get_data_collection(collection_id)
970
- self.run.set_progress(2, 2, category='analyze_collection')
971
-
972
- return collection['file_specifications']
973
-
974
- def _upload_files(self, organized_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
975
- """Upload files to synapse-backend.
976
-
977
- Returns:
978
- Dict: The result of the upload.
979
- """
980
- # Initialize progress
981
- organized_files_count = len(organized_files)
982
- self.run.set_progress(0, organized_files_count, category='upload_data_files')
983
- self.run.log_message_with_code('UPLOADING_DATA_FILES')
984
-
985
- client = self.run.client
986
- collection_id = self.params.get('data_collection')
987
- if collection_id is None:
988
- raise ActionError('Data collection parameter is required')
989
- upload_result = []
990
- current_progress = 0
991
- success_count = 0
992
- failed_count = 0
993
-
994
- # Initialize metrics
995
- self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
996
-
997
- for organized_file in organized_files:
998
- try:
999
- # Determine if chunked upload should be used based on file size
1000
- use_chunked_upload = self._requires_chunked_upload(organized_file)
1001
- uploaded_data_file = client.upload_data_file(organized_file, collection_id, use_chunked_upload)
1002
- self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
1003
- success_count += 1
1004
- upload_result.append(uploaded_data_file)
1005
- except Exception as e:
1006
- self.run.log_data_file(organized_file, UploadStatus.FAILED)
1007
- self.run.log_message_with_code('FILE_UPLOAD_FAILED', str(e))
1008
- failed_count += 1
1009
-
1010
- current_progress += 1
1011
- self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
1012
- self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')
1013
-
1014
- # Finish progress
1015
- self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
1016
-
1017
- return upload_result
1018
-
1019
- def run_async(self, coro: Awaitable[T]) -> T:
1020
- """Run async coroutine safely using asyncio.run().
1021
-
1022
- This method properly manages event loop lifecycle and prevents
1023
- resource exhaustion from repeated event loop creation.
1024
-
1025
- Args:
1026
- coro: The coroutine to execute
1027
-
1028
- Returns:
1029
- The result of the coroutine execution
1030
-
1031
- Raises:
1032
- RuntimeError: If called from within an existing event loop
1033
- """
1034
- import concurrent.futures
1035
-
1036
- def _run_in_thread():
1037
- """Run the coroutine in a separate thread to avoid event loop conflicts."""
1038
- return asyncio.run(coro)
1039
-
1040
- # Check if we're already in an event loop
1041
- try:
1042
- # If this doesn't raise, we're in an event loop
1043
- asyncio.get_running_loop()
1044
- # Run in thread pool to avoid "RuntimeError: cannot be called from a running event loop"
1045
- with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
1046
- future = executor.submit(_run_in_thread)
1047
- return future.result()
1048
- except RuntimeError:
1049
- # No event loop running, safe to use asyncio.run directly
1050
- return asyncio.run(coro)
1051
-
1052
- async def _upload_files_async(
1053
- self, organized_files: List[Dict[str, Any]], max_concurrent: int = 10
1054
- ) -> List[Dict[str, Any]]:
1055
- """Upload files to synapse-backend asynchronously with concurrency control."""
1056
- # Initialize progress
1057
- organized_files_count = len(organized_files)
1058
- self.run.set_progress(0, organized_files_count, category='upload_data_files')
1059
- self.run.log_message_with_code('UPLOADING_DATA_FILES')
1060
-
1061
- client = self.run.client
1062
- collection_id = self.params.get('data_collection')
1063
- if collection_id is None:
1064
- raise ActionError('Data collection parameter is required')
1065
- upload_result = []
1066
- success_count = 0
1067
- failed_count = 0
1068
-
1069
- # Initialize metrics
1070
- self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
1071
-
1072
- # Control concurrency with semaphore
1073
- semaphore = asyncio.Semaphore(max_concurrent)
1074
-
1075
- async def upload_single_file(organized_file):
1076
- async with semaphore:
1077
- loop = asyncio.get_event_loop()
1078
- try:
1079
- # Determine if chunked upload should be used based on file size
1080
- use_chunked_upload = self._requires_chunked_upload(organized_file)
1081
- # Run sync upload_data_file in thread pool
1082
- uploaded_data_file = await loop.run_in_executor(
1083
- None, lambda: client.upload_data_file(organized_file, collection_id, use_chunked_upload)
1084
- )
1085
- self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
1086
- return {'status': 'success', 'result': uploaded_data_file}
1087
- except ClientError as e:
1088
- # Handle API client errors (network, authentication, server errors)
1089
- self.run.log_data_file(organized_file, UploadStatus.FAILED)
1090
- self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Client error: {str(e)}')
1091
- return {'status': 'failed', 'error': str(e), 'error_type': 'client_error', 'retryable': True}
1092
- except (OSError, IOError) as e:
1093
- # Handle file system errors (file not found, permissions, disk full)
1094
- self.run.log_data_file(organized_file, UploadStatus.FAILED)
1095
- self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'File system error: {str(e)}')
1096
- return {'status': 'failed', 'error': str(e), 'error_type': 'file_error', 'retryable': False}
1097
- except MemoryError as e:
1098
- # Handle out of memory errors (large files)
1099
- self.run.log_data_file(organized_file, UploadStatus.FAILED)
1100
- self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Memory error (file too large): {str(e)}')
1101
- return {'status': 'failed', 'error': str(e), 'error_type': 'memory_error', 'retryable': False}
1102
- except asyncio.TimeoutError as e:
1103
- # Handle timeout errors (slow network, large files)
1104
- self.run.log_data_file(organized_file, UploadStatus.FAILED)
1105
- self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Upload timeout: {str(e)}')
1106
- return {'status': 'failed', 'error': str(e), 'error_type': 'timeout_error', 'retryable': True}
1107
- except ValueError as e:
1108
- # Handle data validation errors (invalid file format, metadata issues)
1109
- self.run.log_data_file(organized_file, UploadStatus.FAILED)
1110
- self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Data validation error: {str(e)}')
1111
- return {'status': 'failed', 'error': str(e), 'error_type': 'validation_error', 'retryable': False}
1112
- except Exception as e:
1113
- # Handle any remaining unexpected errors
1114
- self.run.log_data_file(organized_file, UploadStatus.FAILED)
1115
- self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Unexpected error: {str(e)}')
1116
- return {'status': 'failed', 'error': str(e), 'error_type': 'unknown_error', 'retryable': False}
1117
-
1118
- # Create tasks for all files
1119
- tasks = [upload_single_file(organized_file) for organized_file in organized_files]
1120
-
1121
- # Process files with progress updates
1122
- current_progress = 0
1123
- for completed_task in asyncio.as_completed(tasks):
1124
- result = await completed_task
1125
- current_progress += 1
1126
-
1127
- if result['status'] == 'success':
1128
- success_count += 1
1129
- upload_result.append(result['result'])
1130
- else:
1131
- failed_count += 1
1132
-
1133
- # Update metrics and progress
1134
- self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
1135
- self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')
1136
-
1137
- # Finish progress
1138
- self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
1139
-
1140
- return upload_result
1141
-
1142
- def _generate_data_units(self, uploaded_files: List[Dict[str, Any]], batch_size: int) -> List[Dict[str, Any]]:
1143
- """Generate data units for the uploaded data.
1144
-
1145
- TODO: make dynamic batch size depend on uploaded file sizes
1146
-
1147
- Returns:
1148
- Dict: The result of the generate data units process.
1149
- """
1150
- # Initialize progress
1151
- upload_result_count = len(uploaded_files)
1152
- self.run.set_progress(0, upload_result_count, category='generate_data_units')
1153
- self.run.log_message_with_code('GENERATING_DATA_UNITS')
1154
-
1155
- client = self.run.client
1156
- generated_data_units = []
1157
- current_progress = 0
1158
- success_count = 0
1159
- failed_count = 0
1160
-
1161
- batches = get_batched_list(uploaded_files, batch_size)
1162
- batches_count = len(batches)
1163
-
1164
- # Initialize metrics
1165
- self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')
1166
-
1167
- for batch in batches:
1168
- try:
1169
- created_data_units = client.create_data_units(batch)
1170
- success_count += len(created_data_units)
1171
- generated_data_units.append(created_data_units)
1172
- for created_data_unit in created_data_units:
1173
- self.run.log_data_unit(
1174
- created_data_unit['id'], UploadStatus.SUCCESS, data_unit_meta=created_data_unit.get('meta')
1175
- )
1176
- except Exception as e:
1177
- failed_count += len(batch)
1178
- self.run.log_message_with_code('DATA_UNIT_BATCH_FAILED', str(e))
1179
- for _ in batch:
1180
- self.run.log_data_unit(None, UploadStatus.FAILED, data_unit_meta=None)
1181
-
1182
- current_progress += 1
1183
- self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')
1184
- self.run.set_progress(current_progress, batches_count, category='generate_data_units')
1185
-
1186
- # Finish progress
1187
- self.run.set_progress(upload_result_count, upload_result_count, category='generate_data_units')
1188
-
1189
- return sum(generated_data_units, [])
1190
-
1191
- def _validate_organized_files(
1192
- self, organized_files: List[Dict[str, Any]], file_specification_template: Dict[str, Any]
1193
- ) -> bool:
1194
- """Validate organized files from Uploader."""
1195
- validator = FileSpecificationValidator(file_specification_template, organized_files)
1196
- return validator.validate()
1197
-
1198
- def _organize_files(
1199
- self,
1200
- directory: Path,
1201
- file_specification: List[Dict[str, Any]],
1202
- excel_metadata: Optional[Dict[str, Dict[str, Any]]] = None,
1203
- ) -> List[Dict[str, Any]]:
1204
- """Organize files according to the file specification.
1205
-
1206
- Args:
1207
- directory (Path): Root directory containing files to organize.
1208
- file_specification (List[Dict[str, Any]]): File specification list with metadata.
1209
- excel_metadata (Optional[Dict[str, Dict[str, Any]]]): Dictionary mapping file names
1210
- to their metadata key-value pairs from excel file.
1211
-
1212
- Returns:
1213
- List[Dict[str, Any]]: List of dictionaries containing organized files with metadata.
1214
- """
1215
- organized_files: List[Dict[str, Any]] = []
1216
-
1217
- # Check for type-based directory structure (e.g., image_1/, pcd_1/)
1218
- type_dirs: Dict[str, Path] = {}
1219
-
1220
- for spec in file_specification:
1221
- spec_name = spec['name']
1222
- spec_dir = directory / spec_name
1223
- if spec_dir.exists() and spec_dir.is_dir():
1224
- type_dirs[spec_name] = spec_dir
1225
-
1226
- if type_dirs:
1227
- self.run.log_message_with_code('TYPE_DIRECTORIES_FOUND', list(type_dirs.keys()))
1228
-
1229
- # If type-based directories don't exist, exit early
1230
- if not type_dirs:
1231
- self.run.log_message_with_code('NO_TYPE_DIRECTORIES')
1232
- return organized_files
1233
-
1234
- self.run.log_message_with_code('TYPE_STRUCTURE_DETECTED')
1235
- self.run.log_message_with_code('FILE_ORGANIZATION_STARTED')
1236
-
1237
- # Collect and process files in a single pass
1238
- dataset_files = {}
1239
- required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]
1240
-
1241
- # Get recursive setting from params
1242
- is_recursive = self.params.get('is_recursive', True)
1243
-
1244
- # Process all files from all type directories
1245
- for spec_name, dir_path in type_dirs.items():
1246
- # Use appropriate method based on recursive setting
1247
- if is_recursive:
1248
- files_list = self._discover_files_recursive(dir_path)
1249
- else:
1250
- files_list = self._discover_files_non_recursive(dir_path)
1251
-
1252
- for file_path in files_list:
1253
- # Always use filename only for matching
1254
- file_name = file_path.stem
1255
-
1256
- # Initialize dataset entry if it doesn't exist
1257
- if file_name not in dataset_files:
1258
- dataset_files[file_name] = {}
1259
-
1260
- # Map this file to its specification (handle duplicates)
1261
- if spec_name not in dataset_files[file_name]:
1262
- dataset_files[file_name][spec_name] = file_path
1263
- else:
1264
- existing_file = dataset_files[file_name][spec_name]
1265
- if file_path.stat().st_mtime > existing_file.stat().st_mtime:
1266
- dataset_files[file_name][spec_name] = file_path
1267
-
1268
- if not dataset_files:
1269
- self.run.log_message_with_code('NO_FILES_FOUND_WARNING')
1270
- return organized_files
1271
-
1272
- self.run.log_message_with_code('FILES_DISCOVERED', len(dataset_files))
1273
-
1274
- # Organize datasets - check requirements and create metadata
1275
- for file_name, files_dict in sorted(dataset_files.items()):
1276
- if all(req in files_dict for req in required_specs):
1277
- # Get most common file extension
1278
- file_extensions = {}
1279
- for file_path in files_dict.values():
1280
- ext = file_path.suffix.lower()
1281
- if ext:
1282
- file_extensions[ext] = file_extensions.get(ext, 0) + 1
1283
-
1284
- origin_file_extension = max(file_extensions.items(), key=lambda x: x[1])[0] if file_extensions else ''
1285
-
1286
- # Create metadata for this dataset
1287
- meta_data: Dict[str, Any] = {
1288
- 'origin_file_stem': file_name,
1289
- 'origin_file_extension': origin_file_extension,
1290
- 'created_at': datetime.now().isoformat(),
1291
- }
1292
-
1293
- # Add excel metadata if available
1294
- if excel_metadata and file_name in excel_metadata:
1295
- meta_data.update(excel_metadata[file_name])
1296
-
1297
- # Add the organized dataset
1298
- organized_files.append({'files': files_dict, 'meta': meta_data})
1299
- else:
1300
- missing = [req for req in required_specs if req not in files_dict]
1301
- self.run.log_message_with_code('MISSING_REQUIRED_FILES', file_name, ', '.join(missing))
1302
-
1303
- return organized_files
1304
-
1305
- def _get_file_size_mb(self, file_path: Path) -> float:
1306
- """Get file size in MB.
1307
-
1308
- Args:
1309
- file_path (Path): Path to the file.
1310
-
1311
- Returns:
1312
- float: File size in MB.
1313
- """
1314
- return file_path.stat().st_size / (1024 * 1024)
1315
-
1316
- def _requires_chunked_upload(self, organized_file: Dict[str, Any]) -> bool:
1317
- """Determine if chunked upload is required based on file size threshold.
1318
-
1319
- Args:
1320
- organized_file (Dict[str, Any]): Organized file data with 'files' dict.
1321
-
1322
- Returns:
1323
- bool: True if any file exceeds the threshold, False otherwise.
1324
- """
1325
- max_file_size_mb = self.params.get('max_file_size_mb', 50)
1326
- for file_path in organized_file.get('files', {}).values():
1327
- if isinstance(file_path, Path) and self._get_file_size_mb(file_path) > max_file_size_mb:
1328
- return True
1329
- return False
1330
-
1331
- def _cleanup_temp_directory(self, temp_path: Optional[Path] = None) -> None:
1332
- """Clean up temporary directory.
1333
-
1334
- Args:
1335
- temp_path (Optional[Path]): Path to temporary directory.
1336
- If None, uses default temp directory in current working directory.
1337
- """
1338
- if temp_path is None:
1339
- try:
1340
- temp_path = Path(os.getcwd()) / 'temp'
1341
- except (FileNotFoundError, OSError):
1342
- return
1343
-
1344
- if not temp_path.exists():
1345
- return
1346
-
1347
- shutil.rmtree(temp_path, ignore_errors=True)
1348
- self.run.log_message(f'Cleaned up temporary directory: {temp_path}')
1349
-
1350
- def _update_metrics(self, total_count: int, success_count: int, failed_count: int, category: str):
1351
- """Update metrics for upload progress.
1352
-
1353
- Args:
1354
- total_count (int): Total number of items to process.
1355
- success_count (int): Number of successfully processed items.
1356
- failed_count (int): Number of failed items.
1357
- category (str): The category of the metrics.
1358
- """
1359
- if not self.run:
1360
- raise ValueError('Run instance not properly initialized')
1361
-
1362
- # Type assertion to help the linter
1363
- assert isinstance(self.run, UploadRun)
1364
-
1365
- metrics = self.run.MetricsRecord(
1366
- stand_by=total_count - success_count - failed_count, failed=failed_count, success=success_count
1367
- )
1368
- self.run.log_metrics(metrics, category)