synapse-sdk 1.0.0a86__py3-none-any.whl → 1.0.0a88__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synapse-sdk might be problematic.
- synapse_sdk/plugins/categories/data_validation/templates/plugin/validation.py +7 -3
- synapse_sdk/plugins/categories/upload/actions/upload.py +550 -96
- synapse_sdk/plugins/categories/upload/templates/config.yaml +0 -2
- synapse_sdk/plugins/templates/plugin-config-schema.json +0 -10
- synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/requirements.txt +1 -1
- {synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/METADATA +1 -1
- {synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/RECORD +11 -11
- {synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/WHEEL +0 -0
- {synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/entry_points.txt +0 -0
- {synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/licenses/LICENSE +0 -0
- {synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/top_level.txt +0 -0
synapse_sdk/plugins/categories/data_validation/templates/plugin/validation.py

@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional
 from synapse_sdk.plugins.categories.data_validation.actions.validation import ValidationDataStatus, ValidationResult
 
 
-def validate(data: Optional[Dict[str, Any]] = None, **kwargs: Any) ->
+def validate(data: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Dict[str, Any]:
     """Validate data with assignment data.
 
     * Custom validation logic can be added here.
@@ -15,7 +15,7 @@ def validate(data: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Validation
         **kwargs: Additional arguments.
 
     Returns:
-
+        Dict[str, Any]: The validation result as a dictionary with ValidationResult structure.
     """
     errors: List[str] = []
 
@@ -26,4 +26,8 @@ def validate(data: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Validation
     # Determine status based on errors
     status = ValidationDataStatus.FAILED if errors else ValidationDataStatus.SUCCESS
 
-
+    # DO NOT MODIFY BELOW THIS LINE - Validation result should be returned as a dumped ValidationResult.
+    validation_result = ValidationResult(status=status, errors=errors)
+    result_dict = validation_result.model_dump()
+    result_dict['status'] = status.value
+    return result_dict
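For illustration only (not part of the diff): assuming ValidationResult carries at least status and errors, the template now returns a plain dict instead of a ValidationResult instance, with the status flattened to the enum's value:

    result = validate({'label': 'car'})
    # e.g. {'status': 'success', 'errors': []}
    # The exact status string and any extra keys depend on ValidationDataStatus and the ValidationResult model.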
synapse_sdk/plugins/categories/upload/actions/upload.py

@@ -1,9 +1,13 @@
 import json
+import os
 from datetime import datetime
 from enum import Enum
+from io import BytesIO
 from pathlib import Path
-from typing import Annotated, Dict, List
+from typing import Annotated, Any, Dict, List, Optional
 
+from openpyxl import load_workbook
+from openpyxl.utils.exceptions import InvalidFileException
 from pydantic import AfterValidator, BaseModel, field_validator
 from pydantic_core import PydanticCustomError
 
@@ -39,6 +43,7 @@ class UploadRun(Run):
         data_unit_id: int | None
         status: UploadStatus
         created: str
+        data_unit_meta: dict | None
 
     class TaskLog(BaseModel):
         """Task log model."""
@@ -59,26 +64,29 @@ class UploadRun(Run):
 
         Args:
             data_file_info (dict): The json info of the data file.
-
-            status (DataUnitStatus): The status of the data unit.
+            status (UploadStatus): The status of the data file.
         """
         now = datetime.now().isoformat()
+        data_file_info_str = json.dumps(data_file_info, ensure_ascii=False)
         self.log(
             'upload_data_file',
-            self.DataFileLog(data_file_info=
+            self.DataFileLog(data_file_info=data_file_info_str, status=status.value, created=now).model_dump(),
         )
 
-    def log_data_unit(self, data_unit_id: int, status: UploadStatus):
+    def log_data_unit(self, data_unit_id: int, status: UploadStatus, data_unit_meta: dict | None = None):
         """Upload data_unit log.
 
         Args:
             data_unit_id (int): The ID of the data unit.
-            status (
+            status (UploadStatus): The status of the data unit.
+            data_unit_meta (dict | None): The metadata of the data unit.
         """
         now = datetime.now().isoformat()
         self.log(
             'upload_data_unit',
-            self.DataUnitLog(
+            self.DataUnitLog(
+                data_unit_id=data_unit_id, status=status.value, created=now, data_unit_meta=data_unit_meta
+            ).model_dump(),
         )
 
     def log_task(self, task_id: int, status: UploadStatus):
@@ -101,6 +109,78 @@ class UploadRun(Run):
         self.set_metrics(value=record.model_dump(), category=category)
 
 
+class ExcelSecurityError(Exception):
+    """Custom exception for Excel security validation errors."""
+
+    pass
+
+
+class ExcelParsingError(Exception):
+    """Custom exception for Excel parsing errors."""
+
+    pass
+
+
+class ExcelSecurityConfig:
+    """Configuration class for Excel security settings."""
+
+    def __init__(self):
+        # File size limits
+        self.MAX_FILE_SIZE_MB = int(os.getenv('EXCEL_MAX_FILE_SIZE_MB', '10'))
+        self.MAX_FILE_SIZE_BYTES = self.MAX_FILE_SIZE_MB * 1024 * 1024
+
+        # Memory limits
+        self.MAX_MEMORY_USAGE_MB = int(os.getenv('EXCEL_MAX_MEMORY_MB', '30'))
+        self.MAX_MEMORY_USAGE_BYTES = self.MAX_MEMORY_USAGE_MB * 1024 * 1024
+
+        # Content limits
+        self.MAX_ROWS = int(os.getenv('EXCEL_MAX_ROWS', '10000'))
+        self.MAX_COLUMNS = int(os.getenv('EXCEL_MAX_COLUMNS', '50'))
+
+        # String length limits
+        self.MAX_FILENAME_LENGTH = int(os.getenv('EXCEL_MAX_FILENAME_LENGTH', '255'))
+        self.MAX_COLUMN_NAME_LENGTH = int(os.getenv('EXCEL_MAX_COLUMN_NAME_LENGTH', '100'))
+        self.MAX_METADATA_VALUE_LENGTH = int(os.getenv('EXCEL_MAX_METADATA_VALUE_LENGTH', '1000'))
+
+
+class ExcelMetadataUtils:
+    """Utility class for Excel metadata processing with shared validation logic."""
+
+    def __init__(self, config: ExcelSecurityConfig):
+        self.config = config
+
+    def validate_and_truncate_string(self, value: str, max_length: int) -> str:
+        """Validate and truncate string to specified maximum length.
+
+        Args:
+            value: String value to validate and truncate
+            max_length: Maximum allowed length
+
+        Returns:
+            str: Validated and potentially truncated string
+        """
+        if not isinstance(value, str):
+            value = str(value)
+
+        value = value.strip()
+
+        if len(value) > max_length:
+            return value[:max_length]
+
+        return value
+
+    def is_valid_filename_length(self, filename: str) -> bool:
+        """Check if filename length is within acceptable limits.
+
+        Args:
+            filename: Filename to check
+
+        Returns:
+            bool: True if filename length is acceptable
+        """
+        return len(filename.strip()) <= self.config.MAX_FILENAME_LENGTH
+
+
 class UploadParams(BaseModel):
     """Upload action parameters.
 
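As an illustration (not part of the diff): the limits above are read from environment variables with the defaults shown, so a deployment can tighten them without code changes. A minimal sketch:

    import os

    os.environ['EXCEL_MAX_FILE_SIZE_MB'] = '5'   # tighten the default 10 MB cap
    os.environ['EXCEL_MAX_ROWS'] = '2000'

    config = ExcelSecurityConfig()
    utils = ExcelMetadataUtils(config)
    utils.is_valid_filename_length('image_001.jpg')                                            # True (<= 255 chars)
    utils.validate_and_truncate_string('  padded value  ', config.MAX_METADATA_VALUE_LENGTH)   # 'padded value'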
@@ -112,6 +192,8 @@ class UploadParams(BaseModel):
         storage (int): The storage of the action.
         collection (int): The collection of the action.
         project (int | None): The project of the action.
+        use_excel_metadata (bool): Whether to use excel file for additional metadata.
+        excel_metadata_path (str | None): Path to excel file containing metadata.
     """
 
     name: Annotated[str, AfterValidator(non_blank)]
@@ -120,8 +202,8 @@ class UploadParams(BaseModel):
     storage: int
     collection: int
     project: int | None
-
-
+    use_excel_metadata: bool = False
+    excel_metadata_path: str | None = None
 
     @field_validator('storage', mode='before')
     @classmethod
@@ -166,6 +248,74 @@ class UploadParams(BaseModel):
             raise PydanticCustomError('client_error', _('Error occurred while checking project exists.'))
         return value
 
+    @field_validator('excel_metadata_path', mode='before')
+    @classmethod
+    def check_excel_metadata_path(cls, value: str, info) -> str:
+        """Validate excel metadata file exists and is secure if use_excel_metadata is True.
+
+        This validator performs comprehensive security checks including:
+        - File existence and format validation
+        - File size limits (max 10MB)
+        - Basic security checks for file content
+
+        Args:
+            value: The excel file path to validate
+            info: Validation context information
+
+        Returns:
+            str: The validated file path
+
+        Raises:
+            PydanticCustomError: If validation fails
+        """
+        if not value:
+            return value
+
+        # Check if use_excel_metadata is True
+        data = info.data
+        if data.get('use_excel_metadata', False):
+            excel_path = Path(value)
+
+            # Check file existence
+            if not excel_path.exists():
+                raise PydanticCustomError('file_not_found', _('Excel metadata file not found.'))
+
+            # Check file extension
+            if excel_path.suffix.lower() not in ['.xlsx', '.xls']:
+                raise PydanticCustomError('invalid_file_type', _('Excel metadata file must be .xlsx or .xls format.'))
+
+            # Security check: file size limit
+            file_size = excel_path.stat().st_size
+            excel_config = ExcelSecurityConfig()
+            if file_size > excel_config.MAX_FILE_SIZE_BYTES:
+                raise PydanticCustomError(
+                    'file_too_large',
+                    _('Excel metadata file is too large. Maximum size is {}MB.').format(excel_config.MAX_FILE_SIZE_MB),
+                )
+
+            # Basic security check: ensure file is readable and not corrupted
+            try:
+                with open(excel_path, 'rb') as f:
+                    # Read first few bytes to check if it's a valid Excel file
+                    header = f.read(8)
+                    if not header:
+                        raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be empty.'))
+
+                    # Check for valid Excel file signatures
+                    if excel_path.suffix.lower() == '.xlsx':
+                        # XLSX files start with PK (ZIP signature)
+                        if not header.startswith(b'PK'):
+                            raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be corrupted.'))
+                    elif excel_path.suffix.lower() == '.xls':
+                        # XLS files have specific OLE signatures
+                        if not (header.startswith(b'\xd0\xcf\x11\xe0') or header.startswith(b'\x09\x08')):
+                            raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be corrupted.'))
+
+            except (OSError, IOError):
+                raise PydanticCustomError('file_access_error', _('Cannot access Excel metadata file.'))
+
+        return value
+
 
 @register_action
 class UploadAction(Action):
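As context (an illustrative sketch, not from the diff): the corruption check above relies on well-known magic bytes. An .xlsx workbook is a ZIP container that begins with PK, while a legacy .xls file carries an OLE2 signature. A standalone version of that check could look like:

    from pathlib import Path

    def looks_like_excel(path: Path) -> bool:
        """Rough magic-byte check mirroring the validator's logic (illustration only)."""
        header = path.read_bytes()[:8]
        suffix = path.suffix.lower()
        if suffix == '.xlsx':
            return header.startswith(b'PK')  # ZIP container signature
        if suffix == '.xls':
            return header.startswith(b'\xd0\xcf\x11\xe0') or header.startswith(b'\x09\x08')  # OLE2 signatures
        return False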
@@ -194,13 +344,13 @@ class UploadAction(Action):
     run_class = UploadRun
     progress_categories = {
         'analyze_collection': {
-            'proportion':
+            'proportion': 2,
         },
         'upload_data_files': {
-            'proportion':
+            'proportion': 38,
         },
         'generate_data_units': {
-            'proportion':
+            'proportion': 60,
         },
     }
     metrics_categories = {
@@ -216,30 +366,283 @@ class UploadAction(Action):
         },
     }
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.excel_config = ExcelSecurityConfig()
+        self.excel_utils = ExcelMetadataUtils(self.excel_config)
+
     def get_uploader(self, path, file_specification, organized_files):
         """Get uploader from entrypoint."""
         return self.entrypoint(self.run, path, file_specification, organized_files)
 
-    def
+    def _validate_excel_security(self, excel_path: Path) -> None:
+        """Validate Excel file security constraints.
+
+        Performs comprehensive security validation including:
+        - File size limits
+        - Memory usage constraints
+        - Basic malicious content detection
+
+        Args:
+            excel_path: Path to the Excel file to validate
+
+        Raises:
+            ExcelSecurityError: If security validation fails
+        """
+        # File size check (already done in validator, but double-check)
+        file_size = excel_path.stat().st_size
+        if file_size > self.excel_config.MAX_FILE_SIZE_BYTES:
+            raise ExcelSecurityError(
+                f'Excel file too large: {file_size} bytes (max: {self.excel_config.MAX_FILE_SIZE_BYTES})'
+            )
+
+        # Memory usage estimation (rough estimate: file_size * 3 for processing)
+        estimated_memory = file_size * 3
+        if estimated_memory > self.excel_config.MAX_MEMORY_USAGE_BYTES:
+            raise ExcelSecurityError(
+                f'Excel file may consume too much memory: ~{estimated_memory} bytes '
+                f'(max: {self.excel_config.MAX_MEMORY_USAGE_BYTES})'
+            )
+
+    def _prepare_excel_file(self, excel_path: Path) -> BytesIO:
+        """Prepare Excel file for reading with security validation.
+
+        Args:
+            excel_path: Path to the Excel file
+
+        Returns:
+            BytesIO: Excel file stream ready for reading
+
+        Raises:
+            ExcelSecurityError: If security validation fails
+        """
+        self._validate_excel_security(excel_path)
+        excel_bytes = excel_path.read_bytes()
+        return BytesIO(excel_bytes)
+
+    def _process_excel_headers(self, headers: tuple) -> tuple:
+        """Process and validate Excel headers.
+
+        Args:
+            headers: Raw header tuple from Excel
+
+        Returns:
+            tuple: Validated headers
+
+        Raises:
+            ExcelParsingError: If headers are invalid
+        """
+        if len(headers) < 2:
+            raise ExcelParsingError('Excel file must have at least 2 columns (file name and metadata)')
+        self._validate_excel_content(headers, 0)  # Validate column count
+        return headers
+
+    def _process_excel_data_row(self, row: tuple, headers: tuple) -> Optional[Dict[str, Any]]:
+        """Process a single Excel data row.
+
+        Args:
+            row: Raw row data from Excel
+            headers: Excel headers
+
+        Returns:
+            Optional[Dict[str, Any]]: Processed row data or None if row should be skipped
+        """
+        # Skip empty rows
+        if not row[0] or str(row[0]).strip() == '':
+            return None
+
+        file_name = str(row[0]).strip()
+        if not self.excel_utils.is_valid_filename_length(file_name):
+            self.run.log_message(
+                f'Skipping file with overly long name: {file_name[:50]}...', context=Context.WARNING.value
+            )
+            return None
+
+        # Create metadata dictionary from remaining columns
+        file_metadata: Dict[str, Any] = {}
+        for i, value in enumerate(row[1:], start=1):
+            if value is not None and i < len(headers):
+                header_value = headers[i]
+                column_name = str(header_value).strip() if header_value is not None else f'column_{i}'
+
+                # Validate and truncate column name and value
+                column_name = self.excel_utils.validate_and_truncate_string(
+                    column_name, self.excel_config.MAX_COLUMN_NAME_LENGTH
+                )
+                str_value = self.excel_utils.validate_and_truncate_string(
+                    str(value), self.excel_config.MAX_METADATA_VALUE_LENGTH
+                )
+                file_metadata[column_name] = str_value
+
+        return {file_name: file_metadata} if file_metadata else None
+
+    def _process_excel_worksheet(self, worksheet) -> Dict[str, Dict[str, Any]]:
+        """Process Excel worksheet and extract metadata.
+
+        Args:
+            worksheet: openpyxl worksheet object
+
+        Returns:
+            Dict[str, Dict[str, Any]]: Extracted metadata dictionary
+
+        Raises:
+            ExcelParsingError: If worksheet processing fails
+        """
+        if worksheet is None:
+            raise ExcelParsingError('Excel file has no active worksheet')
+
+        metadata_dict: Dict[str, Dict[str, Any]] = {}
+        headers: Optional[tuple] = None
+        data_row_count = 0
+        validation_interval = getattr(self.excel_config, 'VALIDATION_CHECK_INTERVAL', 1000)
+
+        # Process rows one by one for memory efficiency
+        for row_idx, row in enumerate(worksheet.iter_rows(values_only=True)):
+            if not row or all(cell is None or str(cell).strip() == '' for cell in row):
+                continue  # Skip completely empty rows
+
+            if row_idx == 0:  # Header row
+                headers = self._process_excel_headers(row)
+                continue
+
+            # Data rows
+            if headers is None:
+                raise ExcelParsingError('Excel file missing header row')
+
+            data_row_count += 1
+
+            # Validate row count periodically
+            if data_row_count % validation_interval == 0:
+                self._validate_excel_content(headers, data_row_count)
+
+            # Process individual row
+            row_result = self._process_excel_data_row(row, headers)
+            if row_result:
+                metadata_dict.update(row_result)
+
+        # Final validation
+        self._validate_excel_content(headers or (), data_row_count)
+
+        return metadata_dict
+
+    def _validate_excel_content(self, headers: tuple, row_count: int) -> None:
+        """Validate Excel content constraints.
+
+        Args:
+            headers: Tuple of header values from the first row
+            row_count: Total number of data rows processed
+
+        Raises:
+            ExcelParsingError: If content validation fails
+        """
+        # Limit number of columns to prevent memory exhaustion
+        if len(headers) > self.excel_config.MAX_COLUMNS:
+            raise ExcelParsingError(f'Too many columns: {len(headers)} (max: {self.excel_config.MAX_COLUMNS})')
+
+        # Limit number of rows to prevent excessive processing
+        if row_count > self.excel_config.MAX_ROWS:
+            raise ExcelParsingError(f'Too many rows: {row_count} (max: {self.excel_config.MAX_ROWS})')
+
+    def _read_excel_metadata(self, pathlib_cwd: Path) -> Dict[str, Dict[str, Any]]:
+        """Read metadata from excel file with comprehensive security validation.
+
+        This method orchestrates the Excel metadata reading process by delegating
+        to specialized methods for each step.
+
+        Args:
+            pathlib_cwd (Path): The pathlib object representing the current working directory.
+
+        Returns:
+            Dict[str, Dict[str, Any]]: Dictionary mapping file names to their metadata key-value pairs.
+                Empty dict if no metadata is configured or if reading fails.
+
+        Raises:
+            ExcelSecurityError: If security validation fails
+            ExcelParsingError: If Excel content is invalid or exceeds limits
+        """
+        if not self.params.get('use_excel_metadata', False) or not self.params.get('excel_metadata_path'):
+            return {}
+
+        excel_path = pathlib_cwd / self.params['excel_metadata_path']
+        if not excel_path.exists():
+            self.run.log_message(f'Excel metadata file not found: {excel_path}', context=Context.WARNING.value)
+            return {}
+
+        try:
+            # Prepare Excel file with security validation
+            excel_stream = self._prepare_excel_file(excel_path)
+
+            # Load and process workbook
+            workbook = load_workbook(excel_stream, read_only=True, data_only=True)
+            try:
+                return self._process_excel_worksheet(workbook.active)
+            finally:
+                workbook.close()
+
+        except ExcelSecurityError as e:
+            self.run.log_message(f'Excel security validation failed: {str(e)}', context=Context.ERROR.value)
+            raise
+        except ExcelParsingError as e:
+            self.run.log_message(f'Excel parsing failed: {str(e)}', context=Context.ERROR.value)
+            raise
+        except InvalidFileException as e:
+            self.run.log_message(f'Invalid Excel file format: {str(e)}', context=Context.ERROR.value)
+            raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
+        except MemoryError:
+            self.run.log_message('Excel file too large to process (memory limit exceeded)', context=Context.ERROR.value)
+            raise ExcelSecurityError('Excel file exceeds memory limits')
+        except (OSError, IOError) as e:
+            self.run.log_message(f'File access error reading excel metadata: {str(e)}', context=Context.ERROR.value)
+            raise ExcelParsingError(f'File access error: {str(e)}')
+        except Exception as e:
+            self.run.log_message(f'Unexpected error reading excel metadata: {str(e)}', context=Context.ERROR.value)
+            raise ExcelParsingError(f'Unexpected error: {str(e)}')
+
|
+
def start(self) -> Dict[str, Any]:
|
|
224
602
|
"""Start upload process.
|
|
225
603
|
|
|
226
604
|
Returns:
|
|
227
605
|
Dict: The result of the upload process.
|
|
228
606
|
"""
|
|
607
|
+
# Setup result dict early for error handling
|
|
608
|
+
result: Dict[str, Any] = {}
|
|
609
|
+
|
|
229
610
|
# Setup path object with path and storage.
|
|
230
611
|
storage = self.client.get_storage(self.params['storage'])
|
|
231
612
|
pathlib_cwd = get_pathlib(storage, self.params['path'])
|
|
232
613
|
|
|
614
|
+
# Read excel metadata if configured
|
|
615
|
+
excel_metadata: Dict[str, Dict[str, Any]] = {}
|
|
616
|
+
try:
|
|
617
|
+
excel_metadata = self._read_excel_metadata(pathlib_cwd)
|
|
618
|
+
if excel_metadata:
|
|
619
|
+
self.run.log_message(f'Excel metadata loaded for {len(excel_metadata)} files')
|
|
620
|
+
elif self.params.get('use_excel_metadata', False):
|
|
621
|
+
self.run.log_message('Excel metadata enabled but no entries found')
|
|
622
|
+
# Don't log anything if Excel metadata is not being used
|
|
623
|
+
except ExcelSecurityError as e:
|
|
624
|
+
# Security violations should stop the process entirely
|
|
625
|
+
self.run.log_message(f'Excel security validation failed: {str(e)}', context=Context.ERROR.value)
|
|
626
|
+
self.run.log_message('Upload aborted due to Excel security concerns.', context=Context.ERROR.value)
|
|
627
|
+
return result
|
|
628
|
+
except ExcelParsingError as e:
|
|
629
|
+
# Parsing errors can be non-critical if Excel metadata is optional
|
|
630
|
+
if self.params.get('use_excel_metadata', False):
|
|
631
|
+
self.run.log_message(f'Excel parsing failed: {str(e)}', context=Context.ERROR.value)
|
|
632
|
+
self.run.log_message('Upload aborted due to Excel parsing failure.', context=Context.ERROR.value)
|
|
633
|
+
return result
|
|
634
|
+
else:
|
|
635
|
+
# If Excel metadata is not explicitly enabled, treat as warning and continue
|
|
636
|
+
self.run.log_message(f'Excel parsing failed (continuing): {str(e)}', context=Context.WARNING.value)
|
|
637
|
+
excel_metadata = {}
|
|
638
|
+
|
|
233
639
|
# Analyze Collection file specifications to determine the data structure for upload.
|
|
234
640
|
file_specification_template = self._analyze_collection()
|
|
235
|
-
organized_files = self._organize_files(pathlib_cwd, file_specification_template)
|
|
641
|
+
organized_files = self._organize_files(pathlib_cwd, file_specification_template, excel_metadata)
|
|
236
642
|
|
|
237
643
|
# Initialize uploader.
|
|
238
644
|
uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files)
|
|
239
645
|
|
|
240
|
-
# Setup result dict.
|
|
241
|
-
result = {}
|
|
242
|
-
|
|
243
646
|
# Get organized files from the uploader (plugin developer's custom implementation)
|
|
244
647
|
# or use the default organization method if uploader doesn't provide valid files
|
|
245
648
|
organized_files = uploader.handle_upload_files()
|
|
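For illustration only (the sheet layout is inferred from the parsing code above, and the header and file names here are hypothetical): the metadata workbook is read row by row, the first row is treated as headers, column A holds the file name that is later matched against each dataset, and the remaining columns become metadata keys. A workbook in that shape could be produced with openpyxl:

    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    ws.append(['file_name', 'scene', 'weather'])                 # header row; the column A header itself is not used
    ws.append(['S_DCH_230725_0156_FC_037', 'highway', 'rain'])   # one data row per file
    ws.append(['S_DCH_230725_0156_LF_037', 'urban', 'clear'])
    wb.save('metadata.xlsx')

    # _read_excel_metadata would then yield roughly:
    # {'S_DCH_230725_0156_FC_037': {'scene': 'highway', 'weather': 'rain'},
    #  'S_DCH_230725_0156_LF_037': {'scene': 'urban', 'weather': 'clear'}}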
@@ -247,37 +650,35 @@ class UploadAction(Action):
         # Validate the organized files
         if not self._validate_organized_files(organized_files, file_specification_template):
             self.run.log_message('Validation failed.', context=Context.ERROR.value)
-            self.run.
+            self.run.log_message('Upload is aborted due to validation errors.', context=Context.ERROR.value)
             return result
 
         # Upload files to synapse-backend.
-
-        if not organized_files_count:
+        if not organized_files:
             self.run.log_message('Files not found on the path.', context=Context.WARNING.value)
-            self.run.
+            self.run.log_message('Upload is aborted due to missing files.', context=Context.ERROR.value)
             return result
-        uploaded_files = self._upload_files(organized_files
+        uploaded_files = self._upload_files(organized_files)
         result['uploaded_files_count'] = len(uploaded_files)
 
         # Generate data units for the uploaded data.
-
-        if not upload_result_count:
+        if not uploaded_files:
             self.run.log_message('No files were uploaded.', context=Context.WARNING.value)
-            self.run.
+            self.run.log_message('Upload is aborted due to no uploaded files.', context=Context.ERROR.value)
             return result
-        generated_data_units = self._generate_data_units(uploaded_files
+        generated_data_units = self._generate_data_units(uploaded_files)
         result['generated_data_units_count'] = len(generated_data_units)
 
         # Setup task with uploaded synapse-backend data units.
-        if not
+        if not generated_data_units:
             self.run.log_message('No data units were generated.', context=Context.WARNING.value)
-            self.run.
+            self.run.log_message('Upload is aborted due to no generated data units.', context=Context.ERROR.value)
             return result
 
-        self.run.
+        self.run.log_message('Import completed.')
         return result
 
-    def _analyze_collection(self) -> Dict:
+    def _analyze_collection(self) -> Dict[str, Any]:
         """Analyze Synapse Collection Specifications.
 
         Returns:
@@ -285,50 +686,60 @@ class UploadAction(Action):
         """
 
         # Initialize progress
-        self.run.set_progress(0,
+        self.run.set_progress(0, 2, category='analyze_collection')
 
-        client = self.run.client
         collection_id = self.params['data_collection']
-
+        self.run.set_progress(1, 2, category='analyze_collection')
 
-
-        self.run.set_progress(
+        collection = self.run.client.get_data_collection(collection_id)
+        self.run.set_progress(2, 2, category='analyze_collection')
+        self.run.log_message('Collection analysis completed.')
 
         return collection['file_specifications']
 
-    def _upload_files(self, organized_files,
+    def _upload_files(self, organized_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Upload files to synapse-backend.
 
         Returns:
             Dict: The result of the upload.
         """
         # Initialize progress
+        organized_files_count = len(organized_files)
         self.run.set_progress(0, organized_files_count, category='upload_data_files')
         self.run.log_message('Uploading data files...')
-        data_file_metrics_record = self.run.MetricsRecord(stand_by=organized_files_count, success=0, failed=0)
 
         client = self.run.client
         collection_id = self.params['data_collection']
         upload_result = []
-        organized_files_count = len(organized_files)
         current_progress = 0
+        success_count = 0
+        failed_count = 0
+
+        # Initialize metrics
+        self._update_metrics(organized_files_count, success_count, failed_count, 'data_file')
+
         for organized_file in organized_files:
-
-
-
-
-
-
-
+            try:
+                uploaded_data_file = client.upload_data_file(organized_file, collection_id)
+                self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
+                success_count += 1
+                upload_result.append(uploaded_data_file)
+            except Exception as e:
+                self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                self.run.log_message(f'Failed to upload file: {str(e)}')
+                failed_count += 1
+
             current_progress += 1
+            self._update_metrics(organized_files_count, success_count, failed_count, 'data_file')
+            self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')
 
         # Finish progress
         self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
-        self.run.log_message('Upload data files completed.')
+        self.run.log_message(f'Upload data files completed. Success: {success_count}, Failed: {failed_count}')
 
         return upload_result
 
-    def _generate_data_units(self, uploaded_files: List,
+    def _generate_data_units(self, uploaded_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Generate data units for the uploaded data.
 
         TODO: make batch size configurable.
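As an aside (illustration only): the per-item metrics maintain a simple invariant, stand_by + success + failed equals the total, which is how the _update_metrics helper introduced later in this diff derives its stand_by figure:

    total, success, failed = 10, 4, 1
    stand_by = total - success - failed   # 5, as passed to MetricsRecord(stand_by=..., success=..., failed=...)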
@@ -337,76 +748,93 @@ class UploadAction(Action):
             Dict: The result of the generate data units process.
         """
         # Initialize progress
+        upload_result_count = len(uploaded_files)
         self.run.set_progress(0, upload_result_count, category='generate_data_units')
-
+        self.run.log_message('Generating data units...')
 
         client = self.run.client
-
         generated_data_units = []
         current_progress = 0
+        success_count = 0
+        failed_count = 0
+
         batches = get_batched_list(uploaded_files, 100)
         batches_count = len(batches)
+
+        # Initialize metrics
+        self._update_metrics(upload_result_count, success_count, failed_count, 'data_unit')
+
         for batch in batches:
-
-
-
-
-
-
+            try:
+                created_data_units = client.create_data_units(batch)
+                success_count += len(created_data_units)
+                generated_data_units.append(created_data_units)
+                for created_data_unit in created_data_units:
+                    self.run.log_data_unit(
+                        created_data_unit['id'], UploadStatus.SUCCESS, data_unit_meta=created_data_unit.get('meta')
+                    )
+            except Exception as e:
+                failed_count += len(batch)
+                self.run.log_message(f'Failed to create data units batch: {str(e)}')
+                for _ in batch:
+                    self.run.log_data_unit(None, UploadStatus.FAILED, data_unit_meta=None)
+
             current_progress += 1
-
-
+            self._update_metrics(upload_result_count, success_count, failed_count, 'data_unit')
+            self.run.set_progress(current_progress, batches_count, category='generate_data_units')
 
         # Finish progress
         self.run.set_progress(upload_result_count, upload_result_count, category='generate_data_units')
+        self.run.log_message(f'Data units generation completed. Success: {success_count}, Failed: {failed_count}')
 
         return sum(generated_data_units, [])
 
-    def _validate_organized_files(
+    def _validate_organized_files(
+        self, organized_files: List[Dict[str, Any]], file_specification_template: Dict[str, Any]
+    ) -> bool:
         """Validate organized files from Uploader."""
         validator = FileSpecificationValidator(file_specification_template, organized_files)
         return validator.validate()
 
-    def _organize_files(
+    def _organize_files(
+        self,
+        directory: Path,
+        file_specification: List[Dict[str, Any]],
+        excel_metadata: Optional[Dict[str, Dict[str, Any]]] = None,
+    ) -> List[Dict[str, Any]]:
         """Organize files according to the file specification.
+
         This method handles type-based directory structure where files are organized in
         directories named after file types (e.g., 'image_1/' directory contains image files
         like '1.jpg', '2.jpg'). For each dataset ID found in the primary directory, it attempts
         to find corresponding files in all type directories.
 
-        TODO
-        (e.g. pcd:S_DCH_230725_0156_LR_037.pcd, image_1:S_DCH_230725_0156_FC_037,
+        TODO: Add Logic to handle file specific name patterns and extensions.
+        (e.g. pcd:S_DCH_230725_0156_LR_037.pcd, image_1:S_DCH_230725_0156_FC_037,
+        image_2:S_DCH_230725_0156_LF_037.jpg)
+
         Args:
             directory (Path): Root directory containing files to organize.
-            file_specification (List): File specification list.
+            file_specification (List[Dict[str, Any]]): File specification list with metadata.
+            excel_metadata (Optional[Dict[str, Dict[str, Any]]]): Dictionary mapping file names
+                to their metadata key-value pairs from excel file.
+
         Returns:
-            List: List of dictionaries containing organized files.
+            List[Dict[str, Any]]: List of dictionaries containing organized files with metadata.
         """
-        organized_files = []
+        organized_files: List[Dict[str, Any]] = []
 
         # Check for type-based directory structure (e.g., image_1/, pcd_1/)
-        type_dirs = {}
-        type_extensions = {}  # Store common extensions for each type directory
+        type_dirs: Dict[str, Path] = {}
 
         for spec in file_specification:
            spec_name = spec['name']
-
            spec_dir = directory / spec_name
            if spec_dir.exists() and spec_dir.is_dir():
                type_dirs[spec_name] = spec_dir
 
-
-
-                for file_path in spec_dir.glob('*'):
-                    if file_path.is_file():
-                        ext = file_path.suffix.lower()
-                        extensions[ext] = extensions.get(ext, 0) + 1
-
-                # Find the most common extension
-                if extensions:
-                    common_ext = max(extensions.items(), key=lambda x: x[1])[0]
-                    type_extensions[spec_name] = common_ext
-                    self.run.log_message(f'Found type directory: {spec_name} (extension: {common_ext})')
+        if type_dirs:
+            self.run.log_message(f'Found type directories: {list(type_dirs.keys())}')
 
         # If type-based directories don't exist, exit early
         if not type_dirs:
@@ -415,10 +843,11 @@ class UploadAction(Action):
 
         self.run.log_message('Detected type-based directory structure')
 
-        #
-        dataset_files = {}
+        # Collect and process files in a single pass
+        dataset_files = {}
+        required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]
 
-        #
+        # Process all files from all type directories
        for spec_name, dir_path in type_dirs.items():
            for file_path in dir_path.glob('*'):
                if file_path.is_file():
@@ -428,45 +857,70 @@ class UploadAction(Action):
                     if file_name not in dataset_files:
                         dataset_files[file_name] = {}
 
-                    # Map this file to its specification
+                    # Map this file to its specification (handle duplicates)
                     if spec_name not in dataset_files[file_name]:
                         dataset_files[file_name][spec_name] = file_path
                     else:
-                        # If multiple files with same file_name for same spec, use most recent
                         existing_file = dataset_files[file_name][spec_name]
                         if file_path.stat().st_mtime > existing_file.stat().st_mtime:
                             dataset_files[file_name][spec_name] = file_path
-                            self.run.log_message(
-                                f"Found newer file for name of {file_name}, spec '{spec_name}': "
-                                f'{file_path.name} (replacing {existing_file.name})'
-                            )
 
         if not dataset_files:
             self.run.log_message('No files found.', context=Context.WARNING.value)
             return organized_files
 
-        self.run.log_message(f'
+        self.run.log_message(f'Discovered {len(dataset_files)} files')
 
-        #
+        # Organize datasets - check requirements and create metadata
         for file_name, files_dict in sorted(dataset_files.items()):
-            # Check if all required files are present
-            required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]
             if all(req in files_dict for req in required_specs):
+                # Get most common file extension
+                file_extensions = {}
+                for file_path in files_dict.values():
+                    ext = file_path.suffix.lower()
+                    if ext:
+                        file_extensions[ext] = file_extensions.get(ext, 0) + 1
+
+                origin_file_extension = max(file_extensions.items(), key=lambda x: x[1])[0] if file_extensions else ''
+
                 # Create metadata for this dataset
-                meta_data = {
+                meta_data: Dict[str, Any] = {
                     'origin_file_stem': file_name,
+                    'origin_file_extension': origin_file_extension,
                     'created_at': datetime.now().isoformat(),
                 }
 
+                # Add excel metadata if available
+                if excel_metadata and file_name in excel_metadata:
+                    meta_data.update(excel_metadata[file_name])
+
                 # Add the organized dataset
                 organized_files.append({'files': files_dict, 'meta': meta_data})
             else:
-                # Missing required files warning
                 missing = [req for req in required_specs if req not in files_dict]
                 self.run.log_message(
-                    f'Dataset ID {file_name}
+                    f'Dataset ID {file_name} missing required files: {", ".join(missing)}',
                     context=Context.WARNING.value,
                 )
 
-        self.run.log_message(f'Total datasets organized: {len(organized_files)}')
         return organized_files
+
+    def _update_metrics(self, total_count: int, success_count: int, failed_count: int, category: str):
+        """Update metrics for upload progress.
+
+        Args:
+            total_count (int): Total number of items to process.
+            success_count (int): Number of successfully processed items.
+            failed_count (int): Number of failed items.
+            category (str): The category of the metrics.
+        """
+        if not self.run:
+            raise ValueError('Run instance not properly initialized')
+
+        # Type assertion to help the linter
+        assert isinstance(self.run, UploadRun)
+
+        metrics = self.run.MetricsRecord(
+            stand_by=total_count - success_count - failed_count, failed=failed_count, success=success_count
+        )
+        self.run.log_metrics(metrics, category)
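As an illustration (directory and spec names here are hypothetical): _organize_files expects one sub-directory per file-specification name, with files matched across those directories by identical stem; per-type name patterns are still a TODO in the diff. A qualifying upload root and the resulting entry might look like:

    # upload_root/
    #   image_1/
    #     S_DCH_230725_0156_037.jpg
    #   image_2/
    #     S_DCH_230725_0156_037.jpg
    #   pcd/
    #     S_DCH_230725_0156_037.pcd
    #
    # Each stem with all required specs present becomes one organized entry, roughly:
    # {'files': {'image_1': Path('upload_root/image_1/S_DCH_230725_0156_037.jpg'),
    #            'image_2': Path('upload_root/image_2/S_DCH_230725_0156_037.jpg'),
    #            'pcd': Path('upload_root/pcd/S_DCH_230725_0156_037.pcd')},
    #  'meta': {'origin_file_stem': 'S_DCH_230725_0156_037',
    #           'origin_file_extension': '.jpg',        # most common extension among the matched files
    #           'created_at': '2025-01-01T00:00:00',
    #           # ...plus any Excel metadata recorded for that stem
    #          }}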
synapse_sdk/plugins/categories/upload/templates/config.yaml

@@ -2,8 +2,6 @@ actions:
   upload:
     entrypoint: plugin.upload.Uploader
     options:
-      allow_generate_tasks: false # Allow the plugin to generate tasks for the uploaded data
-      allow_generate_ground_truths: false # Allow the plugin to generate ground truths for the uploaded data
       supported_data_type: image # A primary data type of synapse backend collection. (e.g. 'image', 'text', 'video', 'pcd', 'audio')
       ui_schema: |
         Dumped FormKit Schema for upload plugin custom options
synapse_sdk/plugins/templates/plugin-config-schema.json

@@ -241,16 +241,6 @@
         "type": "boolean",
         "description": "Whether to enable visualization for training",
         "default": false
-      },
-      "allow_generate_tasks": {
-        "type": "boolean",
-        "description": "Allow the plugin to generate tasks for uploaded data",
-        "default": false
-      },
-      "allow_generate_ground_truths": {
-        "type": "boolean",
-        "description": "Allow the plugin to generate ground truths for uploaded data",
-        "default": false
       }
     }
   },
synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/requirements.txt

@@ -1 +1 @@
-synapse-sdk[all]
+synapse-sdk[all]

(Both sides show the same text; the change is likely whitespace-only, such as a trailing newline.)
{synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/RECORD

@@ -120,7 +120,7 @@ synapse_sdk/plugins/categories/data_validation/actions/__init__.py,sha256=47DEQp
 synapse_sdk/plugins/categories/data_validation/actions/validation.py,sha256=E6irjH3aylnS_nIlXVDdqwpkvMxlTwzmedYeXj5sgSk,2262
 synapse_sdk/plugins/categories/data_validation/templates/config.yaml,sha256=Hijb-b3hy0msZsTV_bbr3Hvlk8ok-Rk0a05mlLGTCAg,66
 synapse_sdk/plugins/categories/data_validation/templates/plugin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-synapse_sdk/plugins/categories/data_validation/templates/plugin/validation.py,sha256=
+synapse_sdk/plugins/categories/data_validation/templates/plugin/validation.py,sha256=O-5eFC-njZzt7tAZlIRAG2UT_zhMEYDrio1e2g4UjVY,1263
 synapse_sdk/plugins/categories/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 synapse_sdk/plugins/categories/export/enums.py,sha256=gtyngvQ1DKkos9iKGcbecwTVQQ6sDwbrBPSGPNb5Am0,127
 synapse_sdk/plugins/categories/export/actions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -165,12 +165,12 @@ synapse_sdk/plugins/categories/smart_tool/templates/plugin/__init__.py,sha256=47
 synapse_sdk/plugins/categories/smart_tool/templates/plugin/auto_label.py,sha256=eevNg0nOcYFR4z_L_R-sCvVOYoLWSAH1jwDkAf3YCjY,320
 synapse_sdk/plugins/categories/upload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 synapse_sdk/plugins/categories/upload/actions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-synapse_sdk/plugins/categories/upload/actions/upload.py,sha256=
-synapse_sdk/plugins/categories/upload/templates/config.yaml,sha256=
+synapse_sdk/plugins/categories/upload/actions/upload.py,sha256=Nhlq1aIvnhybxKhm7KlJoTGP3NO-JsE2Ya7Dgi7HVOg,37427
+synapse_sdk/plugins/categories/upload/templates/config.yaml,sha256=6_dRa0_J2aS8NSUfO4MKbPxZcdPS2FpJzzp51edYAZc,281
 synapse_sdk/plugins/categories/upload/templates/plugin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 synapse_sdk/plugins/categories/upload/templates/plugin/upload.py,sha256=IZU4sdSMSLKPCtlNqF7DP2howTdYR6hr74HCUZsGdPk,1559
 synapse_sdk/plugins/templates/cookiecutter.json,sha256=NxOWk9A_v1pO0Ny4IYT9Cj5iiJ16--cIQrGC67QdR0I,396
-synapse_sdk/plugins/templates/plugin-config-schema.json,sha256=
+synapse_sdk/plugins/templates/plugin-config-schema.json,sha256=_ff1dnHdWPcWjD1Wy8Wn0aT6TnMD5J5Sdms3RTSWCUs,11805
 synapse_sdk/plugins/templates/schema.json,sha256=E5bZjskN0mqQtk2eLAB1nfXufIoGvyrtkMMP-wGKSdc,14029
 synapse_sdk/plugins/templates/hooks/post_gen_project.py,sha256=jqlYkY1O2TxIR-Vh3gnwILYy8k-D39Xx66d2KNQVMCs,147
 synapse_sdk/plugins/templates/hooks/pre_prompt.py,sha256=aOAMM623s0sKFGjTZaotAOYFvsNMxeii4tPyhOAFKVE,539
@@ -179,7 +179,7 @@ synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/.pre-c
 synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/README.md,sha256=ETBZv_2Ocgzn4Fe3o5Y842mZiz00ABuAalrXpNVnWU0,56
 synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/config.yaml,sha256=tbX80vu-c_FyRJgPQXp8WoxJ2XhTYF8c1qv7SdHwddY,230
 synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/pyproject.toml,sha256=Usgd80tHZAD1Ug5MAjPfETUZxtKKgZW-xovFEAEbQDo,317
-synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/requirements.txt,sha256=
+synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/requirements.txt,sha256=TtWSjUr7lzkNdnFrLIEaC8ZMHAL3pEu602pcccLen0o,16
 synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/plugin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 synapse_sdk/plugins/utils/__init__.py,sha256=h868T8mxZDgo746W5u0YUB_biHhPov0sr4usFjefYJw,988
 synapse_sdk/plugins/utils/actions.py,sha256=QjzISxUMyrcFtOphaYP8tdAkAAzOUZlfnMbVwR_zt2I,3736
@@ -221,9 +221,9 @@ synapse_sdk/utils/storage/providers/gcp.py,sha256=i2BQCu1Kej1If9SuNr2_lEyTcr5M_n
 synapse_sdk/utils/storage/providers/http.py,sha256=2DhIulND47JOnS5ZY7MZUex7Su3peAPksGo1Wwg07L4,5828
 synapse_sdk/utils/storage/providers/s3.py,sha256=ZmqekAvIgcQBdRU-QVJYv1Rlp6VHfXwtbtjTSphua94,2573
 synapse_sdk/utils/storage/providers/sftp.py,sha256=_8s9hf0JXIO21gvm-JVS00FbLsbtvly4c-ETLRax68A,1426
-synapse_sdk-1.0.
-synapse_sdk-1.0.
-synapse_sdk-1.0.
-synapse_sdk-1.0.
-synapse_sdk-1.0.
-synapse_sdk-1.0.
+synapse_sdk-1.0.0a88.dist-info/licenses/LICENSE,sha256=bKzmC5YAg4V1Fhl8OO_tqY8j62hgdncAkN7VrdjmrGk,1101
+synapse_sdk-1.0.0a88.dist-info/METADATA,sha256=z0WgvP2Fz3kFGvw7LheZC1_jFFIAIUFcQdMT18UL5EY,3805
+synapse_sdk-1.0.0a88.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+synapse_sdk-1.0.0a88.dist-info/entry_points.txt,sha256=VNptJoGoNJI8yLXfBmhgUefMsmGI0m3-0YoMvrOgbxo,48
+synapse_sdk-1.0.0a88.dist-info/top_level.txt,sha256=ytgJMRK1slVOKUpgcw3LEyHHP7S34J6n_gJzdkcSsw8,12
+synapse_sdk-1.0.0a88.dist-info/RECORD,,
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|