synapse-sdk 1.0.0b22__py3-none-any.whl → 1.0.0b23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synapse-sdk might be problematic. Click here for more details.
- synapse_sdk/devtools/docs/docs/plugins/upload-plugins.md +680 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/upload-plugins.md +897 -0
- synapse_sdk/devtools/docs/sidebars.ts +1 -0
- synapse_sdk/plugins/README.md +934 -0
- synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +20 -0
- synapse_sdk/plugins/categories/upload/actions/upload/action.py +623 -0
- synapse_sdk/plugins/categories/upload/actions/upload/enums.py +221 -0
- synapse_sdk/plugins/categories/upload/actions/upload/exceptions.py +36 -0
- synapse_sdk/plugins/categories/upload/actions/upload/models.py +149 -0
- synapse_sdk/plugins/categories/upload/actions/upload/run.py +178 -0
- synapse_sdk/plugins/categories/upload/actions/upload/utils.py +139 -0
- synapse_sdk/plugins/categories/upload/templates/plugin/upload.py +6 -1
- synapse_sdk/plugins/models.py +13 -7
- {synapse_sdk-1.0.0b22.dist-info → synapse_sdk-1.0.0b23.dist-info}/METADATA +1 -1
- {synapse_sdk-1.0.0b22.dist-info → synapse_sdk-1.0.0b23.dist-info}/RECORD +19 -10
- synapse_sdk/plugins/categories/upload/actions/upload.py +0 -1368
- {synapse_sdk-1.0.0b22.dist-info → synapse_sdk-1.0.0b23.dist-info}/WHEEL +0 -0
- {synapse_sdk-1.0.0b22.dist-info → synapse_sdk-1.0.0b23.dist-info}/entry_points.txt +0 -0
- {synapse_sdk-1.0.0b22.dist-info → synapse_sdk-1.0.0b23.dist-info}/licenses/LICENSE +0 -0
- {synapse_sdk-1.0.0b22.dist-info → synapse_sdk-1.0.0b23.dist-info}/top_level.txt +0 -0
|
@@ -1,1368 +0,0 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
import json
|
|
3
|
-
import os
|
|
4
|
-
import shutil
|
|
5
|
-
from datetime import datetime
|
|
6
|
-
from enum import Enum
|
|
7
|
-
from io import BytesIO
|
|
8
|
-
from pathlib import Path
|
|
9
|
-
from typing import Annotated, Any, Awaitable, Dict, List, Optional, TypeVar
|
|
10
|
-
|
|
11
|
-
from openpyxl import load_workbook
|
|
12
|
-
from openpyxl.utils.exceptions import InvalidFileException
|
|
13
|
-
from pydantic import AfterValidator, BaseModel, field_validator
|
|
14
|
-
from pydantic_core import PydanticCustomError
|
|
15
|
-
|
|
16
|
-
from synapse_sdk.clients.exceptions import ClientError
|
|
17
|
-
from synapse_sdk.clients.utils import get_batched_list
|
|
18
|
-
from synapse_sdk.clients.validators.collections import FileSpecificationValidator
|
|
19
|
-
from synapse_sdk.i18n import gettext as _
|
|
20
|
-
from synapse_sdk.plugins.categories.base import Action
|
|
21
|
-
from synapse_sdk.plugins.categories.decorators import register_action
|
|
22
|
-
from synapse_sdk.plugins.enums import PluginCategory, RunMethod
|
|
23
|
-
from synapse_sdk.plugins.exceptions import ActionError
|
|
24
|
-
from synapse_sdk.plugins.models import Run
|
|
25
|
-
from synapse_sdk.shared.enums import Context
|
|
26
|
-
from synapse_sdk.utils.pydantic.validators import non_blank
|
|
27
|
-
from synapse_sdk.utils.storage import get_pathlib
|
|
28
|
-
|
|
29
|
-
# Type variable for generic async return type
|
|
30
|
-
T = TypeVar('T')
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
class PathAwareJSONEncoder(json.JSONEncoder):
|
|
34
|
-
"""Custom JSON encoder that handles Path-like objects."""
|
|
35
|
-
|
|
36
|
-
def default(self, obj):
|
|
37
|
-
if hasattr(obj, '__fspath__') or hasattr(obj, 'as_posix'):
|
|
38
|
-
# Handle Path-like objects (including UPath, SFTPPath, pathlib.Path, etc.)
|
|
39
|
-
return str(obj)
|
|
40
|
-
elif hasattr(obj, 'isoformat'):
|
|
41
|
-
# Handle datetime objects
|
|
42
|
-
return obj.isoformat()
|
|
43
|
-
# Let the base class handle other types
|
|
44
|
-
return super().default(obj)
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
class UploadStatus(str, Enum):
|
|
48
|
-
SUCCESS = 'success'
|
|
49
|
-
FAILED = 'failed'
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
class UploadRun(Run):
|
|
53
|
-
class UploadEventLog(BaseModel):
|
|
54
|
-
"""Upload event log model."""
|
|
55
|
-
|
|
56
|
-
info: Optional[str] = None
|
|
57
|
-
status: Context
|
|
58
|
-
created: str
|
|
59
|
-
|
|
60
|
-
class DataFileLog(BaseModel):
|
|
61
|
-
"""Data file log model."""
|
|
62
|
-
|
|
63
|
-
data_file_info: str | None
|
|
64
|
-
status: UploadStatus
|
|
65
|
-
created: str
|
|
66
|
-
|
|
67
|
-
class DataUnitLog(BaseModel):
|
|
68
|
-
"""Data unit log model."""
|
|
69
|
-
|
|
70
|
-
data_unit_id: int | None
|
|
71
|
-
status: UploadStatus
|
|
72
|
-
created: str
|
|
73
|
-
data_unit_meta: dict | None
|
|
74
|
-
|
|
75
|
-
class TaskLog(BaseModel):
|
|
76
|
-
"""Task log model."""
|
|
77
|
-
|
|
78
|
-
task_id: int | None
|
|
79
|
-
status: UploadStatus
|
|
80
|
-
created: str
|
|
81
|
-
|
|
82
|
-
class MetricsRecord(BaseModel):
|
|
83
|
-
"""Metrics record model."""
|
|
84
|
-
|
|
85
|
-
stand_by: int
|
|
86
|
-
failed: int
|
|
87
|
-
success: int
|
|
88
|
-
|
|
89
|
-
LOG_MESSAGES = {
|
|
90
|
-
# Validation errors - show in both log_message and EventLog
|
|
91
|
-
'STORAGE_VALIDATION_FAILED': {
|
|
92
|
-
'message': 'Storage validation failed.',
|
|
93
|
-
'level': Context.DANGER,
|
|
94
|
-
},
|
|
95
|
-
'COLLECTION_VALIDATION_FAILED': {
|
|
96
|
-
'message': 'Collection validation failed.',
|
|
97
|
-
'level': Context.DANGER,
|
|
98
|
-
},
|
|
99
|
-
'PROJECT_VALIDATION_FAILED': {
|
|
100
|
-
'message': 'Project validation failed.',
|
|
101
|
-
'level': Context.DANGER,
|
|
102
|
-
},
|
|
103
|
-
'VALIDATION_FAILED': {
|
|
104
|
-
'message': 'Validation failed.',
|
|
105
|
-
'level': Context.DANGER,
|
|
106
|
-
},
|
|
107
|
-
'NO_FILES_FOUND': {
|
|
108
|
-
'message': 'Files not found on the path.',
|
|
109
|
-
'level': Context.WARNING,
|
|
110
|
-
},
|
|
111
|
-
'NO_FILES_UPLOADED': {
|
|
112
|
-
'message': 'No files were uploaded.',
|
|
113
|
-
'level': Context.WARNING,
|
|
114
|
-
},
|
|
115
|
-
'NO_DATA_UNITS_GENERATED': {
|
|
116
|
-
'message': 'No data units were generated.',
|
|
117
|
-
'level': Context.WARNING,
|
|
118
|
-
},
|
|
119
|
-
'NO_TYPE_DIRECTORIES': {
|
|
120
|
-
'message': 'No type-based directory structure found.',
|
|
121
|
-
'level': Context.INFO,
|
|
122
|
-
},
|
|
123
|
-
'EXCEL_SECURITY_VIOLATION': {
|
|
124
|
-
'message': 'Excel security validation failed: {}',
|
|
125
|
-
'level': Context.DANGER,
|
|
126
|
-
},
|
|
127
|
-
'EXCEL_PARSING_ERROR': {
|
|
128
|
-
'message': 'Excel parsing failed: {}',
|
|
129
|
-
'level': Context.DANGER,
|
|
130
|
-
},
|
|
131
|
-
'EXCEL_METADATA_LOADED': {
|
|
132
|
-
'message': 'Excel metadata loaded for {} files',
|
|
133
|
-
'level': None,
|
|
134
|
-
},
|
|
135
|
-
'UPLOADING_DATA_FILES': {
|
|
136
|
-
'message': 'Uploading data files...',
|
|
137
|
-
'level': None,
|
|
138
|
-
},
|
|
139
|
-
'GENERATING_DATA_UNITS': {
|
|
140
|
-
'message': 'Generating data units...',
|
|
141
|
-
'level': None,
|
|
142
|
-
},
|
|
143
|
-
'IMPORT_COMPLETED': {
|
|
144
|
-
'message': 'Import completed.',
|
|
145
|
-
'level': None,
|
|
146
|
-
},
|
|
147
|
-
'TYPE_DIRECTORIES_FOUND': {
|
|
148
|
-
'message': 'Found type directories: {}',
|
|
149
|
-
'level': None,
|
|
150
|
-
},
|
|
151
|
-
'TYPE_STRUCTURE_DETECTED': {
|
|
152
|
-
'message': 'Detected type-based directory structure',
|
|
153
|
-
'level': None,
|
|
154
|
-
},
|
|
155
|
-
'FILES_DISCOVERED': {
|
|
156
|
-
'message': 'Discovered {} files',
|
|
157
|
-
'level': None,
|
|
158
|
-
},
|
|
159
|
-
'NO_FILES_FOUND_WARNING': {
|
|
160
|
-
'message': 'No files found.',
|
|
161
|
-
'level': Context.WARNING,
|
|
162
|
-
},
|
|
163
|
-
'FILE_UPLOAD_FAILED': {
|
|
164
|
-
'message': 'Failed to upload file: {}',
|
|
165
|
-
'level': Context.DANGER,
|
|
166
|
-
},
|
|
167
|
-
'DATA_UNIT_BATCH_FAILED': {
|
|
168
|
-
'message': 'Failed to create data units batch: {}',
|
|
169
|
-
'level': Context.DANGER,
|
|
170
|
-
},
|
|
171
|
-
'FILENAME_TOO_LONG': {
|
|
172
|
-
'message': 'Skipping file with overly long name: {}...',
|
|
173
|
-
'level': Context.WARNING,
|
|
174
|
-
},
|
|
175
|
-
'MISSING_REQUIRED_FILES': {
|
|
176
|
-
'message': '{} missing required files: {}',
|
|
177
|
-
'level': Context.WARNING,
|
|
178
|
-
},
|
|
179
|
-
'EXCEL_FILE_NOT_FOUND': {
|
|
180
|
-
'message': 'Excel metadata file not found: {}',
|
|
181
|
-
'level': Context.WARNING,
|
|
182
|
-
},
|
|
183
|
-
# Debug information - only for EventLog
|
|
184
|
-
'EXCEL_FILE_VALIDATION_STARTED': {
|
|
185
|
-
'message': 'Excel file validation started',
|
|
186
|
-
'level': Context.INFO,
|
|
187
|
-
},
|
|
188
|
-
'EXCEL_WORKBOOK_LOADED': {
|
|
189
|
-
'message': 'Excel workbook loaded successfully',
|
|
190
|
-
'level': Context.INFO,
|
|
191
|
-
},
|
|
192
|
-
'FILE_ORGANIZATION_STARTED': {
|
|
193
|
-
'message': 'File organization started',
|
|
194
|
-
'level': Context.INFO,
|
|
195
|
-
},
|
|
196
|
-
'BATCH_PROCESSING_STARTED': {
|
|
197
|
-
'message': 'Batch processing started: {} batches of {} items each',
|
|
198
|
-
'level': Context.INFO,
|
|
199
|
-
},
|
|
200
|
-
'EXCEL_SECURITY_VALIDATION_STARTED': {
|
|
201
|
-
'message': 'Excel security validation started for file size: {} bytes',
|
|
202
|
-
'level': Context.INFO,
|
|
203
|
-
},
|
|
204
|
-
'EXCEL_MEMORY_ESTIMATION': {
|
|
205
|
-
'message': 'Excel memory estimation: {} bytes (file) * 3 = {} bytes (estimated)',
|
|
206
|
-
'level': Context.INFO,
|
|
207
|
-
},
|
|
208
|
-
'EXCEL_FILE_NOT_FOUND_PATH': {
|
|
209
|
-
'message': 'Excel metadata file not found',
|
|
210
|
-
'level': Context.WARNING,
|
|
211
|
-
},
|
|
212
|
-
'EXCEL_SECURITY_VALIDATION_FAILED': {
|
|
213
|
-
'message': 'Excel security validation failed: {}',
|
|
214
|
-
'level': Context.DANGER,
|
|
215
|
-
},
|
|
216
|
-
'EXCEL_PARSING_FAILED': {
|
|
217
|
-
'message': 'Excel parsing failed: {}',
|
|
218
|
-
'level': Context.DANGER,
|
|
219
|
-
},
|
|
220
|
-
'EXCEL_INVALID_FILE_FORMAT': {
|
|
221
|
-
'message': 'Invalid Excel file format: {}',
|
|
222
|
-
'level': Context.DANGER,
|
|
223
|
-
},
|
|
224
|
-
'EXCEL_FILE_TOO_LARGE': {
|
|
225
|
-
'message': 'Excel file too large to process (memory limit exceeded)',
|
|
226
|
-
'level': Context.DANGER,
|
|
227
|
-
},
|
|
228
|
-
'EXCEL_FILE_ACCESS_ERROR': {
|
|
229
|
-
'message': 'File access error reading excel metadata: {}',
|
|
230
|
-
'level': Context.DANGER,
|
|
231
|
-
},
|
|
232
|
-
'EXCEL_UNEXPECTED_ERROR': {
|
|
233
|
-
'message': 'Unexpected error reading excel metadata: {}',
|
|
234
|
-
'level': Context.DANGER,
|
|
235
|
-
},
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
def log_message_with_code(self, code: str, *args, level: Optional[Context] = None):
|
|
239
|
-
"""Unified logging method that handles both log_message and EventLog based on configuration."""
|
|
240
|
-
if code not in self.LOG_MESSAGES:
|
|
241
|
-
self.log_message(f'Unknown log code: {code}')
|
|
242
|
-
self.log_upload_event('UNKNOWN_LOG_CODE', code)
|
|
243
|
-
return
|
|
244
|
-
|
|
245
|
-
log_config = self.LOG_MESSAGES[code]
|
|
246
|
-
message = log_config['message'].format(*args) if args else log_config['message']
|
|
247
|
-
log_level = level or log_config['level'] or Context.INFO
|
|
248
|
-
|
|
249
|
-
# Log to message if configured
|
|
250
|
-
if log_level == Context.INFO.value:
|
|
251
|
-
self.log_message(message, context=log_level.value)
|
|
252
|
-
else:
|
|
253
|
-
self.log_upload_event(code, *args, level)
|
|
254
|
-
|
|
255
|
-
def log_upload_event(self, code: str, *args, level: Optional[Context] = None):
|
|
256
|
-
"""Log upload event using predefined code."""
|
|
257
|
-
if code not in self.LOG_MESSAGES:
|
|
258
|
-
now = datetime.now().isoformat()
|
|
259
|
-
self.log(
|
|
260
|
-
'upload_event',
|
|
261
|
-
self.UploadEventLog(info=f'Unknown log code: {code}', status=Context.DANGER, created=now).model_dump(),
|
|
262
|
-
)
|
|
263
|
-
return
|
|
264
|
-
|
|
265
|
-
log_config = self.LOG_MESSAGES[code]
|
|
266
|
-
message = log_config['message'].format(*args) if args else log_config['message']
|
|
267
|
-
log_level = level or log_config['level'] or Context.INFO
|
|
268
|
-
|
|
269
|
-
now = datetime.now().isoformat()
|
|
270
|
-
self.log(
|
|
271
|
-
'upload_event',
|
|
272
|
-
self.UploadEventLog(info=message, status=log_level, created=now).model_dump(),
|
|
273
|
-
)
|
|
274
|
-
|
|
275
|
-
def log_data_file(self, data_file_info: dict, status: UploadStatus):
|
|
276
|
-
"""Upload data_file log.
|
|
277
|
-
|
|
278
|
-
Args:
|
|
279
|
-
data_file_info (dict): The json info of the data file.
|
|
280
|
-
status (UploadStatus): The status of the data file.
|
|
281
|
-
"""
|
|
282
|
-
now = datetime.now().isoformat()
|
|
283
|
-
# Use custom JSON encoder to handle Path-like objects
|
|
284
|
-
data_file_info_str = json.dumps(data_file_info, ensure_ascii=False, cls=PathAwareJSONEncoder)
|
|
285
|
-
self.log(
|
|
286
|
-
'upload_data_file',
|
|
287
|
-
self.DataFileLog(data_file_info=data_file_info_str, status=status.value, created=now).model_dump(),
|
|
288
|
-
)
|
|
289
|
-
|
|
290
|
-
def log_data_unit(self, data_unit_id: int, status: UploadStatus, data_unit_meta: dict | None = None):
|
|
291
|
-
"""Upload data_unit log.
|
|
292
|
-
|
|
293
|
-
Args:
|
|
294
|
-
data_unit_id (int): The ID of the data unit.
|
|
295
|
-
status (UploadStatus): The status of the data unit.
|
|
296
|
-
data_unit_meta (dict | None): The metadata of the data unit.
|
|
297
|
-
"""
|
|
298
|
-
now = datetime.now().isoformat()
|
|
299
|
-
self.log(
|
|
300
|
-
'upload_data_unit',
|
|
301
|
-
self.DataUnitLog(
|
|
302
|
-
data_unit_id=data_unit_id, status=status.value, created=now, data_unit_meta=data_unit_meta
|
|
303
|
-
).model_dump(),
|
|
304
|
-
)
|
|
305
|
-
|
|
306
|
-
def log_task(self, task_id: int, status: UploadStatus):
|
|
307
|
-
"""Upload task log.
|
|
308
|
-
|
|
309
|
-
Args:
|
|
310
|
-
task_id (int): The ID of the task.
|
|
311
|
-
status (UploadStatus): The status of the task.
|
|
312
|
-
"""
|
|
313
|
-
now = datetime.now().isoformat()
|
|
314
|
-
self.log('upload_task', self.TaskLog(task_id=task_id, status=status.value, created=now).model_dump())
|
|
315
|
-
|
|
316
|
-
def log_metrics(self, record: MetricsRecord, category: str):
|
|
317
|
-
"""Log upload metrics.
|
|
318
|
-
Args:
|
|
319
|
-
record (MetricsRecord): The metrics record to log.
|
|
320
|
-
category (str): The category of the metrics.
|
|
321
|
-
"""
|
|
322
|
-
record = self.MetricsRecord.model_validate(record)
|
|
323
|
-
self.set_metrics(value=record.model_dump(), category=category)
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
class ExcelSecurityError(Exception):
|
|
327
|
-
"""Custom exception for Excel security validation errors."""
|
|
328
|
-
|
|
329
|
-
pass
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
class ExcelParsingError(Exception):
|
|
333
|
-
"""Custom exception for Excel parsing errors."""
|
|
334
|
-
|
|
335
|
-
pass
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
class ExcelSecurityConfig:
|
|
339
|
-
"""Configuration class for Excel security settings."""
|
|
340
|
-
|
|
341
|
-
def __init__(self):
|
|
342
|
-
# File size limits
|
|
343
|
-
self.MAX_FILE_SIZE_MB = int(os.getenv('EXCEL_MAX_FILE_SIZE_MB', '10'))
|
|
344
|
-
self.MAX_FILE_SIZE_BYTES = self.MAX_FILE_SIZE_MB * 1024 * 1024
|
|
345
|
-
|
|
346
|
-
# Memory limits
|
|
347
|
-
self.MAX_MEMORY_USAGE_MB = int(os.getenv('EXCEL_MAX_MEMORY_MB', '30'))
|
|
348
|
-
self.MAX_MEMORY_USAGE_BYTES = self.MAX_MEMORY_USAGE_MB * 1024 * 1024
|
|
349
|
-
|
|
350
|
-
# Content limits
|
|
351
|
-
self.MAX_ROWS = int(os.getenv('EXCEL_MAX_ROWS', '10000'))
|
|
352
|
-
self.MAX_COLUMNS = int(os.getenv('EXCEL_MAX_COLUMNS', '50'))
|
|
353
|
-
|
|
354
|
-
# String length limits
|
|
355
|
-
self.MAX_FILENAME_LENGTH = int(os.getenv('EXCEL_MAX_FILENAME_LENGTH', '255'))
|
|
356
|
-
self.MAX_COLUMN_NAME_LENGTH = int(os.getenv('EXCEL_MAX_COLUMN_NAME_LENGTH', '100'))
|
|
357
|
-
self.MAX_METADATA_VALUE_LENGTH = int(os.getenv('EXCEL_MAX_METADATA_VALUE_LENGTH', '1000'))
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
class ExcelMetadataUtils:
|
|
361
|
-
"""Utility class for Excel metadata processing with shared validation logic."""
|
|
362
|
-
|
|
363
|
-
def __init__(self, config: ExcelSecurityConfig):
|
|
364
|
-
self.config = config
|
|
365
|
-
|
|
366
|
-
def validate_and_truncate_string(self, value: str, max_length: int) -> str:
|
|
367
|
-
"""Validate and truncate string to specified maximum length.
|
|
368
|
-
|
|
369
|
-
Args:
|
|
370
|
-
value: String value to validate and truncate
|
|
371
|
-
max_length: Maximum allowed length
|
|
372
|
-
|
|
373
|
-
Returns:
|
|
374
|
-
str: Validated and potentially truncated string
|
|
375
|
-
"""
|
|
376
|
-
if not isinstance(value, str):
|
|
377
|
-
value = str(value)
|
|
378
|
-
|
|
379
|
-
value = value.strip()
|
|
380
|
-
|
|
381
|
-
if len(value) > max_length:
|
|
382
|
-
return value[:max_length]
|
|
383
|
-
|
|
384
|
-
return value
|
|
385
|
-
|
|
386
|
-
def is_valid_filename_length(self, filename: str) -> bool:
|
|
387
|
-
"""Check if filename length is within acceptable limits.
|
|
388
|
-
|
|
389
|
-
Args:
|
|
390
|
-
filename: Filename to check
|
|
391
|
-
|
|
392
|
-
Returns:
|
|
393
|
-
bool: True if filename length is acceptable
|
|
394
|
-
"""
|
|
395
|
-
return len(filename.strip()) <= self.config.MAX_FILENAME_LENGTH
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
class UploadParams(BaseModel):
|
|
399
|
-
"""Upload action parameters.
|
|
400
|
-
|
|
401
|
-
This class defines all configuration parameters for the upload action, including
|
|
402
|
-
advanced asynchronous upload capabilities for improved performance.
|
|
403
|
-
|
|
404
|
-
Args:
|
|
405
|
-
name (str): The name of the action.
|
|
406
|
-
description (str | None): The description of the action.
|
|
407
|
-
checkpoint (int | None): The checkpoint of the action.
|
|
408
|
-
path (str): The path of the action.
|
|
409
|
-
storage (int): The storage of the action.
|
|
410
|
-
collection (int): The collection of the action.
|
|
411
|
-
project (int | None): The project of the action.
|
|
412
|
-
excel_metadata_path (str | None): Path to excel file containing metadata.
|
|
413
|
-
Defaults to 'meta.xlsx' or 'meta.xls' in the path directory.
|
|
414
|
-
is_recursive (bool): Enable recursive file discovery in subdirectories. Defaults to False.
|
|
415
|
-
use_async_upload (bool): Enable asynchronous upload data file processing for improved performance.
|
|
416
|
-
Defaults to True.
|
|
417
|
-
max_file_size_mb (int): The maximum file size not using chunked_upload in MB. Defaults to 50MB.
|
|
418
|
-
creating_data_unit_batch_size (int): The batch size for creating data units. Defaults to 100.
|
|
419
|
-
extra_params (dict | None): Extra parameters for the action.
|
|
420
|
-
Example: {"include_metadata": True, "compression": "gzip"}
|
|
421
|
-
|
|
422
|
-
Note:
|
|
423
|
-
Async upload requires plugin developers to implement handle_upload_files_async()
|
|
424
|
-
method for maximum benefit. Default implementation provides compatibility
|
|
425
|
-
but limited performance improvement.
|
|
426
|
-
"""
|
|
427
|
-
|
|
428
|
-
name: Annotated[str, AfterValidator(non_blank)]
|
|
429
|
-
description: str | None
|
|
430
|
-
path: str
|
|
431
|
-
storage: int
|
|
432
|
-
collection: int
|
|
433
|
-
project: int | None
|
|
434
|
-
excel_metadata_path: str | None = None
|
|
435
|
-
is_recursive: bool = True
|
|
436
|
-
max_file_size_mb: int = 50
|
|
437
|
-
creating_data_unit_batch_size: int = 1
|
|
438
|
-
use_async_upload: bool = True
|
|
439
|
-
extra_params: dict | None = None
|
|
440
|
-
|
|
441
|
-
@field_validator('storage', mode='before')
|
|
442
|
-
@classmethod
|
|
443
|
-
def check_storage_exists(cls, value: str, info) -> str:
|
|
444
|
-
"""Validate synapse-backend storage exists.
|
|
445
|
-
|
|
446
|
-
TODO: Need to define validation method naming convention.
|
|
447
|
-
TODO: Need to make validation method reusable.
|
|
448
|
-
"""
|
|
449
|
-
action = info.context['action']
|
|
450
|
-
client = action.client
|
|
451
|
-
try:
|
|
452
|
-
client.get_storage(value)
|
|
453
|
-
except ClientError:
|
|
454
|
-
raise PydanticCustomError('client_error', _('Error occurred while checking storage exists.'))
|
|
455
|
-
return value
|
|
456
|
-
|
|
457
|
-
@field_validator('collection', mode='before')
|
|
458
|
-
@classmethod
|
|
459
|
-
def check_collection_exists(cls, value: str, info) -> str:
|
|
460
|
-
"""Validate synapse-backend collection exists."""
|
|
461
|
-
action = info.context['action']
|
|
462
|
-
client = action.client
|
|
463
|
-
try:
|
|
464
|
-
client.get_data_collection(value)
|
|
465
|
-
except ClientError:
|
|
466
|
-
raise PydanticCustomError('client_error', _('Error occurred while checking collection exists.'))
|
|
467
|
-
return value
|
|
468
|
-
|
|
469
|
-
@field_validator('project', mode='before')
|
|
470
|
-
@classmethod
|
|
471
|
-
def check_project_exists(cls, value: str, info) -> str:
|
|
472
|
-
"""Validate synapse-backend project exists."""
|
|
473
|
-
if not value:
|
|
474
|
-
return value
|
|
475
|
-
|
|
476
|
-
action = info.context['action']
|
|
477
|
-
client = action.client
|
|
478
|
-
try:
|
|
479
|
-
client.get_project(value)
|
|
480
|
-
except ClientError:
|
|
481
|
-
raise PydanticCustomError('client_error', _('Error occurred while checking project exists.'))
|
|
482
|
-
return value
|
|
483
|
-
|
|
484
|
-
@field_validator('excel_metadata_path', mode='before')
|
|
485
|
-
@classmethod
|
|
486
|
-
def check_excel_metadata_path(cls, value: str, info) -> str:
|
|
487
|
-
"""Validate excel metadata file exists and is secure if provided.
|
|
488
|
-
|
|
489
|
-
This validator performs comprehensive security checks including:
|
|
490
|
-
- File existence and format validation
|
|
491
|
-
- File size limits (max 10MB)
|
|
492
|
-
- Basic security checks for file content
|
|
493
|
-
|
|
494
|
-
Args:
|
|
495
|
-
value: The excel file path to validate
|
|
496
|
-
info: Validation context information
|
|
497
|
-
|
|
498
|
-
Returns:
|
|
499
|
-
str: The validated file path
|
|
500
|
-
|
|
501
|
-
Raises:
|
|
502
|
-
PydanticCustomError: If validation fails
|
|
503
|
-
"""
|
|
504
|
-
if not value:
|
|
505
|
-
return value
|
|
506
|
-
|
|
507
|
-
excel_path = Path(value)
|
|
508
|
-
|
|
509
|
-
# Check file existence
|
|
510
|
-
if not excel_path.exists():
|
|
511
|
-
raise PydanticCustomError('file_not_found', _('Excel metadata file not found.'))
|
|
512
|
-
|
|
513
|
-
# Check file extension
|
|
514
|
-
if excel_path.suffix.lower() not in ['.xlsx', '.xls']:
|
|
515
|
-
raise PydanticCustomError('invalid_file_type', _('Excel metadata file must be .xlsx or .xls format.'))
|
|
516
|
-
|
|
517
|
-
# Security check: file size limit
|
|
518
|
-
file_size = excel_path.stat().st_size
|
|
519
|
-
excel_config = ExcelSecurityConfig()
|
|
520
|
-
if file_size > excel_config.MAX_FILE_SIZE_BYTES:
|
|
521
|
-
raise PydanticCustomError(
|
|
522
|
-
'file_too_large',
|
|
523
|
-
_('Excel metadata file is too large. Maximum size is {}MB.').format(excel_config.MAX_FILE_SIZE_MB),
|
|
524
|
-
)
|
|
525
|
-
|
|
526
|
-
# Basic security check: ensure file is readable and not corrupted
|
|
527
|
-
try:
|
|
528
|
-
with open(excel_path, 'rb') as f:
|
|
529
|
-
# Read first few bytes to check if it's a valid Excel file
|
|
530
|
-
header = f.read(8)
|
|
531
|
-
if not header:
|
|
532
|
-
raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be empty.'))
|
|
533
|
-
|
|
534
|
-
# Check for valid Excel file signatures
|
|
535
|
-
if excel_path.suffix.lower() == '.xlsx':
|
|
536
|
-
# XLSX files start with PK (ZIP signature)
|
|
537
|
-
if not header.startswith(b'PK'):
|
|
538
|
-
raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be corrupted.'))
|
|
539
|
-
elif excel_path.suffix.lower() == '.xls':
|
|
540
|
-
# XLS files have specific OLE signatures
|
|
541
|
-
if not (header.startswith(b'\xd0\xcf\x11\xe0') or header.startswith(b'\x09\x08')):
|
|
542
|
-
raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be corrupted.'))
|
|
543
|
-
|
|
544
|
-
except (OSError, IOError):
|
|
545
|
-
raise PydanticCustomError('file_access_error', _('Cannot access Excel metadata file.'))
|
|
546
|
-
|
|
547
|
-
return value
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
@register_action
|
|
551
|
-
class UploadAction(Action):
|
|
552
|
-
"""Upload action class.
|
|
553
|
-
|
|
554
|
-
Attrs:
|
|
555
|
-
name (str): The name of the action.
|
|
556
|
-
category (PluginCategory): The category of the action.
|
|
557
|
-
method (RunMethod): The method to run of the action.
|
|
558
|
-
|
|
559
|
-
Progress Categories:
|
|
560
|
-
analyze_collection: The progress category for the analyze collection process.
|
|
561
|
-
data_file_upload: The progress category for the upload process.
|
|
562
|
-
generate_data_units: The progress category for the generate data units process.
|
|
563
|
-
|
|
564
|
-
Metrics Categories:
|
|
565
|
-
data_file: The metrics category for the data file.
|
|
566
|
-
data_unit: The metrics category for the data unit.
|
|
567
|
-
"""
|
|
568
|
-
|
|
569
|
-
name = 'upload'
|
|
570
|
-
category = PluginCategory.UPLOAD
|
|
571
|
-
method = RunMethod.JOB
|
|
572
|
-
run_class = UploadRun
|
|
573
|
-
progress_categories = {
|
|
574
|
-
'analyze_collection': {
|
|
575
|
-
'proportion': 2,
|
|
576
|
-
},
|
|
577
|
-
'upload_data_files': {
|
|
578
|
-
'proportion': 38,
|
|
579
|
-
},
|
|
580
|
-
'generate_data_units': {
|
|
581
|
-
'proportion': 60,
|
|
582
|
-
},
|
|
583
|
-
}
|
|
584
|
-
metrics_categories = {
|
|
585
|
-
'data_files': {
|
|
586
|
-
'stand_by': 0,
|
|
587
|
-
'failed': 0,
|
|
588
|
-
'success': 0,
|
|
589
|
-
},
|
|
590
|
-
'data_units': {
|
|
591
|
-
'stand_by': 0,
|
|
592
|
-
'failed': 0,
|
|
593
|
-
'success': 0,
|
|
594
|
-
},
|
|
595
|
-
}
|
|
596
|
-
|
|
597
|
-
def __init__(self, *args, **kwargs):
|
|
598
|
-
super().__init__(*args, **kwargs)
|
|
599
|
-
self.excel_config = ExcelSecurityConfig()
|
|
600
|
-
self.excel_utils = ExcelMetadataUtils(self.excel_config)
|
|
601
|
-
|
|
602
|
-
def get_uploader(self, path, file_specification, organized_files, params: Dict = None):
|
|
603
|
-
"""Get uploader from entrypoint."""
|
|
604
|
-
return self.entrypoint(
|
|
605
|
-
self.run, path, file_specification, organized_files, extra_params=params.get('extra_params')
|
|
606
|
-
)
|
|
607
|
-
|
|
608
|
-
def _discover_files_recursive(self, dir_path: Path) -> List[Path]:
|
|
609
|
-
"""Discover files recursively in a directory."""
|
|
610
|
-
return [file_path for file_path in dir_path.rglob('*') if file_path.is_file()]
|
|
611
|
-
|
|
612
|
-
def _discover_files_non_recursive(self, dir_path: Path) -> List[Path]:
|
|
613
|
-
"""Discover files in a directory (non-recursive)."""
|
|
614
|
-
return [file_path for file_path in dir_path.glob('*') if file_path.is_file()]
|
|
615
|
-
|
|
616
|
-
def _validate_excel_security(self, excel_path: Path) -> None:
|
|
617
|
-
"""Validate Excel file security constraints.
|
|
618
|
-
|
|
619
|
-
Performs comprehensive security validation including:
|
|
620
|
-
- File size limits
|
|
621
|
-
- Memory usage constraints
|
|
622
|
-
- Basic malicious content detection
|
|
623
|
-
|
|
624
|
-
Args:
|
|
625
|
-
excel_path: Path to the Excel file to validate
|
|
626
|
-
|
|
627
|
-
Raises:
|
|
628
|
-
ExcelSecurityError: If security validation fails
|
|
629
|
-
"""
|
|
630
|
-
# File size check (already done in validator, but double-check)
|
|
631
|
-
file_size = excel_path.stat().st_size
|
|
632
|
-
if file_size > self.excel_config.MAX_FILE_SIZE_BYTES:
|
|
633
|
-
raise ExcelSecurityError(
|
|
634
|
-
f'Excel file too large: {file_size} bytes (max: {self.excel_config.MAX_FILE_SIZE_BYTES})'
|
|
635
|
-
)
|
|
636
|
-
|
|
637
|
-
# Memory usage estimation (rough estimate: file_size * 3 for processing)
|
|
638
|
-
estimated_memory = file_size * 3
|
|
639
|
-
if estimated_memory > self.excel_config.MAX_MEMORY_USAGE_BYTES:
|
|
640
|
-
raise ExcelSecurityError(
|
|
641
|
-
f'Excel file may consume too much memory: ~{estimated_memory} bytes '
|
|
642
|
-
f'(max: {self.excel_config.MAX_MEMORY_USAGE_BYTES})'
|
|
643
|
-
)
|
|
644
|
-
|
|
645
|
-
def _prepare_excel_file(self, excel_path: Path) -> BytesIO:
|
|
646
|
-
"""Prepare Excel file for reading with security validation.
|
|
647
|
-
|
|
648
|
-
Args:
|
|
649
|
-
excel_path: Path to the Excel file
|
|
650
|
-
|
|
651
|
-
Returns:
|
|
652
|
-
BytesIO: Excel file stream ready for reading
|
|
653
|
-
|
|
654
|
-
Raises:
|
|
655
|
-
ExcelSecurityError: If security validation fails
|
|
656
|
-
"""
|
|
657
|
-
self._validate_excel_security(excel_path)
|
|
658
|
-
excel_bytes = excel_path.read_bytes()
|
|
659
|
-
return BytesIO(excel_bytes)
|
|
660
|
-
|
|
661
|
-
def _process_excel_headers(self, headers: tuple) -> tuple:
|
|
662
|
-
"""Process and validate Excel headers.
|
|
663
|
-
|
|
664
|
-
Args:
|
|
665
|
-
headers: Raw header tuple from Excel
|
|
666
|
-
|
|
667
|
-
Returns:
|
|
668
|
-
tuple: Validated headers
|
|
669
|
-
|
|
670
|
-
Raises:
|
|
671
|
-
ExcelParsingError: If headers are invalid
|
|
672
|
-
"""
|
|
673
|
-
if len(headers) < 2:
|
|
674
|
-
raise ExcelParsingError('Excel file must have at least 2 columns (file name and metadata)')
|
|
675
|
-
self._validate_excel_content(headers, 0) # Validate column count
|
|
676
|
-
return headers
|
|
677
|
-
|
|
678
|
-
def _process_excel_data_row(self, row: tuple, headers: tuple) -> Optional[Dict[str, Any]]:
|
|
679
|
-
"""Process a single Excel data row.
|
|
680
|
-
|
|
681
|
-
Args:
|
|
682
|
-
row: Raw row data from Excel
|
|
683
|
-
headers: Excel headers
|
|
684
|
-
|
|
685
|
-
Returns:
|
|
686
|
-
Optional[Dict[str, Any]]: Processed row data or None if row should be skipped
|
|
687
|
-
"""
|
|
688
|
-
# Skip empty rows
|
|
689
|
-
if not row[0] or str(row[0]).strip() == '':
|
|
690
|
-
return None
|
|
691
|
-
|
|
692
|
-
file_name = str(row[0]).strip()
|
|
693
|
-
if not self.excel_utils.is_valid_filename_length(file_name):
|
|
694
|
-
self.run.log_message_with_code('FILENAME_TOO_LONG', file_name[:50])
|
|
695
|
-
return None
|
|
696
|
-
|
|
697
|
-
# Create metadata dictionary from remaining columns
|
|
698
|
-
file_metadata: Dict[str, Any] = {}
|
|
699
|
-
for i, value in enumerate(row[1:], start=1):
|
|
700
|
-
if value is not None and i < len(headers):
|
|
701
|
-
header_value = headers[i]
|
|
702
|
-
column_name = str(header_value).strip() if header_value is not None else f'column_{i}'
|
|
703
|
-
|
|
704
|
-
# Validate and truncate column name and value
|
|
705
|
-
column_name = self.excel_utils.validate_and_truncate_string(
|
|
706
|
-
column_name, self.excel_config.MAX_COLUMN_NAME_LENGTH
|
|
707
|
-
)
|
|
708
|
-
str_value = self.excel_utils.validate_and_truncate_string(
|
|
709
|
-
str(value), self.excel_config.MAX_METADATA_VALUE_LENGTH
|
|
710
|
-
)
|
|
711
|
-
file_metadata[column_name] = str_value
|
|
712
|
-
|
|
713
|
-
return {file_name: file_metadata} if file_metadata else None
|
|
714
|
-
|
|
715
|
-
def _process_excel_worksheet(self, worksheet) -> Dict[str, Dict[str, Any]]:
|
|
716
|
-
"""Process Excel worksheet and extract metadata.
|
|
717
|
-
|
|
718
|
-
Args:
|
|
719
|
-
worksheet: openpyxl worksheet object
|
|
720
|
-
|
|
721
|
-
Returns:
|
|
722
|
-
Dict[str, Dict[str, Any]]: Extracted metadata dictionary
|
|
723
|
-
|
|
724
|
-
Raises:
|
|
725
|
-
ExcelParsingError: If worksheet processing fails
|
|
726
|
-
"""
|
|
727
|
-
if worksheet is None:
|
|
728
|
-
raise ExcelParsingError('Excel file has no active worksheet')
|
|
729
|
-
|
|
730
|
-
metadata_dict: Dict[str, Dict[str, Any]] = {}
|
|
731
|
-
headers: Optional[tuple] = None
|
|
732
|
-
data_row_count = 0
|
|
733
|
-
validation_interval = getattr(self.excel_config, 'VALIDATION_CHECK_INTERVAL', 1000)
|
|
734
|
-
|
|
735
|
-
# Process rows one by one for memory efficiency
|
|
736
|
-
for row_idx, row in enumerate(worksheet.iter_rows(values_only=True)):
|
|
737
|
-
if not row or all(cell is None or str(cell).strip() == '' for cell in row):
|
|
738
|
-
continue # Skip completely empty rows
|
|
739
|
-
|
|
740
|
-
if row_idx == 0: # Header row
|
|
741
|
-
headers = self._process_excel_headers(row)
|
|
742
|
-
continue
|
|
743
|
-
|
|
744
|
-
# Data rows
|
|
745
|
-
if headers is None:
|
|
746
|
-
raise ExcelParsingError('Excel file missing header row')
|
|
747
|
-
|
|
748
|
-
data_row_count += 1
|
|
749
|
-
|
|
750
|
-
# Validate row count periodically
|
|
751
|
-
if data_row_count % validation_interval == 0:
|
|
752
|
-
self._validate_excel_content(headers, data_row_count)
|
|
753
|
-
|
|
754
|
-
# Process individual row
|
|
755
|
-
row_result = self._process_excel_data_row(row, headers)
|
|
756
|
-
if row_result:
|
|
757
|
-
metadata_dict.update(row_result)
|
|
758
|
-
|
|
759
|
-
# Final validation
|
|
760
|
-
self._validate_excel_content(headers or (), data_row_count)
|
|
761
|
-
|
|
762
|
-
return metadata_dict
|
|
763
|
-
|
|
764
|
-
def _validate_excel_content(self, headers: tuple, row_count: int) -> None:
|
|
765
|
-
"""Validate Excel content constraints.
|
|
766
|
-
|
|
767
|
-
Args:
|
|
768
|
-
headers: Tuple of header values from the first row
|
|
769
|
-
row_count: Total number of data rows processed
|
|
770
|
-
|
|
771
|
-
Raises:
|
|
772
|
-
ExcelParsingError: If content validation fails
|
|
773
|
-
"""
|
|
774
|
-
# Limit number of columns to prevent memory exhaustion
|
|
775
|
-
if len(headers) > self.excel_config.MAX_COLUMNS:
|
|
776
|
-
raise ExcelParsingError(f'Too many columns: {len(headers)} (max: {self.excel_config.MAX_COLUMNS})')
|
|
777
|
-
|
|
778
|
-
# Limit number of rows to prevent excessive processing
|
|
779
|
-
if row_count > self.excel_config.MAX_ROWS:
|
|
780
|
-
raise ExcelParsingError(f'Too many rows: {row_count} (max: {self.excel_config.MAX_ROWS})')
|
|
781
|
-
|
|
782
|
-
def _find_excel_metadata_file(self, pathlib_cwd: Path) -> Optional[Path]:
|
|
783
|
-
"""Find Excel metadata file in the directory.
|
|
784
|
-
|
|
785
|
-
Checks for meta.xlsx and meta.xls in the given directory.
|
|
786
|
-
|
|
787
|
-
Args:
|
|
788
|
-
pathlib_cwd (Path): The pathlib object representing the current working directory.
|
|
789
|
-
|
|
790
|
-
Returns:
|
|
791
|
-
Optional[Path]: Path to the Excel metadata file if found, None otherwise.
|
|
792
|
-
"""
|
|
793
|
-
# Check for xlsx first, then xls
|
|
794
|
-
for extension in ['.xlsx', '.xls']:
|
|
795
|
-
excel_path = pathlib_cwd / f'meta{extension}'
|
|
796
|
-
if excel_path.exists() and excel_path.is_file():
|
|
797
|
-
return excel_path
|
|
798
|
-
return None
|
|
799
|
-
|
|
800
|
-
def _read_excel_metadata(self, pathlib_cwd: Path) -> Dict[str, Dict[str, Any]]:
    """Read metadata from excel file with comprehensive security validation.

    This method orchestrates the Excel metadata reading process by delegating
    to specialized methods for each step: file discovery, security preparation,
    workbook loading, and worksheet parsing.

    Args:
        pathlib_cwd (Path): The pathlib object representing the current working directory.

    Returns:
        Dict[str, Dict[str, Any]]: Dictionary mapping file names to their metadata key-value pairs.
            Empty dict if no metadata is configured or if reading fails.

    Raises:
        ExcelSecurityError: If security validation fails (also mapped from MemoryError).
        ExcelParsingError: If Excel content is invalid or exceeds limits (also mapped
            from InvalidFileException, OS-level errors, and any unexpected exception).
    """
    excel_path = None

    # Check if user provided a specific excel_metadata_path
    excel_metadata_path = self.params.get('excel_metadata_path')
    if excel_metadata_path:
        excel_path = pathlib_cwd / excel_metadata_path
        if not excel_path.exists():
            # Explicitly-configured file missing: log and bail without raising.
            self.run.log_message_with_code('EXCEL_FILE_NOT_FOUND_PATH')
            return {}
    else:
        # Look for default meta.xlsx or meta.xls
        excel_path = self._find_excel_metadata_file(pathlib_cwd)
        if not excel_path:
            # No Excel metadata file found, return empty dict (not an error)
            return {}

    try:
        self.run.log_message_with_code('EXCEL_FILE_VALIDATION_STARTED')

        # Prepare Excel file with security validation
        excel_stream = self._prepare_excel_file(excel_path)

        # Load and process workbook.
        # read_only streams rows; data_only yields cached values instead of formulas.
        workbook = load_workbook(excel_stream, read_only=True, data_only=True)
        try:
            self.run.log_message_with_code('EXCEL_WORKBOOK_LOADED')
            return self._process_excel_worksheet(workbook.active)
        finally:
            # Always release the workbook handle, even when parsing raises.
            workbook.close()

    except ExcelSecurityError as e:
        # Re-raise security failures unchanged so callers can abort hard.
        self.run.log_message_with_code('EXCEL_SECURITY_VALIDATION_FAILED', str(e))
        raise
    except ExcelParsingError as e:
        self.run.log_message_with_code('EXCEL_PARSING_FAILED', str(e))
        raise
    except InvalidFileException as e:
        # openpyxl rejected the file; surface it as a parsing error.
        self.run.log_message_with_code('EXCEL_INVALID_FILE_FORMAT', str(e))
        raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
    except MemoryError:
        # Treat memory exhaustion as a security limit violation.
        self.run.log_message_with_code('EXCEL_FILE_TOO_LARGE')
        raise ExcelSecurityError('Excel file exceeds memory limits')
    except (OSError, IOError) as e:
        self.run.log_message_with_code('EXCEL_FILE_ACCESS_ERROR', str(e))
        raise ExcelParsingError(f'File access error: {str(e)}')
    except Exception as e:
        # Catch-all keeps the upload pipeline's error surface to two exception types.
        self.run.log_message_with_code('EXCEL_UNEXPECTED_ERROR', str(e))
        raise ExcelParsingError(f'Unexpected error: {str(e)}')
866
|
-
def start(self) -> Dict[str, Any]:
    """Start upload process.

    Pipeline: resolve storage/path -> read optional Excel metadata ->
    analyze collection spec -> organize files -> delegate to uploader ->
    validate -> upload (sync or async) -> create data units -> cleanup.

    Returns:
        Dict: The result of the upload process with 'uploaded_files_count'
            and 'generated_data_units_count' (empty on early Excel aborts).

    Raises:
        ActionError: If a required parameter is missing, validation fails,
            or any stage yields no output to continue with.
    """
    # Setup result dict early for error handling
    result: Dict[str, Any] = {}

    # Setup path object with path and storage.
    storage_id = self.params.get('storage')
    if storage_id is None:
        raise ActionError('Storage parameter is required')
    storage = self.client.get_storage(storage_id)

    path = self.params.get('path')
    if path is None:
        raise ActionError('Path parameter is required')
    pathlib_cwd = get_pathlib(storage, path)

    # Read excel metadata if configured or default file exists
    excel_metadata: Dict[str, Dict[str, Any]] = {}
    try:
        excel_metadata = self._read_excel_metadata(pathlib_cwd)
        if excel_metadata:
            self.run.log_message_with_code('EXCEL_METADATA_LOADED', len(excel_metadata))
    except ExcelSecurityError as e:
        # Security violations should stop the process entirely
        self.run.log_message_with_code('EXCEL_SECURITY_VIOLATION', str(e))
        return result
    except ExcelParsingError as e:
        # Parsing errors can be non-critical if user didn't explicitly provide Excel file
        if self.params.get('excel_metadata_path'):
            # User explicitly provided Excel file, treat as error
            self.run.log_message_with_code('EXCEL_PARSING_ERROR', str(e))
            return result
        else:
            # Default Excel file found but failed to parse, treat as warning and continue
            self.run.log_message_with_code('EXCEL_PARSING_ERROR', str(e))
            excel_metadata = {}

    # Analyze Collection file specifications to determine the data structure for upload.
    file_specification_template = self._analyze_collection()
    organized_files = self._organize_files(pathlib_cwd, file_specification_template, excel_metadata)

    # Initialize uploader.
    uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files, self.params)

    # Get organized files from the uploader (plugin developer's custom implementation)
    # or use the default organization method if uploader doesn't provide valid files
    organized_files = uploader.handle_upload_files()

    # Validate the organized files
    if not self._validate_organized_files(organized_files, file_specification_template):
        self.run.log_message_with_code('VALIDATION_FAILED')
        raise ActionError('Upload is aborted due to validation errors.')

    # Upload files to synapse-backend.
    if not organized_files:
        self.run.log_message_with_code('NO_FILES_FOUND')
        raise ActionError('Upload is aborted due to missing files.')
    # Choose upload method based on async parameter (async with concurrency 10 is the default)
    if self.params.get('use_async_upload', True):
        uploaded_files = self.run_async(self._upload_files_async(organized_files, 10))
    else:
        uploaded_files = self._upload_files(organized_files)
    result['uploaded_files_count'] = len(uploaded_files)

    # Generate data units for the uploaded data.
    if not uploaded_files:
        self.run.log_message_with_code('NO_FILES_UPLOADED')
        raise ActionError('Upload is aborted due to no uploaded files.')
    generated_data_units = self._generate_data_units(
        uploaded_files, self.params.get('creating_data_unit_batch_size', 1)
    )
    result['generated_data_units_count'] = len(generated_data_units)

    # Setup task with uploaded synapse-backend data units.
    if not generated_data_units:
        self.run.log_message_with_code('NO_DATA_UNITS_GENERATED')
        raise ActionError('Upload is aborted due to no generated data units.')

    # Clean up if temp dir exists
    self._cleanup_temp_directory()

    self.run.log_message_with_code('IMPORT_COMPLETED')
    return result
954
|
-
def _analyze_collection(self) -> Dict[str, Any]:
|
|
955
|
-
"""Analyze Synapse Collection Specifications.
|
|
956
|
-
|
|
957
|
-
Returns:
|
|
958
|
-
Dict: The file specifications of the collection.
|
|
959
|
-
"""
|
|
960
|
-
|
|
961
|
-
# Initialize progress
|
|
962
|
-
self.run.set_progress(0, 2, category='analyze_collection')
|
|
963
|
-
|
|
964
|
-
collection_id = self.params.get('data_collection')
|
|
965
|
-
if collection_id is None:
|
|
966
|
-
raise ActionError('Data collection parameter is required')
|
|
967
|
-
self.run.set_progress(1, 2, category='analyze_collection')
|
|
968
|
-
|
|
969
|
-
collection = self.run.client.get_data_collection(collection_id)
|
|
970
|
-
self.run.set_progress(2, 2, category='analyze_collection')
|
|
971
|
-
|
|
972
|
-
return collection['file_specifications']
|
|
973
|
-
|
|
974
|
-
def _upload_files(self, organized_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Upload organized files to synapse-backend one at a time.

    Args:
        organized_files: Datasets produced by file organization.

    Returns:
        List[Dict[str, Any]]: Backend records of the successfully uploaded files.

    Raises:
        ActionError: If the data collection parameter is missing.
    """
    total = len(organized_files)
    self.run.set_progress(0, total, category='upload_data_files')
    self.run.log_message_with_code('UPLOADING_DATA_FILES')

    backend = self.run.client
    collection_id = self.params.get('data_collection')
    if collection_id is None:
        raise ActionError('Data collection parameter is required')

    uploaded: List[Dict[str, Any]] = []
    succeeded = 0
    failed = 0

    # Publish zeroed metrics before the first upload attempt.
    self._update_metrics(total, succeeded, failed, 'data_files')

    for position, item in enumerate(organized_files, start=1):
        try:
            # Large files are routed through the chunked upload path.
            chunked = self._requires_chunked_upload(item)
            record = backend.upload_data_file(item, collection_id, chunked)
            self.run.log_data_file(item, UploadStatus.SUCCESS)
            succeeded += 1
            uploaded.append(record)
        except Exception as exc:
            # A single failed file does not stop the rest of the batch.
            self.run.log_data_file(item, UploadStatus.FAILED)
            self.run.log_message_with_code('FILE_UPLOAD_FAILED', str(exc))
            failed += 1

        self._update_metrics(total, succeeded, failed, 'data_files')
        self.run.set_progress(position, total, category='upload_data_files')

    # Mark the stage complete even when the input list was empty.
    self.run.set_progress(total, total, category='upload_data_files')

    return uploaded
1019
|
-
def run_async(self, coro: Awaitable[T]) -> T:
    """Execute *coro* to completion, whether or not an event loop is running.

    If called from inside a running event loop, the coroutine is handed to
    a dedicated worker thread that owns its own loop; otherwise
    ``asyncio.run`` is used directly. Both paths let ``asyncio.run`` manage
    loop creation and teardown, avoiding event-loop resource leaks.

    Args:
        coro: The coroutine to execute.

    Returns:
        The result of the coroutine execution.
    """
    import concurrent.futures

    try:
        # Raises RuntimeError when no loop is running in this thread.
        asyncio.get_running_loop()
    except RuntimeError:
        # No active loop: safe to drive the coroutine directly.
        return asyncio.run(coro)

    # A loop is already running here; asyncio.run() would raise
    # "cannot be called from a running event loop", so run it on a
    # single-worker thread with its own fresh loop instead.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()
1052
|
-
async def _upload_files_async(
    self, organized_files: List[Dict[str, Any]], max_concurrent: int = 10
) -> List[Dict[str, Any]]:
    """Upload files to synapse-backend asynchronously with concurrency control.

    Args:
        organized_files: Datasets produced by file organization.
        max_concurrent: Maximum number of uploads in flight at once.

    Returns:
        List[Dict[str, Any]]: Backend records of the successfully uploaded files.

    Raises:
        ActionError: If the data collection parameter is missing.
    """
    # Initialize progress
    organized_files_count = len(organized_files)
    self.run.set_progress(0, organized_files_count, category='upload_data_files')
    self.run.log_message_with_code('UPLOADING_DATA_FILES')

    client = self.run.client
    collection_id = self.params.get('data_collection')
    if collection_id is None:
        raise ActionError('Data collection parameter is required')
    upload_result = []
    success_count = 0
    failed_count = 0

    # Initialize metrics
    self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')

    # Control concurrency with semaphore
    semaphore = asyncio.Semaphore(max_concurrent)

    async def upload_single_file(organized_file):
        # Each upload returns a status dict instead of raising, so one
        # failure never cancels the sibling tasks.
        async with semaphore:
            loop = asyncio.get_event_loop()
            try:
                # Determine if chunked upload should be used based on file size
                use_chunked_upload = self._requires_chunked_upload(organized_file)
                # Run sync upload_data_file in thread pool
                uploaded_data_file = await loop.run_in_executor(
                    None, lambda: client.upload_data_file(organized_file, collection_id, use_chunked_upload)
                )
                self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
                return {'status': 'success', 'result': uploaded_data_file}
            except ClientError as e:
                # Handle API client errors (network, authentication, server errors)
                self.run.log_data_file(organized_file, UploadStatus.FAILED)
                self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Client error: {str(e)}')
                return {'status': 'failed', 'error': str(e), 'error_type': 'client_error', 'retryable': True}
            except (OSError, IOError) as e:
                # Handle file system errors (file not found, permissions, disk full)
                self.run.log_data_file(organized_file, UploadStatus.FAILED)
                self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'File system error: {str(e)}')
                return {'status': 'failed', 'error': str(e), 'error_type': 'file_error', 'retryable': False}
            except MemoryError as e:
                # Handle out of memory errors (large files)
                self.run.log_data_file(organized_file, UploadStatus.FAILED)
                self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Memory error (file too large): {str(e)}')
                return {'status': 'failed', 'error': str(e), 'error_type': 'memory_error', 'retryable': False}
            except asyncio.TimeoutError as e:
                # Handle timeout errors (slow network, large files)
                self.run.log_data_file(organized_file, UploadStatus.FAILED)
                self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Upload timeout: {str(e)}')
                return {'status': 'failed', 'error': str(e), 'error_type': 'timeout_error', 'retryable': True}
            except ValueError as e:
                # Handle data validation errors (invalid file format, metadata issues)
                self.run.log_data_file(organized_file, UploadStatus.FAILED)
                self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Data validation error: {str(e)}')
                return {'status': 'failed', 'error': str(e), 'error_type': 'validation_error', 'retryable': False}
            except Exception as e:
                # Handle any remaining unexpected errors
                self.run.log_data_file(organized_file, UploadStatus.FAILED)
                self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Unexpected error: {str(e)}')
                return {'status': 'failed', 'error': str(e), 'error_type': 'unknown_error', 'retryable': False}

    # Create tasks for all files
    tasks = [upload_single_file(organized_file) for organized_file in organized_files]

    # Process files with progress updates; as_completed yields results in
    # completion order, not submission order.
    current_progress = 0
    for completed_task in asyncio.as_completed(tasks):
        result = await completed_task
        current_progress += 1

        if result['status'] == 'success':
            success_count += 1
            upload_result.append(result['result'])
        else:
            failed_count += 1

        # Update metrics and progress
        self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
        self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')

    # Finish progress
    self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')

    return upload_result
1142
|
-
def _generate_data_units(self, uploaded_files: List[Dict[str, Any]], batch_size: int) -> List[Dict[str, Any]]:
    """Create data units in batches for the uploaded files.

    TODO: make dynamic batch size depend on uploaded file sizes

    Args:
        uploaded_files: Backend records of the uploaded files.
        batch_size: Number of files sent per create_data_units request.

    Returns:
        List[Dict[str, Any]]: All successfully created data units.
    """
    # Initialize progress. All progress for this stage is reported in units
    # of files (the previous implementation initialized/finished against the
    # file count but reported loop progress as a batch index against the
    # batch count in the same category, making the bar jump between scales).
    upload_result_count = len(uploaded_files)
    self.run.set_progress(0, upload_result_count, category='generate_data_units')
    self.run.log_message_with_code('GENERATING_DATA_UNITS')

    client = self.run.client
    generated_data_units: List[Dict[str, Any]] = []
    processed_count = 0
    success_count = 0
    failed_count = 0

    batches = get_batched_list(uploaded_files, batch_size)

    # Initialize metrics before the first request.
    self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')

    for batch in batches:
        try:
            created_data_units = client.create_data_units(batch)
            success_count += len(created_data_units)
            # extend() keeps the result flat; the previous append-then-
            # sum(..., []) pattern was quadratic in the number of batches.
            generated_data_units.extend(created_data_units)
            for created_data_unit in created_data_units:
                self.run.log_data_unit(
                    created_data_unit['id'], UploadStatus.SUCCESS, data_unit_meta=created_data_unit.get('meta')
                )
        except Exception as e:
            # A failed batch marks every file in it as failed and continues.
            failed_count += len(batch)
            self.run.log_message_with_code('DATA_UNIT_BATCH_FAILED', str(e))
            for _ in batch:
                self.run.log_data_unit(None, UploadStatus.FAILED, data_unit_meta=None)

        processed_count += len(batch)
        self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')
        self.run.set_progress(processed_count, upload_result_count, category='generate_data_units')

    # Finish progress
    self.run.set_progress(upload_result_count, upload_result_count, category='generate_data_units')

    return generated_data_units
1191
|
-
def _validate_organized_files(
    self, organized_files: List[Dict[str, Any]], file_specification_template: Dict[str, Any]
) -> bool:
    """Check the organized datasets against the collection file specification.

    Args:
        organized_files: Datasets produced by the uploader.
        file_specification_template: Collection file specification.

    Returns:
        bool: True when every dataset satisfies the specification.
    """
    return FileSpecificationValidator(file_specification_template, organized_files).validate()
1198
|
-
def _organize_files(
    self,
    directory: Path,
    file_specification: List[Dict[str, Any]],
    excel_metadata: Optional[Dict[str, Dict[str, Any]]] = None,
) -> List[Dict[str, Any]]:
    """Organize files according to the file specification.

    Files are grouped into datasets by matching their stem (file name without
    extension) across type-named subdirectories (e.g. ``image_1/``, ``pcd_1/``),
    one subdirectory per specification entry.

    Args:
        directory (Path): Root directory containing files to organize.
        file_specification (List[Dict[str, Any]]): File specification list with metadata.
        excel_metadata (Optional[Dict[str, Dict[str, Any]]]): Dictionary mapping file names
            to their metadata key-value pairs from excel file.

    Returns:
        List[Dict[str, Any]]: List of dictionaries containing organized files with metadata.
            Empty when no type directories or no matching files are found.
    """
    organized_files: List[Dict[str, Any]] = []

    # Check for type-based directory structure (e.g., image_1/, pcd_1/)
    type_dirs: Dict[str, Path] = {}

    for spec in file_specification:
        spec_name = spec['name']
        spec_dir = directory / spec_name
        if spec_dir.exists() and spec_dir.is_dir():
            type_dirs[spec_name] = spec_dir

    if type_dirs:
        self.run.log_message_with_code('TYPE_DIRECTORIES_FOUND', list(type_dirs.keys()))

    # If type-based directories don't exist, exit early
    if not type_dirs:
        self.run.log_message_with_code('NO_TYPE_DIRECTORIES')
        return organized_files

    self.run.log_message_with_code('TYPE_STRUCTURE_DETECTED')
    self.run.log_message_with_code('FILE_ORGANIZATION_STARTED')

    # Collect and process files in a single pass
    dataset_files = {}
    required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]

    # Get recursive setting from params (defaults to recursive discovery)
    is_recursive = self.params.get('is_recursive', True)

    # Process all files from all type directories
    for spec_name, dir_path in type_dirs.items():
        # Use appropriate method based on recursive setting
        if is_recursive:
            files_list = self._discover_files_recursive(dir_path)
        else:
            files_list = self._discover_files_non_recursive(dir_path)

        for file_path in files_list:
            # Always use filename only for matching (stem drops the extension)
            file_name = file_path.stem

            # Initialize dataset entry if it doesn't exist
            if file_name not in dataset_files:
                dataset_files[file_name] = {}

            # Map this file to its specification (handle duplicates):
            # when the same stem appears more than once for one spec,
            # keep the most recently modified file.
            if spec_name not in dataset_files[file_name]:
                dataset_files[file_name][spec_name] = file_path
            else:
                existing_file = dataset_files[file_name][spec_name]
                if file_path.stat().st_mtime > existing_file.stat().st_mtime:
                    dataset_files[file_name][spec_name] = file_path

    if not dataset_files:
        self.run.log_message_with_code('NO_FILES_FOUND_WARNING')
        return organized_files

    self.run.log_message_with_code('FILES_DISCOVERED', len(dataset_files))

    # Organize datasets - check requirements and create metadata.
    # Sorting by stem makes the output order deterministic.
    for file_name, files_dict in sorted(dataset_files.items()):
        if all(req in files_dict for req in required_specs):
            # Get most common file extension across the dataset's files
            file_extensions = {}
            for file_path in files_dict.values():
                ext = file_path.suffix.lower()
                if ext:
                    file_extensions[ext] = file_extensions.get(ext, 0) + 1

            origin_file_extension = max(file_extensions.items(), key=lambda x: x[1])[0] if file_extensions else ''

            # Create metadata for this dataset
            meta_data: Dict[str, Any] = {
                'origin_file_stem': file_name,
                'origin_file_extension': origin_file_extension,
                'created_at': datetime.now().isoformat(),
            }

            # Add excel metadata if available (excel keys override nothing;
            # they are merged on top of the generated fields)
            if excel_metadata and file_name in excel_metadata:
                meta_data.update(excel_metadata[file_name])

            # Add the organized dataset
            organized_files.append({'files': files_dict, 'meta': meta_data})
        else:
            # Datasets missing a required spec are logged and skipped.
            missing = [req for req in required_specs if req not in files_dict]
            self.run.log_message_with_code('MISSING_REQUIRED_FILES', file_name, ', '.join(missing))

    return organized_files
1305
|
-
def _get_file_size_mb(self, file_path: Path) -> float:
|
|
1306
|
-
"""Get file size in MB.
|
|
1307
|
-
|
|
1308
|
-
Args:
|
|
1309
|
-
file_path (Path): Path to the file.
|
|
1310
|
-
|
|
1311
|
-
Returns:
|
|
1312
|
-
float: File size in MB.
|
|
1313
|
-
"""
|
|
1314
|
-
return file_path.stat().st_size / (1024 * 1024)
|
|
1315
|
-
|
|
1316
|
-
def _requires_chunked_upload(self, organized_file: Dict[str, Any]) -> bool:
|
|
1317
|
-
"""Determine if chunked upload is required based on file size threshold.
|
|
1318
|
-
|
|
1319
|
-
Args:
|
|
1320
|
-
organized_file (Dict[str, Any]): Organized file data with 'files' dict.
|
|
1321
|
-
|
|
1322
|
-
Returns:
|
|
1323
|
-
bool: True if any file exceeds the threshold, False otherwise.
|
|
1324
|
-
"""
|
|
1325
|
-
max_file_size_mb = self.params.get('max_file_size_mb', 50)
|
|
1326
|
-
for file_path in organized_file.get('files', {}).values():
|
|
1327
|
-
if isinstance(file_path, Path) and self._get_file_size_mb(file_path) > max_file_size_mb:
|
|
1328
|
-
return True
|
|
1329
|
-
return False
|
|
1330
|
-
|
|
1331
|
-
def _cleanup_temp_directory(self, temp_path: Optional[Path] = None) -> None:
|
|
1332
|
-
"""Clean up temporary directory.
|
|
1333
|
-
|
|
1334
|
-
Args:
|
|
1335
|
-
temp_path (Optional[Path]): Path to temporary directory.
|
|
1336
|
-
If None, uses default temp directory in current working directory.
|
|
1337
|
-
"""
|
|
1338
|
-
if temp_path is None:
|
|
1339
|
-
try:
|
|
1340
|
-
temp_path = Path(os.getcwd()) / 'temp'
|
|
1341
|
-
except (FileNotFoundError, OSError):
|
|
1342
|
-
return
|
|
1343
|
-
|
|
1344
|
-
if not temp_path.exists():
|
|
1345
|
-
return
|
|
1346
|
-
|
|
1347
|
-
shutil.rmtree(temp_path, ignore_errors=True)
|
|
1348
|
-
self.run.log_message(f'Cleaned up temporary directory: {temp_path}')
|
|
1349
|
-
|
|
1350
|
-
def _update_metrics(self, total_count: int, success_count: int, failed_count: int, category: str):
    """Publish stand-by/success/failure counts for a pipeline stage.

    Args:
        total_count (int): Total number of items to process.
        success_count (int): Number of successfully processed items.
        failed_count (int): Number of failed items.
        category (str): The category of the metrics.

    Raises:
        ValueError: If the run instance is not initialized.
    """
    if not self.run:
        raise ValueError('Run instance not properly initialized')

    # Narrow the type for static analysis; MetricsRecord lives on UploadRun.
    assert isinstance(self.run, UploadRun)

    pending = total_count - success_count - failed_count
    record = self.run.MetricsRecord(stand_by=pending, failed=failed_count, success=success_count)
    self.run.log_metrics(record, category)
|