synapse-sdk 1.0.0b22__py3-none-any.whl → 1.0.0b23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synapse-sdk might be problematic.
- synapse_sdk/devtools/docs/docs/plugins/upload-plugins.md +680 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/upload-plugins.md +897 -0
- synapse_sdk/devtools/docs/sidebars.ts +1 -0
- synapse_sdk/plugins/README.md +934 -0
- synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +20 -0
- synapse_sdk/plugins/categories/upload/actions/upload/action.py +623 -0
- synapse_sdk/plugins/categories/upload/actions/upload/enums.py +221 -0
- synapse_sdk/plugins/categories/upload/actions/upload/exceptions.py +36 -0
- synapse_sdk/plugins/categories/upload/actions/upload/models.py +149 -0
- synapse_sdk/plugins/categories/upload/actions/upload/run.py +178 -0
- synapse_sdk/plugins/categories/upload/actions/upload/utils.py +139 -0
- synapse_sdk/plugins/categories/upload/templates/plugin/upload.py +6 -1
- synapse_sdk/plugins/models.py +13 -7
- {synapse_sdk-1.0.0b22.dist-info → synapse_sdk-1.0.0b23.dist-info}/METADATA +1 -1
- {synapse_sdk-1.0.0b22.dist-info → synapse_sdk-1.0.0b23.dist-info}/RECORD +19 -10
- synapse_sdk/plugins/categories/upload/actions/upload.py +0 -1368
- {synapse_sdk-1.0.0b22.dist-info → synapse_sdk-1.0.0b23.dist-info}/WHEEL +0 -0
- {synapse_sdk-1.0.0b22.dist-info → synapse_sdk-1.0.0b23.dist-info}/entry_points.txt +0 -0
- {synapse_sdk-1.0.0b22.dist-info → synapse_sdk-1.0.0b23.dist-info}/licenses/LICENSE +0 -0
- {synapse_sdk-1.0.0b22.dist-info → synapse_sdk-1.0.0b23.dist-info}/top_level.txt +0 -0
synapse_sdk/plugins/categories/upload/actions/upload/__init__.py (new file)

@@ -0,0 +1,20 @@

```python
from .action import UploadAction
from .enums import LOG_MESSAGES, LogCode, UploadStatus
from .exceptions import ExcelParsingError, ExcelSecurityError
from .models import UploadParams
from .run import UploadRun
from .utils import ExcelMetadataUtils, ExcelSecurityConfig, PathAwareJSONEncoder

__all__ = [
    'UploadAction',
    'UploadRun',
    'UploadParams',
    'UploadStatus',
    'LogCode',
    'LOG_MESSAGES',
    'ExcelSecurityError',
    'ExcelParsingError',
    'PathAwareJSONEncoder',
    'ExcelSecurityConfig',
    'ExcelMetadataUtils',
]
```
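This `__init__.py` re-exports the public surface of the new `upload` subpackage, which replaces the single-module `upload.py` removed in this release (see the file list above). A minimal import sketch, assuming the wheel is installed as `synapse-sdk`:

```python
# Sketch only: the subpackage re-exports these names at its top level.
from synapse_sdk.plugins.categories.upload.actions.upload import (
    UploadAction,
    UploadParams,
    UploadStatus,
)

print(UploadAction.name)  # 'upload', per the class attribute defined in action.py below
```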
synapse_sdk/plugins/categories/upload/actions/upload/action.py (new file)

@@ -0,0 +1,623 @@

```python
import asyncio
import os
import shutil
from datetime import datetime
from io import BytesIO
from pathlib import Path
from typing import Any, Awaitable, Dict, List, Optional, TypeVar

from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException

from synapse_sdk.clients.exceptions import ClientError
from synapse_sdk.clients.utils import get_batched_list
from synapse_sdk.clients.validators.collections import FileSpecificationValidator
from synapse_sdk.plugins.categories.base import Action
from synapse_sdk.plugins.categories.decorators import register_action
from synapse_sdk.plugins.categories.upload.actions.upload.models import UploadParams
from synapse_sdk.plugins.enums import PluginCategory, RunMethod
from synapse_sdk.plugins.exceptions import ActionError
from synapse_sdk.utils.storage import get_pathlib

from .enums import LogCode, UploadStatus
from .exceptions import ExcelParsingError, ExcelSecurityError
from .run import UploadRun
from .utils import ExcelMetadataUtils, ExcelSecurityConfig

T = TypeVar('T')


@register_action
class UploadAction(Action):
    """Main upload action for processing and uploading files to storage.

    Handles file upload operations including validation, file discovery,
    Excel metadata processing, and data unit generation. Supports both
    synchronous and asynchronous upload processing with comprehensive
    progress tracking and error handling.

    Features:
        - Recursive directory scanning
        - Excel metadata validation and processing
        - Batch processing for large file sets
        - Type-based file organization
        - Security validation for Excel files
        - Progress tracking with detailed metrics
        - Comprehensive error logging

    Class Attributes:
        name (str): Action identifier ('upload')
        category (PluginCategory): UPLOAD category
        method (RunMethod): JOB execution method
        run_class (type): UploadRun for specialized logging
        params_model (type): UploadParams for parameter validation
        progress_categories (dict): Progress tracking configuration
        metrics_categories (dict): Metrics collection configuration

    Example:
        >>> action = UploadAction(
        ...     params={
        ...         'name': 'Data Upload',
        ...         'path': '/data/files',
        ...         'storage': 1,
        ...         'collection': 5
        ...     },
        ...     plugin_config=config
        ... )
        >>> result = action.run_action()
    """

    name = 'upload'
    category = PluginCategory.UPLOAD
    method = RunMethod.JOB
    run_class = UploadRun
    params_model = UploadParams
    progress_categories = {
        'analyze_collection': {
            'proportion': 2,
        },
        'upload_data_files': {
            'proportion': 38,
        },
        'generate_data_units': {
            'proportion': 60,
        },
    }
    metrics_categories = {
        'data_files': {
            'stand_by': 0,
            'failed': 0,
            'success': 0,
        },
        'data_units': {
            'stand_by': 0,
            'failed': 0,
            'success': 0,
        },
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.excel_config = ExcelSecurityConfig()
        self.excel_utils = ExcelMetadataUtils(self.excel_config)

    def get_uploader(self, path, file_specification, organized_files, params: Dict = {}):
        """Get uploader from entrypoint."""
        return self.entrypoint(
            self.run, path, file_specification, organized_files, extra_params=params.get('extra_params')
        )

    def _discover_files_recursive(self, dir_path: Path) -> List[Path]:
        return [file_path for file_path in dir_path.rglob('*') if file_path.is_file()]

    def _discover_files_non_recursive(self, dir_path: Path) -> List[Path]:
        return [file_path for file_path in dir_path.glob('*') if file_path.is_file()]
```
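Beyond the required `storage`, `path`, and `data_collection` values, the methods that follow read several optional knobs from `self.params` via `.get(...)` with defaults. A hypothetical params payload exercising those knobs is sketched below; the IDs are illustrative, and the authoritative schema is `UploadParams` in `models.py`, which is not shown in this diff.

```python
# Illustrative only; key names are taken from the self.params.get(...) calls in action.py.
params = {
    'name': 'Data Upload',
    'storage': 1,                          # storage id resolved in start()
    'path': '/data/files',                 # root directory scanned for spec-named sub-directories
    'data_collection': 5,                  # collection whose file_specifications drive organization
    'excel_metadata_path': 'meta.xlsx',    # optional; otherwise meta.xlsx / meta.xls is auto-detected
    'is_recursive': True,                  # recurse into spec directories (default True)
    'use_async_upload': True,              # use the semaphore-bounded async path (default True)
    'max_file_size_mb': 50,                # per-file threshold that switches to chunked upload
    'creating_data_unit_batch_size': 1,    # batch size passed to _generate_data_units
}
```

The Excel validation and parsing helpers come next in `action.py`.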
```python
    def _validate_excel_security(self, excel_path: Path) -> None:
        file_size = excel_path.stat().st_size
        if file_size > self.excel_config.MAX_FILE_SIZE_BYTES:
            raise ExcelSecurityError(
                f'Excel file too large: {file_size} bytes (max: {self.excel_config.MAX_FILE_SIZE_BYTES})'
            )

        estimated_memory = file_size * 3
        if estimated_memory > self.excel_config.MAX_MEMORY_USAGE_BYTES:
            raise ExcelSecurityError(
                f'Excel file may consume too much memory: ~{estimated_memory} bytes '
                f'(max: {self.excel_config.MAX_MEMORY_USAGE_BYTES})'
            )

    def _prepare_excel_file(self, excel_path: Path) -> BytesIO:
        self._validate_excel_security(excel_path)
        excel_bytes = excel_path.read_bytes()
        return BytesIO(excel_bytes)

    def _process_excel_headers(self, headers: tuple) -> tuple:
        if len(headers) < 2:
            raise ExcelParsingError('Excel file must have at least 2 columns (file name and metadata)')
        self._validate_excel_content(headers, 0)
        return headers

    def _process_excel_data_row(self, row: tuple, headers: tuple) -> Optional[Dict[str, Any]]:
        if not row[0] or str(row[0]).strip() == '':
            return None

        file_name = str(row[0]).strip()
        if not self.excel_utils.is_valid_filename_length(file_name):
            self.run.log_message_with_code(LogCode.FILENAME_TOO_LONG, file_name[:50])
            return None

        file_metadata: Dict[str, Any] = {}
        for i, value in enumerate(row[1:], start=1):
            if value is not None and i < len(headers):
                header_value = headers[i]
                column_name = str(header_value).strip() if header_value is not None else f'column_{i}'

                column_name = self.excel_utils.validate_and_truncate_string(
                    column_name, self.excel_config.MAX_COLUMN_NAME_LENGTH
                )
                str_value = self.excel_utils.validate_and_truncate_string(
                    str(value), self.excel_config.MAX_METADATA_VALUE_LENGTH
                )
                file_metadata[column_name] = str_value

        return {file_name: file_metadata} if file_metadata else None

    def _process_excel_worksheet(self, worksheet) -> Dict[str, Dict[str, Any]]:
        if worksheet is None:
            raise ExcelParsingError('Excel file has no active worksheet')

        metadata_dict: Dict[str, Dict[str, Any]] = {}
        headers: Optional[tuple] = None
        data_row_count = 0
        validation_interval = getattr(self.excel_config, 'VALIDATION_CHECK_INTERVAL', 1000)

        for row_idx, row in enumerate(worksheet.iter_rows(values_only=True)):
            if not row or all(cell is None or str(cell).strip() == '' for cell in row):
                continue

            if row_idx == 0:
                headers = self._process_excel_headers(row)
                continue

            if headers is None:
                raise ExcelParsingError('Excel file missing header row')

            data_row_count += 1

            if data_row_count % validation_interval == 0:
                self._validate_excel_content(headers, data_row_count)

            row_result = self._process_excel_data_row(row, headers)
            if row_result:
                metadata_dict.update(row_result)

        self._validate_excel_content(headers or (), data_row_count)

        return metadata_dict

    def _validate_excel_content(self, headers: tuple, row_count: int) -> None:
        if len(headers) > self.excel_config.MAX_COLUMNS:
            raise ExcelParsingError(f'Too many columns: {len(headers)} (max: {self.excel_config.MAX_COLUMNS})')

        if row_count > self.excel_config.MAX_ROWS:
            raise ExcelParsingError(f'Too many rows: {row_count} (max: {self.excel_config.MAX_ROWS})')

    def _find_excel_metadata_file(self, pathlib_cwd: Path) -> Optional[Path]:
        for extension in ['.xlsx', '.xls']:
            excel_path = pathlib_cwd / f'meta{extension}'
            if excel_path.exists() and excel_path.is_file():
                return excel_path
        return None

    def _read_excel_metadata(self, pathlib_cwd: Path) -> Dict[str, Dict[str, Any]]:
        excel_path = None

        excel_metadata_path = self.params.get('excel_metadata_path')
        if excel_metadata_path:
            excel_path = pathlib_cwd / excel_metadata_path
            if not excel_path.exists():
                self.run.log_message_with_code(LogCode.EXCEL_FILE_NOT_FOUND_PATH)
                return {}
        else:
            excel_path = self._find_excel_metadata_file(pathlib_cwd)
            if not excel_path:
                return {}

        try:
            self.run.log_message_with_code(LogCode.EXCEL_FILE_VALIDATION_STARTED)

            excel_stream = self._prepare_excel_file(excel_path)

            workbook = load_workbook(excel_stream, read_only=True, data_only=True)
            try:
                self.run.log_message_with_code(LogCode.EXCEL_WORKBOOK_LOADED)
                return self._process_excel_worksheet(workbook.active)
            finally:
                workbook.close()

        except ExcelSecurityError as e:
            self.run.log_message_with_code(LogCode.EXCEL_SECURITY_VALIDATION_FAILED, str(e))
            raise
        except ExcelParsingError as e:
            self.run.log_message_with_code(LogCode.EXCEL_PARSING_FAILED, str(e))
            raise
        except InvalidFileException as e:
            self.run.log_message_with_code(LogCode.EXCEL_INVALID_FILE_FORMAT, str(e))
            raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
        except MemoryError:
            self.run.log_message_with_code(LogCode.EXCEL_FILE_TOO_LARGE)
            raise ExcelSecurityError('Excel file exceeds memory limits')
        except (OSError, IOError) as e:
            self.run.log_message_with_code(LogCode.EXCEL_FILE_ACCESS_ERROR, str(e))
            raise ExcelParsingError(f'File access error: {str(e)}')
        except Exception as e:
            self.run.log_message_with_code(LogCode.EXCEL_UNEXPECTED_ERROR, str(e))
            raise ExcelParsingError(f'Unexpected error: {str(e)}')
```
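These helpers treat the first worksheet row as headers, the first column as the file stem, and every remaining non-empty cell as a string metadata value, with column names and values truncated to the configured limits. A small sketch of the expected `meta.xlsx` layout and the dictionary `_read_excel_metadata` would produce for it (column names and values are illustrative, and the default `ExcelSecurityConfig` limits are assumed not to be exceeded):

```python
# meta.xlsx — first row is the header row, first column holds the file stem:
#
#   file_name | patient_id | modality
#   scan_001  | P-1042     | CT
#   scan_002  | P-1043     | MR
#
# Roughly the value returned by _read_excel_metadata(), keyed by the first-column text:
excel_metadata = {
    'scan_001': {'patient_id': 'P-1042', 'modality': 'CT'},
    'scan_002': {'patient_id': 'P-1043', 'modality': 'MR'},
}
```

`start()`, the orchestration entry point, follows.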
```python
    def start(self) -> Dict[str, Any]:
        result: Dict[str, Any] = {}

        storage_id = self.params.get('storage')
        if storage_id is None:
            raise ActionError('Storage parameter is required')
        storage = self.client.get_storage(storage_id)

        path = self.params.get('path')
        if path is None:
            raise ActionError('Path parameter is required')
        pathlib_cwd = get_pathlib(storage, path)

        excel_metadata: Dict[str, Dict[str, Any]] = {}
        try:
            excel_metadata = self._read_excel_metadata(pathlib_cwd)
            if excel_metadata:
                self.run.log_message_with_code(LogCode.EXCEL_METADATA_LOADED, len(excel_metadata))
        except ExcelSecurityError as e:
            self.run.log_message_with_code(LogCode.EXCEL_SECURITY_VIOLATION, str(e))
            return result
        except ExcelParsingError as e:
            if self.params.get('excel_metadata_path'):
                self.run.log_message_with_code(LogCode.EXCEL_PARSING_ERROR, str(e))
                return result
            else:
                self.run.log_message_with_code(LogCode.EXCEL_PARSING_ERROR, str(e))
                excel_metadata = {}

        file_specification_template = self._analyze_collection()
        organized_files = self._organize_files(pathlib_cwd, file_specification_template, excel_metadata)

        uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files)

        organized_files = uploader.handle_upload_files()

        if not self._validate_organized_files(organized_files, file_specification_template):
            self.run.log_message_with_code(LogCode.VALIDATION_FAILED)
            raise ActionError('Upload is aborted due to validation errors.')

        if not organized_files:
            self.run.log_message_with_code(LogCode.NO_FILES_FOUND)
            raise ActionError('Upload is aborted due to missing files.')

        if self.params.get('use_async_upload', True):
            uploaded_files = self.run_async(self._upload_files_async(organized_files, 10))
        else:
            uploaded_files = self._upload_files(organized_files)
        result['uploaded_files_count'] = len(uploaded_files)

        if not uploaded_files:
            self.run.log_message_with_code(LogCode.NO_FILES_UPLOADED)
            raise ActionError('Upload is aborted due to no uploaded files.')
        generated_data_units = self._generate_data_units(
            uploaded_files, self.params.get('creating_data_unit_batch_size', 1)
        )
        result['generated_data_units_count'] = len(generated_data_units)

        if not generated_data_units:
            self.run.log_message_with_code(LogCode.NO_DATA_UNITS_GENERATED)
            raise ActionError('Upload is aborted due to no generated data units.')

        self._cleanup_temp_directory()

        self.run.log_message_with_code(LogCode.IMPORT_COMPLETED)
        return result

    def _analyze_collection(self) -> Dict[str, Any]:
        self.run.set_progress(0, 2, category='analyze_collection')

        collection_id = self.params.get('data_collection')
        if collection_id is None:
            raise ActionError('Data collection parameter is required')
        self.run.set_progress(1, 2, category='analyze_collection')

        collection = self.run.client.get_data_collection(collection_id)
        self.run.set_progress(2, 2, category='analyze_collection')

        return collection['file_specifications']

    def _upload_files(self, organized_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        organized_files_count = len(organized_files)
        self.run.set_progress(0, organized_files_count, category='upload_data_files')
        self.run.log_message_with_code(LogCode.UPLOADING_DATA_FILES)

        client = self.run.client
        collection_id = self.params.get('data_collection')
        if collection_id is None:
            raise ActionError('Data collection parameter is required')
        upload_result = []
        current_progress = 0
        success_count = 0
        failed_count = 0

        self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')

        for organized_file in organized_files:
            try:
                use_chunked_upload = self._requires_chunked_upload(organized_file)
                uploaded_data_file = client.upload_data_file(organized_file, collection_id, use_chunked_upload)
                self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
                success_count += 1
                upload_result.append(uploaded_data_file)
            except Exception as e:
                self.run.log_data_file(organized_file, UploadStatus.FAILED)
                self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, str(e))
                failed_count += 1

            current_progress += 1
            self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
            self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')

        self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')

        return upload_result

    def run_async(self, coro: Awaitable[T]) -> T:
        import concurrent.futures

        def _run_in_thread():
            return asyncio.run(coro)

        try:
            asyncio.get_running_loop()
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                future = executor.submit(_run_in_thread)
                return future.result()
        except RuntimeError:
            return asyncio.run(coro)
```
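`run_async` exists because `asyncio.run()` raises `RuntimeError` when called from a thread that already has a running event loop; in that case the coroutine is handed to a one-off worker thread that owns its own loop, otherwise it is run directly. A self-contained sketch of the same pattern (`sleep_and_return` is made up for illustration):

```python
import asyncio
import concurrent.futures


async def sleep_and_return(value):
    # Stand-in for a coroutine such as _upload_files_async(...)
    await asyncio.sleep(0.1)
    return value


def run_async(coro):
    try:
        asyncio.get_running_loop()   # raises RuntimeError when no loop is running here
    except RuntimeError:
        return asyncio.run(coro)     # plain synchronous caller: run the coroutine directly
    # A loop is already running in this thread, so hand the coroutine to a worker
    # thread with its own event loop; asyncio.run() would raise if called here.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(asyncio.run, coro).result()


print(run_async(sleep_and_return(42)))  # 42
```

The async upload path that `start()` selects by default comes next.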
```python
    async def _upload_files_async(
        self, organized_files: List[Dict[str, Any]], max_concurrent: int = 10
    ) -> List[Dict[str, Any]]:
        organized_files_count = len(organized_files)
        self.run.set_progress(0, organized_files_count, category='upload_data_files')
        self.run.log_message_with_code(LogCode.UPLOADING_DATA_FILES)

        client = self.run.client
        collection_id = self.params.get('data_collection')
        if collection_id is None:
            raise ActionError('Data collection parameter is required')
        upload_result = []
        success_count = 0
        failed_count = 0

        self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')

        semaphore = asyncio.Semaphore(max_concurrent)

        async def upload_single_file(organized_file):
            async with semaphore:
                loop = asyncio.get_event_loop()
                try:
                    use_chunked_upload = self._requires_chunked_upload(organized_file)
                    uploaded_data_file = await loop.run_in_executor(
                        None, lambda: client.upload_data_file(organized_file, collection_id, use_chunked_upload)
                    )
                    self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
                    return {'status': 'success', 'result': uploaded_data_file}
                except ClientError as e:
                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
                    self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Client error: {str(e)}')
                    return {'status': 'failed', 'error': str(e), 'error_type': 'client_error', 'retryable': True}
                except (OSError, IOError) as e:
                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
                    self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'File system error: {str(e)}')
                    return {'status': 'failed', 'error': str(e), 'error_type': 'file_error', 'retryable': False}
                except MemoryError as e:
                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
                    self.run.log_message_with_code(
                        LogCode.FILE_UPLOAD_FAILED, f'Memory error (file too large): {str(e)}'
                    )
                    return {'status': 'failed', 'error': str(e), 'error_type': 'memory_error', 'retryable': False}
                except asyncio.TimeoutError as e:
                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
                    self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Upload timeout: {str(e)}')
                    return {'status': 'failed', 'error': str(e), 'error_type': 'timeout_error', 'retryable': True}
                except ValueError as e:
                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
                    self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Data validation error: {str(e)}')
                    return {'status': 'failed', 'error': str(e), 'error_type': 'validation_error', 'retryable': False}
                except Exception as e:
                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
                    self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Unexpected error: {str(e)}')
                    return {'status': 'failed', 'error': str(e), 'error_type': 'unknown_error', 'retryable': False}

        tasks = [upload_single_file(organized_file) for organized_file in organized_files]

        current_progress = 0
        for completed_task in asyncio.as_completed(tasks):
            result = await completed_task
            current_progress += 1

            if result['status'] == 'success':
                success_count += 1
                upload_result.append(result['result'])
            else:
                failed_count += 1

            self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
            self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')

        self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')

        return upload_result

    def _generate_data_units(self, uploaded_files: List[Dict[str, Any]], batch_size: int) -> List[Dict[str, Any]]:
        upload_result_count = len(uploaded_files)
        self.run.set_progress(0, upload_result_count, category='generate_data_units')
        self.run.log_message_with_code(LogCode.GENERATING_DATA_UNITS)

        client = self.run.client
        generated_data_units = []
        current_progress = 0
        success_count = 0
        failed_count = 0

        batches = get_batched_list(uploaded_files, batch_size)
        batches_count = len(batches)

        self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')

        for batch in batches:
            try:
                created_data_units = client.create_data_units(batch)
                success_count += len(created_data_units)
                generated_data_units.append(created_data_units)
                for created_data_unit in created_data_units:
                    self.run.log_data_unit(
                        created_data_unit['id'], UploadStatus.SUCCESS, data_unit_meta=created_data_unit.get('meta')
                    )
            except Exception as e:
                failed_count += len(batch)
                self.run.log_message_with_code(LogCode.DATA_UNIT_BATCH_FAILED, str(e))
                for _ in batch:
                    self.run.log_data_unit(None, UploadStatus.FAILED, data_unit_meta=None)

            current_progress += 1
            self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')
            self.run.set_progress(current_progress, batches_count, category='generate_data_units')

        self.run.set_progress(upload_result_count, upload_result_count, category='generate_data_units')

        return sum(generated_data_units, [])
```
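`_generate_data_units` leans on `get_batched_list` from `synapse_sdk.clients.utils` to split the uploaded-file payloads into `batch_size` chunks before each `create_data_units` call, then flattens the per-batch responses with `sum(generated_data_units, [])`. A rough sketch of the batching behaviour this assumes; the real helper lives elsewhere in the SDK and may differ in detail:

```python
from typing import Any, List


def batched_sketch(items: List[Any], batch_size: int) -> List[List[Any]]:
    """Assumed behaviour of get_batched_list: consecutive chunks of at most batch_size items."""
    batch_size = max(batch_size, 1)
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]


uploaded = ['file-1', 'file-2', 'file-3', 'file-4', 'file-5']
batches = batched_sketch(uploaded, 2)    # [['file-1', 'file-2'], ['file-3', 'file-4'], ['file-5']]

# Per-batch responses are appended as lists and flattened exactly as the action does:
per_batch_results = [[{'id': 1}, {'id': 2}], [{'id': 3}]]
flattened = sum(per_batch_results, [])   # [{'id': 1}, {'id': 2}, {'id': 3}]
```

File organization and validation follow.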
```python
    def _validate_organized_files(
        self, organized_files: List[Dict[str, Any]], file_specification_template: Dict[str, Any]
    ) -> bool:
        validator = FileSpecificationValidator(file_specification_template, organized_files)
        return validator.validate()

    def _organize_files(
        self,
        directory: Path,
        file_specification: List[Dict[str, Any]],
        excel_metadata: Optional[Dict[str, Dict[str, Any]]] = None,
    ) -> List[Dict[str, Any]]:
        organized_files: List[Dict[str, Any]] = []

        type_dirs: Dict[str, Path] = {}

        for spec in file_specification:
            spec_name = spec['name']
            spec_dir = directory / spec_name
            if spec_dir.exists() and spec_dir.is_dir():
                type_dirs[spec_name] = spec_dir

        if type_dirs:
            self.run.log_message_with_code(LogCode.TYPE_DIRECTORIES_FOUND, list(type_dirs.keys()))

        if not type_dirs:
            self.run.log_message_with_code(LogCode.NO_TYPE_DIRECTORIES)
            return organized_files

        self.run.log_message_with_code(LogCode.TYPE_STRUCTURE_DETECTED)
        self.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)

        dataset_files = {}
        required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]

        is_recursive = self.params.get('is_recursive', True)

        for spec_name, dir_path in type_dirs.items():
            if is_recursive:
                files_list = self._discover_files_recursive(dir_path)
            else:
                files_list = self._discover_files_non_recursive(dir_path)

            for file_path in files_list:
                file_name = file_path.stem

                if file_name not in dataset_files:
                    dataset_files[file_name] = {}

                if spec_name not in dataset_files[file_name]:
                    dataset_files[file_name][spec_name] = file_path
                else:
                    existing_file = dataset_files[file_name][spec_name]
                    if file_path.stat().st_mtime > existing_file.stat().st_mtime:
                        dataset_files[file_name][spec_name] = file_path

        if not dataset_files:
            self.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
            return organized_files

        self.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(dataset_files))

        for file_name, files_dict in sorted(dataset_files.items()):
            if all(req in files_dict for req in required_specs):
                file_extensions = {}
                for file_path in files_dict.values():
                    ext = file_path.suffix.lower()
                    if ext:
                        file_extensions[ext] = file_extensions.get(ext, 0) + 1

                origin_file_extension = max(file_extensions.items(), key=lambda x: x[1])[0] if file_extensions else ''

                meta_data: Dict[str, Any] = {
                    'origin_file_stem': file_name,
                    'origin_file_extension': origin_file_extension,
                    'created_at': datetime.now().isoformat(),
                }

                if excel_metadata and file_name in excel_metadata:
                    meta_data.update(excel_metadata[file_name])

                organized_files.append({'files': files_dict, 'meta': meta_data})
            else:
                missing = [req for req in required_specs if req not in files_dict]
                self.run.log_message_with_code(LogCode.MISSING_REQUIRED_FILES, file_name, ', '.join(missing))

        return organized_files
```
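`_organize_files` only looks inside sub-directories named after the collection's file specifications, pairs files across those directories by stem (keeping the newest file when a stem repeats within a spec), and drops any group that is missing a required spec. A hedged sketch of the layout it expects and the entries it builds, assuming two specs named `image` (required) and `label`:

```python
from pathlib import Path

# Assumed layout under `path`, one sub-directory per file specification:
#
#   /data/files/
#       meta.xlsx             # optional Excel metadata, matched by file stem
#       image/scan_001.jpg    # 'image' spec (required in this example)
#       image/scan_002.jpg
#       label/scan_001.json   # 'label' spec
#
# Roughly the entry built for scan_001 (values illustrative; Excel metadata
# columns, when present for the stem, are merged into 'meta' as well):
organized_files = [
    {
        'files': {
            'image': Path('/data/files/image/scan_001.jpg'),
            'label': Path('/data/files/label/scan_001.json'),
        },
        'meta': {
            'origin_file_stem': 'scan_001',
            'origin_file_extension': '.jpg',
            'created_at': '2025-01-01T00:00:00',  # datetime.now().isoformat()
        },
    },
    # scan_002 is also included, since its only required spec ('image') is present.
]
```

The remaining small helpers close out the module.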
```python
    def _get_file_size_mb(self, file_path: Path) -> float:
        return file_path.stat().st_size / (1024 * 1024)

    def _requires_chunked_upload(self, organized_file: Dict[str, Any]) -> bool:
        max_file_size_mb = self.params.get('max_file_size_mb', 50)
        for file_path in organized_file.get('files', {}).values():
            if isinstance(file_path, Path) and self._get_file_size_mb(file_path) > max_file_size_mb:
                return True
        return False

    def _cleanup_temp_directory(self, temp_path: Optional[Path] = None) -> None:
        if temp_path is None:
            try:
                temp_path = Path(os.getcwd()) / 'temp'
            except (FileNotFoundError, OSError):
                return

        if not temp_path.exists():
            return

        shutil.rmtree(temp_path, ignore_errors=True)
        self.run.log_message(f'Cleaned up temporary directory: {temp_path}')

    def _update_metrics(self, total_count: int, success_count: int, failed_count: int, category: str):
        if not self.run:
            raise ValueError('Run instance not properly initialized')

        assert isinstance(self.run, UploadRun)

        metrics = self.run.MetricsRecord(
            stand_by=total_count - success_count - failed_count, failed=failed_count, success=success_count
        )
        self.run.log_metrics(metrics, category)
```