synapse-sdk 1.0.0b18__py3-none-any.whl → 1.0.0b20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of synapse-sdk might be problematic.
- synapse_sdk/devtools/docs/docs/contributing.md +1 -1
- synapse_sdk/devtools/docs/docs/features/index.md +4 -4
- synapse_sdk/devtools/docs/docs/plugins/export-plugins.md +786 -0
- synapse_sdk/devtools/docs/docs/{features/plugins/index.md → plugins/plugins.md} +357 -21
- synapse_sdk/devtools/docs/docusaurus.config.ts +8 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/export-plugins.md +788 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/plugins.md +71 -0
- synapse_sdk/devtools/docs/package-lock.json +1366 -37
- synapse_sdk/devtools/docs/package.json +2 -1
- synapse_sdk/devtools/docs/sidebars.ts +8 -1
- synapse_sdk/plugins/categories/base.py +28 -2
- synapse_sdk/plugins/categories/export/actions/export.py +2 -1
- synapse_sdk/plugins/categories/export/templates/config.yaml +1 -1
- synapse_sdk/plugins/categories/export/templates/plugin/__init__.py +375 -0
- synapse_sdk/plugins/categories/export/templates/plugin/export.py +56 -190
- synapse_sdk/plugins/categories/upload/actions/upload.py +181 -22
- synapse_sdk/plugins/categories/upload/templates/config.yaml +24 -2
- synapse_sdk/plugins/categories/upload/templates/plugin/upload.py +8 -2
- synapse_sdk/plugins/models.py +28 -2
- synapse_sdk/plugins/templates/plugin-config-schema.json +7 -0
- synapse_sdk/plugins/templates/schema.json +7 -0
- {synapse_sdk-1.0.0b18.dist-info → synapse_sdk-1.0.0b20.dist-info}/METADATA +1 -1
- {synapse_sdk-1.0.0b18.dist-info → synapse_sdk-1.0.0b20.dist-info}/RECORD +27 -25
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/features/plugins/index.md +0 -30
- {synapse_sdk-1.0.0b18.dist-info → synapse_sdk-1.0.0b20.dist-info}/WHEEL +0 -0
- {synapse_sdk-1.0.0b18.dist-info → synapse_sdk-1.0.0b20.dist-info}/entry_points.txt +0 -0
- {synapse_sdk-1.0.0b18.dist-info → synapse_sdk-1.0.0b20.dist-info}/licenses/LICENSE +0 -0
- {synapse_sdk-1.0.0b18.dist-info → synapse_sdk-1.0.0b20.dist-info}/top_level.txt +0 -0
**synapse_sdk/plugins/categories/export/templates/plugin/export.py**

```diff
@@ -1,197 +1,63 @@
-import json
-from itertools import tee
 from pathlib import Path
+from typing import Generator
 
-import requests
+from . import BaseExporter
 
-from synapse_sdk.plugins.categories.export.enums import ExportStatus
 
+class Exporter(BaseExporter):
+    """Plugin export action interface for organizing files.
 
-
-
-
-    Args:
-        run : Execution object
-        export_items (generator):
-            - data (dict): dm_schema_data information.
-            - files (dict): File information. Includes file URL, original file path, metadata, etc.
-            - id (int): ground_truth ID
-        path_root : pathlib object, the path to export
-        **params: Additional parameters
-
-    Returns:
-        dict: Result
-    """
-
-    export_path = path_root / params['name']
-    unique_export_path = export_path
-    counter = 1
-    while unique_export_path.exists():
-        unique_export_path = export_path.with_name(f'{export_path.name}({counter})')
-        counter += 1
-    unique_export_path.mkdir(parents=True)
-
-    run.log_message('Starting export process.')
-
-    # results contains all information fetched through the list API.
-    # example:
-    # params.get('results', [])
-
-    save_original_file_flag = params.get('save_original_file')
-    errors_json_file_list = []
-    errors_original_file_list = []
-
-    # Path to save JSON files
-    json_output_path = unique_export_path / 'json'
-    json_output_path.mkdir(parents=True, exist_ok=True)
-
-    # Path to save original files
-    if save_original_file_flag:
-        origin_files_output_path = unique_export_path / 'origin_files'
-        origin_files_output_path.mkdir(parents=True, exist_ok=True)
-
-    export_items_count, export_items_process = tee(export_items)
-    total = sum(1 for _ in export_items_count)
-
-    original_file_metrics_record = run.MetricsRecord(stand_by=total, success=0, failed=0)
-    data_file_metrics_record = run.MetricsRecord(stand_by=total, success=0, failed=0)
-    # progress init
-    run.set_progress(0, total, category='dataset_conversion')
-    for no, export_item in enumerate(export_items_process, start=1):
-        run.set_progress(no, total, category='dataset_conversion')
-        if no == 1:
-            run.log_message('Converting dataset.')
-        preprocessed_data = before_convert(export_item)
-        converted_data = convert_data(preprocessed_data)
-        final_data = after_convert(converted_data)
-
-        # Call if original file extraction is needed
-        if save_original_file_flag:
-            if no == 1:
-                run.log_message('Saving original file.')
-            original_status = save_original_file(run, final_data, origin_files_output_path, errors_original_file_list)
-
-            original_file_metrics_record.stand_by -= 1
-            if original_status == ExportStatus.FAILED:
-                original_file_metrics_record.failed += 1
-                continue
-            else:
-                original_file_metrics_record.success += 1
-
-            run.log_metrics(record=original_file_metrics_record, category='original_file')
-
-        # Extract data as JSON files
-        if no == 1:
-            run.log_message('Saving json file.')
-        data_status = save_as_json(run, final_data, json_output_path, errors_json_file_list)
-
-        data_file_metrics_record.stand_by -= 1
-        if data_status == ExportStatus.FAILED:
-            data_file_metrics_record.failed += 1
-            continue
-        else:
-            data_file_metrics_record.success += 1
-
-        run.log_metrics(record=data_file_metrics_record, category='data_file')
-
-    run.end_log()
-
-    # Save error list files
-    if len(errors_json_file_list) > 0 or len(errors_original_file_list) > 0:
-        export_error_file = {'json_file_name': errors_json_file_list, 'origin_file_name': errors_original_file_list}
-        with (unique_export_path / 'error_file_list.json').open('w', encoding='utf-8') as f:
-            json.dump(export_error_file, f, indent=4, ensure_ascii=False)
-
-    return {'export_path': str(path_root)}
-
-
-def convert_data(data):
-    """Converts the data."""
-    return data
-
-
-def before_convert(data):
-    """Preprocesses the data before conversion."""
-    return data
-
-
-def after_convert(data):
-    """Post-processes the data after conversion."""
-    return data
-
-
-def get_original_file_name(files):
-    """Retrieve the original file path from the given file information.
-
-    Args:
-        files (dict): A dictionary containing file information, including file URL,
-            original file path, metadata, etc.
-
-    Returns:
-        file_name (str): The original file name extracted from the file information.
+    This class provides a minimal interface for plugin developers to implement
+    their own export logic.
     """
-    return files['file_name_original']
-
-
-def save_original_file(run, result, base_path, error_file_list):
-    """Saves the original file.
-
-    Args:
-        run : Execution object
-        result (dict): API response data containing file information.
-        base_path (Path): The directory where the file will be saved.
-        error_file_list (list): A list to store error files.
-    """
-    file_url = result['files']['url']
-    file_name = get_original_file_name(result['files'])
-    response = requests.get(file_url)
-    file_info = {'file_name': file_name}
-    error_msg = ''
-    try:
-        with (base_path / file_name).open('wb') as file:
-            file.write(response.content)
-        status = ExportStatus.SUCCESS
-    except Exception as e:
-        error_msg = str(e)
-        error_file_list.append([file_name, error_msg])
-        status = ExportStatus.FAILED
-
-    run.export_log_original_file(result['id'], file_info, status, error_msg)
-    return status
-
-
-def save_as_json(run, result, base_path, error_file_list):
-    """Saves the data as a JSON file.
-
-    Args:
-        run : Execution object
-        result (dict): API response data containing file information.
-        base_path (Path): The directory where the file will be saved.
-        error_file_list (list): A list to store error files.
-    """
-    # Default save file name: original file name
-    file_name = Path(get_original_file_name(result['files'])).stem
-    json_data = result['data']
-    file_info = {'file_name': f'{file_name}.json'}
-
-    if json_data is None:
-        error_msg = 'data is Null'
-        error_file_list.append([f'{file_name}.json', error_msg])
-        status = ExportStatus.FAILED
-        run.log_export_event('NULL_DATA_DETECTED', result['id'])
-        run.export_log_json_file(result['id'], file_info, status, error_msg)
-
-        return status
-
-    error_msg = ''
-    try:
-        with (base_path / f'{file_name}.json').open('w', encoding='utf-8') as f:
-            json.dump(json_data, f, indent=4, ensure_ascii=False)
-        status = ExportStatus.SUCCESS
-    except Exception as e:
-        error_msg = str(e)
-        error_file_list.append([f'{file_name}.json', str(e)])
-        status = ExportStatus.FAILED
 
-
-
+    def __init__(self, run, export_items: Generator, path_root: Path, **params):
+        """Initialize the plugin export action class.
+        Args:
+            run: Plugin run object with logging capabilities.
+            export_items (generator):
+                - data (dict): dm_schema_data information.
+                - files (dict): File information. Includes file URL, original file path, metadata, etc.
+                - id (int): target ID (ex. assignment id, task id, ground_truth_event id)
+            path_root: pathlib object, the path to export
+            **params: Additional parameters
+                - name (str): The name of the action.
+                - description (str | None): The description of the action.
+                - storage (int): The storage ID to save the exported data.
+                - save_original_file (bool): Whether to save the original file.
+                - path (str): The path to save the exported data.
+                - target (str): The target source to export data from. (ex. ground_truth, assignment, task)
+                - filter (dict): The filter criteria to apply.
+                - extra_params (dict | None): Additional parameters for export customization.
+                    Example: {"include_metadata": True, "compression": "gzip"}
+                - count (int): Total number of results.
+                - results (list): List of results fetched through the list API.
+                - project_id (int): Project ID.
+                - configuration (dict): Project configuration.
+        """
+        super().__init__(run, export_items, path_root, **params)
+
+    def export(self, export_items=None, results=None, **kwargs) -> dict:
+        """Executes the export task using the base class implementation.
+
+        Args:
+            export_items: Optional export items to process. If not provided, uses self.export_items.
+            results: Optional results data to process alongside export_items.
+            **kwargs: Additional parameters for export customization.
+
+        Returns:
+            dict: Result
+        """
+        return super().export(export_items, results, **kwargs)
+
+    def convert_data(self, data):
+        """Converts the data."""
+        return data
+
+    def before_convert(self, data):
+        """Preprocesses the data before conversion."""
+        return data
+
+    def after_convert(self, data):
+        """Post-processes the data after conversion."""
+        return data
```
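The b18 template implemented export as module-level functions (directory setup, metric records, JSON and original-file writing); in b20 that machinery moves into `BaseExporter` (the new `plugin/__init__.py`, +375 lines, not shown in this diff) and the generated template shrinks to a thin `Exporter` subclass exposing three conversion hooks. A minimal runnable sketch of the hook flow, using a stub in place of the real `BaseExporter`:

```python
from pathlib import Path
from typing import Generator


class BaseExporter:
    """Stub standing in for synapse_sdk's real base class, reduced to the
    hook ordering only; the real one also handles progress, metrics and
    error-file reporting."""

    def __init__(self, run, export_items: Generator, path_root: Path, **params):
        self.run = run
        self.export_items = export_items
        self.path_root = path_root
        self.params = params

    def export(self, export_items=None, results=None, **kwargs) -> dict:
        # Each item flows through before_convert -> convert_data -> after_convert.
        for item in export_items or self.export_items:
            self.after_convert(self.convert_data(self.before_convert(item)))
        return {'export_path': str(self.path_root)}

    def before_convert(self, data):
        return data

    def convert_data(self, data):
        return data

    def after_convert(self, data):
        return data


class Exporter(BaseExporter):
    def convert_data(self, data):
        # A real plugin reshapes dm_schema_data here (e.g. to COCO or YOLO).
        return data


if __name__ == '__main__':
    items = ({'id': i, 'data': {}, 'files': {}} for i in range(3))
    print(Exporter(None, items, Path('/tmp/export')).export())
```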
**synapse_sdk/plugins/categories/upload/actions/upload.py**

```diff
@@ -1,3 +1,4 @@
+import asyncio
 import json
 import os
 import shutil
@@ -5,7 +6,7 @@ from datetime import datetime
 from enum import Enum
 from io import BytesIO
 from pathlib import Path
-from typing import Annotated, Any, Dict, List, Optional
+from typing import Annotated, Any, Awaitable, Dict, List, Optional, TypeVar
 
 from openpyxl import load_workbook
 from openpyxl.utils.exceptions import InvalidFileException
@@ -25,6 +26,9 @@ from synapse_sdk.shared.enums import Context
 from synapse_sdk.utils.pydantic.validators import non_blank
 from synapse_sdk.utils.storage import get_pathlib
 
+# Type variable for generic async return type
+T = TypeVar('T')
+
 
 class PathAwareJSONEncoder(json.JSONEncoder):
     """Custom JSON encoder that handles Path-like objects."""
@@ -178,7 +182,7 @@ class UploadRun(Run):
         },
         # Debug information - only for EventLog
         'EXCEL_FILE_VALIDATION_STARTED': {
-            'message': 'Excel file validation started
+            'message': 'Excel file validation started',
             'level': Context.INFO,
         },
         'EXCEL_WORKBOOK_LOADED': {
@@ -186,7 +190,7 @@ class UploadRun(Run):
             'level': Context.INFO,
         },
         'FILE_ORGANIZATION_STARTED': {
-            'message': 'File organization started
+            'message': 'File organization started',
             'level': Context.INFO,
         },
         'BATCH_PROCESSING_STARTED': {
@@ -202,7 +206,7 @@ class UploadRun(Run):
             'level': Context.INFO,
         },
         'EXCEL_FILE_NOT_FOUND_PATH': {
-            'message': 'Excel metadata file not found
+            'message': 'Excel metadata file not found',
             'level': Context.WARNING,
         },
         'EXCEL_SECURITY_VALIDATION_FAILED': {
@@ -394,6 +398,9 @@ class ExcelMetadataUtils:
 class UploadParams(BaseModel):
     """Upload action parameters.
 
+    This class defines all configuration parameters for the upload action, including
+    advanced asynchronous upload capabilities for improved performance.
+
     Args:
         name (str): The name of the action.
         description (str | None): The description of the action.
@@ -405,6 +412,17 @@ class UploadParams(BaseModel):
         excel_metadata_path (str | None): Path to excel file containing metadata.
             Defaults to 'meta.xlsx' or 'meta.xls' in the path directory.
         is_recursive (bool): Enable recursive file discovery in subdirectories. Defaults to False.
+        use_async_upload (bool): Enable asynchronous upload data file processing for improved performance.
+            Defaults to True.
+        max_file_size_mb (int): The maximum file size not using chunked_upload in MB. Defaults to 50MB.
+        creating_data_unit_batch_size (int): The batch size for creating data units. Defaults to 100.
+        extra_params (dict | None): Extra parameters for the action.
+            Example: {"include_metadata": True, "compression": "gzip"}
+
+    Note:
+        Async upload requires plugin developers to implement handle_upload_files_async()
+        method for maximum benefit. Default implementation provides compatibility
+        but limited performance improvement.
     """
 
     name: Annotated[str, AfterValidator(non_blank)]
@@ -416,7 +434,9 @@ class UploadParams(BaseModel):
     excel_metadata_path: str | None = None
     is_recursive: bool = False
     max_file_size_mb: int = 50
-    creating_data_unit_batch_size: int =
+    creating_data_unit_batch_size: int = 1
+    use_async_upload: bool = True
+    extra_params: dict | None = None
 
     @field_validator('storage', mode='before')
     @classmethod
@@ -540,8 +560,6 @@ class UploadAction(Action):
         analyze_collection: The progress category for the analyze collection process.
         data_file_upload: The progress category for the upload process.
         generate_data_units: The progress category for the generate data units process.
-        generate_tasks: The progress category for the generate tasks process.
-        generate_ground_truths: The progress category for the generate ground truths process.
 
     Metrics Categories:
         data_file: The metrics category for the data file.
@@ -581,9 +599,11 @@ class UploadAction(Action):
         self.excel_config = ExcelSecurityConfig()
         self.excel_utils = ExcelMetadataUtils(self.excel_config)
 
-    def get_uploader(self, path, file_specification, organized_files):
+    def get_uploader(self, path, file_specification, organized_files, params: Dict = None):
         """Get uploader from entrypoint."""
-        return self.entrypoint(
+        return self.entrypoint(
+            self.run, path, file_specification, organized_files, extra_params=params.get('extra_params')
+        )
 
     def _discover_files_recursive(self, dir_path: Path) -> List[Path]:
         """Discover files recursively in a directory."""
@@ -797,10 +817,11 @@ class UploadAction(Action):
         excel_path = None
 
         # Check if user provided a specific excel_metadata_path
-
-
+        excel_metadata_path = self.params.get('excel_metadata_path')
+        if excel_metadata_path:
+            excel_path = pathlib_cwd / excel_metadata_path
             if not excel_path.exists():
-                self.run.log_message_with_code('EXCEL_FILE_NOT_FOUND_PATH'
+                self.run.log_message_with_code('EXCEL_FILE_NOT_FOUND_PATH')
                 return {}
         else:
             # Look for default meta.xlsx or meta.xls
@@ -810,7 +831,7 @@ class UploadAction(Action):
                 return {}
 
         try:
-            self.run.log_message_with_code('EXCEL_FILE_VALIDATION_STARTED'
+            self.run.log_message_with_code('EXCEL_FILE_VALIDATION_STARTED')
 
             # Prepare Excel file with security validation
             excel_stream = self._prepare_excel_file(excel_path)
@@ -852,8 +873,15 @@ class UploadAction(Action):
         result: Dict[str, Any] = {}
 
         # Setup path object with path and storage.
-
-
+        storage_id = self.params.get('storage')
+        if storage_id is None:
+            raise ActionError('Storage parameter is required')
+        storage = self.client.get_storage(storage_id)
+
+        path = self.params.get('path')
+        if path is None:
+            raise ActionError('Path parameter is required')
+        pathlib_cwd = get_pathlib(storage, path)
 
         # Read excel metadata if configured or default file exists
         excel_metadata: Dict[str, Dict[str, Any]] = {}
@@ -881,7 +909,7 @@ class UploadAction(Action):
         organized_files = self._organize_files(pathlib_cwd, file_specification_template, excel_metadata)
 
         # Initialize uploader.
-        uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files)
+        uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files, self.params)
 
         # Get organized files from the uploader (plugin developer's custom implementation)
         # or use the default organization method if uploader doesn't provide valid files
@@ -896,7 +924,11 @@ class UploadAction(Action):
         if not organized_files:
             self.run.log_message_with_code('NO_FILES_FOUND')
             raise ActionError('Upload is aborted due to missing files.')
-
+        # Choose upload method based on async parameter
+        if self.params.get('use_async_upload', True):
+            uploaded_files = self.run_async(self._upload_files_async(organized_files, 10))
+        else:
+            uploaded_files = self._upload_files(organized_files)
         result['uploaded_files_count'] = len(uploaded_files)
 
         # Generate data units for the uploaded data.
@@ -904,7 +936,7 @@ class UploadAction(Action):
             self.run.log_message_with_code('NO_FILES_UPLOADED')
             raise ActionError('Upload is aborted due to no uploaded files.')
         generated_data_units = self._generate_data_units(
-            uploaded_files, self.params.get('creating_data_unit_batch_size')
+            uploaded_files, self.params.get('creating_data_unit_batch_size', 1)
        )
         result['generated_data_units_count'] = len(generated_data_units)
 
@@ -929,7 +961,9 @@ class UploadAction(Action):
         # Initialize progress
         self.run.set_progress(0, 2, category='analyze_collection')
 
-        collection_id = self.params
+        collection_id = self.params.get('data_collection')
+        if collection_id is None:
+            raise ActionError('Data collection parameter is required')
         self.run.set_progress(1, 2, category='analyze_collection')
 
         collection = self.run.client.get_data_collection(collection_id)
@@ -949,7 +983,9 @@ class UploadAction(Action):
         self.run.log_message_with_code('UPLOADING_DATA_FILES')
 
         client = self.run.client
-        collection_id = self.params
+        collection_id = self.params.get('data_collection')
+        if collection_id is None:
+            raise ActionError('Data collection parameter is required')
         upload_result = []
         current_progress = 0
         success_count = 0
@@ -980,6 +1016,129 @@ class UploadAction(Action):
 
         return upload_result
 
+    def run_async(self, coro: Awaitable[T]) -> T:
+        """Run async coroutine safely using asyncio.run().
+
+        This method properly manages event loop lifecycle and prevents
+        resource exhaustion from repeated event loop creation.
+
+        Args:
+            coro: The coroutine to execute
+
+        Returns:
+            The result of the coroutine execution
+
+        Raises:
+            RuntimeError: If called from within an existing event loop
+        """
+        import concurrent.futures
+
+        def _run_in_thread():
+            """Run the coroutine in a separate thread to avoid event loop conflicts."""
+            return asyncio.run(coro)
+
+        # Check if we're already in an event loop
+        try:
+            # If this doesn't raise, we're in an event loop
+            asyncio.get_running_loop()
+            # Run in thread pool to avoid "RuntimeError: cannot be called from a running event loop"
+            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                future = executor.submit(_run_in_thread)
+                return future.result()
+        except RuntimeError:
+            # No event loop running, safe to use asyncio.run directly
+            return asyncio.run(coro)
+
+    async def _upload_files_async(
+        self, organized_files: List[Dict[str, Any]], max_concurrent: int = 10
+    ) -> List[Dict[str, Any]]:
+        """Upload files to synapse-backend asynchronously with concurrency control."""
+        # Initialize progress
+        organized_files_count = len(organized_files)
+        self.run.set_progress(0, organized_files_count, category='upload_data_files')
+        self.run.log_message_with_code('UPLOADING_DATA_FILES')
+
+        client = self.run.client
+        collection_id = self.params.get('data_collection')
+        if collection_id is None:
+            raise ActionError('Data collection parameter is required')
+        upload_result = []
+        success_count = 0
+        failed_count = 0
+
+        # Initialize metrics
+        self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
+
+        # Control concurrency with semaphore
+        semaphore = asyncio.Semaphore(max_concurrent)
+
+        async def upload_single_file(organized_file):
+            async with semaphore:
+                loop = asyncio.get_event_loop()
+                try:
+                    # Determine if chunked upload should be used based on file size
+                    use_chunked_upload = self._requires_chunked_upload(organized_file)
+                    # Run sync upload_data_file in thread pool
+                    uploaded_data_file = await loop.run_in_executor(
+                        None, lambda: client.upload_data_file(organized_file, collection_id, use_chunked_upload)
+                    )
+                    self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
+                    return {'status': 'success', 'result': uploaded_data_file}
+                except ClientError as e:
+                    # Handle API client errors (network, authentication, server errors)
+                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                    self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Client error: {str(e)}')
+                    return {'status': 'failed', 'error': str(e), 'error_type': 'client_error', 'retryable': True}
+                except (OSError, IOError) as e:
+                    # Handle file system errors (file not found, permissions, disk full)
+                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                    self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'File system error: {str(e)}')
+                    return {'status': 'failed', 'error': str(e), 'error_type': 'file_error', 'retryable': False}
+                except MemoryError as e:
+                    # Handle out of memory errors (large files)
+                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                    self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Memory error (file too large): {str(e)}')
+                    return {'status': 'failed', 'error': str(e), 'error_type': 'memory_error', 'retryable': False}
+                except asyncio.TimeoutError as e:
+                    # Handle timeout errors (slow network, large files)
+                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                    self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Upload timeout: {str(e)}')
+                    return {'status': 'failed', 'error': str(e), 'error_type': 'timeout_error', 'retryable': True}
+                except ValueError as e:
+                    # Handle data validation errors (invalid file format, metadata issues)
+                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                    self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Data validation error: {str(e)}')
+                    return {'status': 'failed', 'error': str(e), 'error_type': 'validation_error', 'retryable': False}
+                except Exception as e:
+                    # Handle any remaining unexpected errors
+                    self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                    self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Unexpected error: {str(e)}')
+                    return {'status': 'failed', 'error': str(e), 'error_type': 'unknown_error', 'retryable': False}
+
+        # Create tasks for all files
+        tasks = [upload_single_file(organized_file) for organized_file in organized_files]
+
+        # Process files with progress updates
+        current_progress = 0
+        for completed_task in asyncio.as_completed(tasks):
+            result = await completed_task
+            current_progress += 1
+
+            if result['status'] == 'success':
+                success_count += 1
+                upload_result.append(result['result'])
+            else:
+                failed_count += 1
+
+            # Update metrics and progress
+            self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
+            self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')
+
+        # Finish progress
+        self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
+
+        return upload_result
+
     def _generate_data_units(self, uploaded_files: List[Dict[str, Any]], batch_size: int) -> List[Dict[str, Any]]:
         """Generate data units for the uploaded data.
 
@@ -1073,14 +1232,14 @@
             return organized_files
 
         self.run.log_message_with_code('TYPE_STRUCTURE_DETECTED')
-        self.run.log_message_with_code('FILE_ORGANIZATION_STARTED'
+        self.run.log_message_with_code('FILE_ORGANIZATION_STARTED')
 
         # Collect and process files in a single pass
         dataset_files = {}
         required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]
 
         # Get recursive setting from params
-        is_recursive = self.params.get('is_recursive',
+        is_recursive = self.params.get('is_recursive', True)
 
         # Process all files from all type directories
         for spec_name, dir_path in type_dirs.items():
```
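Two patterns carry the new async path: `run_async()` uses `asyncio.run()` when no loop is active and hops to a one-off worker thread when one already is (e.g. when the action is itself driven from async code), and `_upload_files_async()` bounds concurrency with a semaphore while pushing the blocking `upload_data_file` call onto the default executor. A standalone sketch of the combination, with `fake_upload` standing in for the real client call:

```python
import asyncio
import concurrent.futures
from typing import Any, Awaitable, Dict, List, TypeVar

T = TypeVar('T')


def run_async(coro: Awaitable[T]) -> T:
    """Run a coroutine whether or not an event loop is already running."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop in this thread: asyncio.run() owns the loop lifecycle.
        return asyncio.run(coro)
    # A loop is already running; asyncio.run() would raise here, so execute
    # the coroutine on a fresh loop in a dedicated worker thread instead.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(lambda: asyncio.run(coro)).result()


def fake_upload(item: Dict[str, Any]) -> Dict[str, Any]:
    """Blocking stand-in for client.upload_data_file in the diff above."""
    return {'name': item['name'], 'status': 'uploaded'}


async def upload_all(items: List[Dict[str, Any]], max_concurrent: int = 10) -> List[Dict[str, Any]]:
    semaphore = asyncio.Semaphore(max_concurrent)
    loop = asyncio.get_event_loop()

    async def upload_one(item):
        async with semaphore:  # at most max_concurrent uploads in flight
            return await loop.run_in_executor(None, lambda: fake_upload(item))

    results = []
    for done in asyncio.as_completed([upload_one(i) for i in items]):
        results.append(await done)  # per-file progress/metrics hook point
    return results


if __name__ == '__main__':
    files = [{'name': f'file_{i}.jpg'} for i in range(5)]
    print(run_async(upload_all(files)))
```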
**synapse_sdk/plugins/categories/upload/templates/config.yaml**

```diff
@@ -3,5 +3,27 @@ actions:
     entrypoint: plugin.upload.Uploader
     options:
       supported_data_type: image # A primary data type of synapse backend collection. (e.g. 'image', 'text', 'video', 'pcd', 'audio')
-    ui_schema:
-
+    ui_schema: # UI schema for the input of extra params
+      - $formkit: "radio"
+        name: "file_format"
+        label: "File Format"
+        help: "Select the file format for upload processing"
+        required: false
+        value: "original"
+        options:
+          - label: "Keep Original"
+            value: "original"
+          - label: "Convert to JPEG"
+            value: "jpeg"
+          - label: "Convert to PNG"
+            value: "png"
+      - $formkit: "checkbox"
+        name: "include_metadata"
+        label: "Include Metadata"
+        help: "Include file metadata during upload"
+        value: true
+      - $formkit: "text"
+        name: "custom_tag"
+        label: "Custom Tag"
+        help: "Add a custom tag to uploaded files"
+        placeholder: "Enter custom tag"
```
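The `ui_schema` entries are FormKit field definitions whose values are expected to land in the new `extra_params` dict (`UploadParams.extra_params`, forwarded by `get_uploader` above). Assuming the form values pass through unchanged, a payload for the schema above would look like:

```python
# Hypothetical extra_params payload matching the ui_schema above; the exact
# wire format is an assumption -- the diff only shows that UploadParams now
# carries `extra_params: dict | None` and that get_uploader forwards it.
extra_params = {
    'file_format': 'jpeg',       # radio: original | jpeg | png
    'include_metadata': True,    # checkbox
    'custom_tag': 'batch-2024',  # free-text input
}
```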
**synapse_sdk/plugins/categories/upload/templates/plugin/upload.py**

```diff
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import List
+from typing import Dict, List
 
 
 class Uploader:
@@ -9,7 +9,9 @@ class Uploader:
     their own file organization logic.
     """
 
-    def __init__(
+    def __init__(
+        self, run, path: Path, file_specification: List = None, organized_files: List = None, extra_params: Dict = None
+    ):
         """Initialize the plugin upload action class.
 
         Args:
@@ -17,11 +19,15 @@
             path: Path object pointing to the upload target directory.
             file_specification: List of specifications that define the structure of files to be uploaded.
                 Each specification contains details like file name, type, and requirements.
+            organized_files: List of pre-organized files based on the default logic.
+                Each item is a dictionary with 'files' and 'meta' keys.
+            extra_params: Additional parameters for customization.
         """
         self.run = run
         self.path = path
         self.file_specification = file_specification
         self.organized_files = organized_files
+        self.extra_params = extra_params
 
     def handle_upload_files(self) -> List:
         """Customize the organization of files for upload.
```
|