synapse-sdk 1.0.0b18__py3-none-any.whl → 1.0.0b20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of synapse-sdk might be problematic.

Files changed (28)
  1. synapse_sdk/devtools/docs/docs/contributing.md +1 -1
  2. synapse_sdk/devtools/docs/docs/features/index.md +4 -4
  3. synapse_sdk/devtools/docs/docs/plugins/export-plugins.md +786 -0
  4. synapse_sdk/devtools/docs/docs/{features/plugins/index.md → plugins/plugins.md} +357 -21
  5. synapse_sdk/devtools/docs/docusaurus.config.ts +8 -0
  6. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/export-plugins.md +788 -0
  7. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/plugins.md +71 -0
  8. synapse_sdk/devtools/docs/package-lock.json +1366 -37
  9. synapse_sdk/devtools/docs/package.json +2 -1
  10. synapse_sdk/devtools/docs/sidebars.ts +8 -1
  11. synapse_sdk/plugins/categories/base.py +28 -2
  12. synapse_sdk/plugins/categories/export/actions/export.py +2 -1
  13. synapse_sdk/plugins/categories/export/templates/config.yaml +1 -1
  14. synapse_sdk/plugins/categories/export/templates/plugin/__init__.py +375 -0
  15. synapse_sdk/plugins/categories/export/templates/plugin/export.py +56 -190
  16. synapse_sdk/plugins/categories/upload/actions/upload.py +181 -22
  17. synapse_sdk/plugins/categories/upload/templates/config.yaml +24 -2
  18. synapse_sdk/plugins/categories/upload/templates/plugin/upload.py +8 -2
  19. synapse_sdk/plugins/models.py +28 -2
  20. synapse_sdk/plugins/templates/plugin-config-schema.json +7 -0
  21. synapse_sdk/plugins/templates/schema.json +7 -0
  22. {synapse_sdk-1.0.0b18.dist-info → synapse_sdk-1.0.0b20.dist-info}/METADATA +1 -1
  23. {synapse_sdk-1.0.0b18.dist-info → synapse_sdk-1.0.0b20.dist-info}/RECORD +27 -25
  24. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/features/plugins/index.md +0 -30
  25. {synapse_sdk-1.0.0b18.dist-info → synapse_sdk-1.0.0b20.dist-info}/WHEEL +0 -0
  26. {synapse_sdk-1.0.0b18.dist-info → synapse_sdk-1.0.0b20.dist-info}/entry_points.txt +0 -0
  27. {synapse_sdk-1.0.0b18.dist-info → synapse_sdk-1.0.0b20.dist-info}/licenses/LICENSE +0 -0
  28. {synapse_sdk-1.0.0b18.dist-info → synapse_sdk-1.0.0b20.dist-info}/top_level.txt +0 -0
@@ -1,197 +1,63 @@
- import json
- from itertools import tee
  from pathlib import Path
+ from typing import Generator

- import requests
+ from . import BaseExporter

- from synapse_sdk.plugins.categories.export.enums import ExportStatus

+ class Exporter(BaseExporter):
+     """Plugin export action interface for organizing files.

- def export(run, export_items, path_root, **params):
-     """Executes the export task.
-
-     Args:
-         run : Execution object
-         export_items (generator):
-             - data (dict): dm_schema_data information.
-             - files (dict): File information. Includes file URL, original file path, metadata, etc.
-             - id (int): ground_truth ID
-         path_root : pathlib object, the path to export
-         **params: Additional parameters
-
-     Returns:
-         dict: Result
-     """
-
-     export_path = path_root / params['name']
-     unique_export_path = export_path
-     counter = 1
-     while unique_export_path.exists():
-         unique_export_path = export_path.with_name(f'{export_path.name}({counter})')
-         counter += 1
-     unique_export_path.mkdir(parents=True)
-
-     run.log_message('Starting export process.')
-
-     # results contains all information fetched through the list API.
-     # example:
-     # params.get('results', [])
-
-     save_original_file_flag = params.get('save_original_file')
-     errors_json_file_list = []
-     errors_original_file_list = []
-
-     # Path to save JSON files
-     json_output_path = unique_export_path / 'json'
-     json_output_path.mkdir(parents=True, exist_ok=True)
-
-     # Path to save original files
-     if save_original_file_flag:
-         origin_files_output_path = unique_export_path / 'origin_files'
-         origin_files_output_path.mkdir(parents=True, exist_ok=True)
-
-     export_items_count, export_items_process = tee(export_items)
-     total = sum(1 for _ in export_items_count)
-
-     original_file_metrics_record = run.MetricsRecord(stand_by=total, success=0, failed=0)
-     data_file_metrics_record = run.MetricsRecord(stand_by=total, success=0, failed=0)
-     # progress init
-     run.set_progress(0, total, category='dataset_conversion')
-     for no, export_item in enumerate(export_items_process, start=1):
-         run.set_progress(no, total, category='dataset_conversion')
-         if no == 1:
-             run.log_message('Converting dataset.')
-         preprocessed_data = before_convert(export_item)
-         converted_data = convert_data(preprocessed_data)
-         final_data = after_convert(converted_data)
-
-         # Call if original file extraction is needed
-         if save_original_file_flag:
-             if no == 1:
-                 run.log_message('Saving original file.')
-             original_status = save_original_file(run, final_data, origin_files_output_path, errors_original_file_list)
-
-             original_file_metrics_record.stand_by -= 1
-             if original_status == ExportStatus.FAILED:
-                 original_file_metrics_record.failed += 1
-                 continue
-             else:
-                 original_file_metrics_record.success += 1
-
-             run.log_metrics(record=original_file_metrics_record, category='original_file')
-
-         # Extract data as JSON files
-         if no == 1:
-             run.log_message('Saving json file.')
-         data_status = save_as_json(run, final_data, json_output_path, errors_json_file_list)
-
-         data_file_metrics_record.stand_by -= 1
-         if data_status == ExportStatus.FAILED:
-             data_file_metrics_record.failed += 1
-             continue
-         else:
-             data_file_metrics_record.success += 1
-
-         run.log_metrics(record=data_file_metrics_record, category='data_file')
-
-     run.end_log()
-
-     # Save error list files
-     if len(errors_json_file_list) > 0 or len(errors_original_file_list) > 0:
-         export_error_file = {'json_file_name': errors_json_file_list, 'origin_file_name': errors_original_file_list}
-         with (unique_export_path / 'error_file_list.json').open('w', encoding='utf-8') as f:
-             json.dump(export_error_file, f, indent=4, ensure_ascii=False)
-
-     return {'export_path': str(path_root)}
-
-
- def convert_data(data):
-     """Converts the data."""
-     return data
-
-
- def before_convert(data):
-     """Preprocesses the data before conversion."""
-     return data
-
-
- def after_convert(data):
-     """Post-processes the data after conversion."""
-     return data
-
-
- def get_original_file_name(files):
-     """Retrieve the original file path from the given file information.
-
-     Args:
-         files (dict): A dictionary containing file information, including file URL,
-             original file path, metadata, etc.
-
-     Returns:
-         file_name (str): The original file name extracted from the file information.
+     This class provides a minimal interface for plugin developers to implement
+     their own export logic.
      """
-     return files['file_name_original']
-
-
- def save_original_file(run, result, base_path, error_file_list):
-     """Saves the original file.
-
-     Args:
-         run : Execution object
-         result (dict): API response data containing file information.
-         base_path (Path): The directory where the file will be saved.
-         error_file_list (list): A list to store error files.
-     """
-     file_url = result['files']['url']
-     file_name = get_original_file_name(result['files'])
-     response = requests.get(file_url)
-     file_info = {'file_name': file_name}
-     error_msg = ''
-     try:
-         with (base_path / file_name).open('wb') as file:
-             file.write(response.content)
-         status = ExportStatus.SUCCESS
-     except Exception as e:
-         error_msg = str(e)
-         error_file_list.append([file_name, error_msg])
-         status = ExportStatus.FAILED
-
-     run.export_log_original_file(result['id'], file_info, status, error_msg)
-     return status
-
-
- def save_as_json(run, result, base_path, error_file_list):
-     """Saves the data as a JSON file.
-
-     Args:
-         run : Execution object
-         result (dict): API response data containing file information.
-         base_path (Path): The directory where the file will be saved.
-         error_file_list (list): A list to store error files.
-     """
-     # Default save file name: original file name
-     file_name = Path(get_original_file_name(result['files'])).stem
-     json_data = result['data']
-     file_info = {'file_name': f'{file_name}.json'}
-
-     if json_data is None:
-         error_msg = 'data is Null'
-         error_file_list.append([f'{file_name}.json', error_msg])
-         status = ExportStatus.FAILED
-         run.log_export_event('NULL_DATA_DETECTED', result['id'])
-         run.export_log_json_file(result['id'], file_info, status, error_msg)
-
-         return status
-
-     error_msg = ''
-     try:
-         with (base_path / f'{file_name}.json').open('w', encoding='utf-8') as f:
-             json.dump(json_data, f, indent=4, ensure_ascii=False)
-         status = ExportStatus.SUCCESS
-     except Exception as e:
-         error_msg = str(e)
-         error_file_list.append([f'{file_name}.json', str(e)])
-         status = ExportStatus.FAILED

-     run.export_log_json_file(result['id'], file_info, status, error_msg)
-     return status
+     def __init__(self, run, export_items: Generator, path_root: Path, **params):
+         """Initialize the plugin export action class.
+         Args:
+             run: Plugin run object with logging capabilities.
+             export_items (generator):
+                 - data (dict): dm_schema_data information.
+                 - files (dict): File information. Includes file URL, original file path, metadata, etc.
+                 - id (int): target ID (ex. assignment id, task id, ground_truth_event id)
+             path_root: pathlib object, the path to export
+             **params: Additional parameters
+                 - name (str): The name of the action.
+                 - description (str | None): The description of the action.
+                 - storage (int): The storage ID to save the exported data.
+                 - save_original_file (bool): Whether to save the original file.
+                 - path (str): The path to save the exported data.
+                 - target (str): The target source to export data from. (ex. ground_truth, assignment, task)
+                 - filter (dict): The filter criteria to apply.
+                 - extra_params (dict | None): Additional parameters for export customization.
+                     Example: {"include_metadata": True, "compression": "gzip"}
+                 - count (int): Total number of results.
+                 - results (list): List of results fetched through the list API.
+                 - project_id (int): Project ID.
+                 - configuration (dict): Project configuration.
+         """
+         super().__init__(run, export_items, path_root, **params)
+
+     def export(self, export_items=None, results=None, **kwargs) -> dict:
+         """Executes the export task using the base class implementation.
+
+         Args:
+             export_items: Optional export items to process. If not provided, uses self.export_items.
+             results: Optional results data to process alongside export_items.
+             **kwargs: Additional parameters for export customization.
+
+         Returns:
+             dict: Result
+         """
+         return super().export(export_items, results, **kwargs)
+
+     def convert_data(self, data):
+         """Converts the data."""
+         return data
+
+     def before_convert(self, data):
+         """Preprocesses the data before conversion."""
+         return data
+
+     def after_convert(self, data):
+         """Post-processes the data after conversion."""
+         return data
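
With this release the export template stops shipping a monolithic export() function: the scaffold now subclasses BaseExporter (defined in the new templates/plugin/__init__.py, +375 lines in this release) and exposes convert_data()/before_convert()/after_convert() hooks, while path setup, progress, metrics, and error-list handling presumably move into the base class. A minimal sketch of a filled-in template, assuming only what the scaffold above shows (each export item is a dict carrying 'data', 'files', and 'id'; each hook receives and returns one item):

    # Hypothetical plugin/export.py after a developer fills in the scaffold.
    from . import BaseExporter


    class Exporter(BaseExporter):
        def convert_data(self, data):
            # Reshape the item's dm_schema_data payload into the target format;
            # the scaffold version of this hook returns the item unchanged.
            data['data'] = {'annotations': data.get('data') or {}}
            return data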
@@ -1,3 +1,4 @@
+ import asyncio
  import json
  import os
  import shutil
@@ -5,7 +6,7 @@ from datetime import datetime
  from enum import Enum
  from io import BytesIO
  from pathlib import Path
- from typing import Annotated, Any, Dict, List, Optional
+ from typing import Annotated, Any, Awaitable, Dict, List, Optional, TypeVar

  from openpyxl import load_workbook
  from openpyxl.utils.exceptions import InvalidFileException
@@ -25,6 +26,9 @@ from synapse_sdk.shared.enums import Context
  from synapse_sdk.utils.pydantic.validators import non_blank
  from synapse_sdk.utils.storage import get_pathlib

+ # Type variable for generic async return type
+ T = TypeVar('T')
+

  class PathAwareJSONEncoder(json.JSONEncoder):
      """Custom JSON encoder that handles Path-like objects."""
@@ -178,7 +182,7 @@ class UploadRun(Run):
          },
          # Debug information - only for EventLog
          'EXCEL_FILE_VALIDATION_STARTED': {
-             'message': 'Excel file validation started: {}',
+             'message': 'Excel file validation started',
              'level': Context.INFO,
          },
          'EXCEL_WORKBOOK_LOADED': {
@@ -186,7 +190,7 @@ class UploadRun(Run):
              'level': Context.INFO,
          },
          'FILE_ORGANIZATION_STARTED': {
-             'message': 'File organization started for directory: {}',
+             'message': 'File organization started',
              'level': Context.INFO,
          },
          'BATCH_PROCESSING_STARTED': {
@@ -202,7 +206,7 @@ class UploadRun(Run):
              'level': Context.INFO,
          },
          'EXCEL_FILE_NOT_FOUND_PATH': {
-             'message': 'Excel metadata file not found: {}',
+             'message': 'Excel metadata file not found',
              'level': Context.WARNING,
          },
          'EXCEL_SECURITY_VALIDATION_FAILED': {
@@ -394,6 +398,9 @@ class ExcelMetadataUtils:
  class UploadParams(BaseModel):
      """Upload action parameters.

+     This class defines all configuration parameters for the upload action, including
+     advanced asynchronous upload capabilities for improved performance.
+
      Args:
          name (str): The name of the action.
          description (str | None): The description of the action.
@@ -405,6 +412,17 @@ class UploadParams(BaseModel):
          excel_metadata_path (str | None): Path to excel file containing metadata.
              Defaults to 'meta.xlsx' or 'meta.xls' in the path directory.
          is_recursive (bool): Enable recursive file discovery in subdirectories. Defaults to False.
+         use_async_upload (bool): Enable asynchronous upload data file processing for improved performance.
+             Defaults to True.
+         max_file_size_mb (int): The maximum file size not using chunked_upload in MB. Defaults to 50MB.
+         creating_data_unit_batch_size (int): The batch size for creating data units. Defaults to 100.
+         extra_params (dict | None): Extra parameters for the action.
+             Example: {"include_metadata": True, "compression": "gzip"}
+
+     Note:
+         Async upload requires plugin developers to implement handle_upload_files_async()
+         method for maximum benefit. Default implementation provides compatibility
+         but limited performance improvement.
      """

      name: Annotated[str, AfterValidator(non_blank)]
@@ -416,7 +434,9 @@ class UploadParams(BaseModel):
      excel_metadata_path: str | None = None
      is_recursive: bool = False
      max_file_size_mb: int = 50
-     creating_data_unit_batch_size: int = 100
+     creating_data_unit_batch_size: int = 1
+     use_async_upload: bool = True
+     extra_params: dict | None = None

      @field_validator('storage', mode='before')
      @classmethod
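
Taken together, the new fields let a caller opt in to the async path per upload. A hypothetical parameter payload (a plain dict mirroring the UploadParams fields above; the names come from this diff, the values are illustrative, and data_collection is the ID the action reads elsewhere):

    params = {
        'name': 'nightly-upload',
        'storage': 3,
        'path': 'incoming/batch-01',
        'data_collection': 12,
        'is_recursive': True,
        'max_file_size_mb': 50,              # above this, chunked upload is used
        'creating_data_unit_batch_size': 1,  # new default is 1; the docstring above still says 100
        'use_async_upload': True,            # new flag: routes through _upload_files_async()
        'extra_params': {'include_metadata': True, 'compression': 'gzip'},
    }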
@@ -540,8 +560,6 @@ class UploadAction(Action):
          analyze_collection: The progress category for the analyze collection process.
          data_file_upload: The progress category for the upload process.
          generate_data_units: The progress category for the generate data units process.
-         generate_tasks: The progress category for the generate tasks process.
-         generate_ground_truths: The progress category for the generate ground truths process.

      Metrics Categories:
          data_file: The metrics category for the data file.
@@ -581,9 +599,11 @@ class UploadAction(Action):
          self.excel_config = ExcelSecurityConfig()
          self.excel_utils = ExcelMetadataUtils(self.excel_config)

-     def get_uploader(self, path, file_specification, organized_files):
+     def get_uploader(self, path, file_specification, organized_files, params: Dict = None):
          """Get uploader from entrypoint."""
-         return self.entrypoint(self.run, path, file_specification, organized_files)
+         return self.entrypoint(
+             self.run, path, file_specification, organized_files, extra_params=params.get('extra_params')
+         )

      def _discover_files_recursive(self, dir_path: Path) -> List[Path]:
          """Discover files recursively in a directory."""
@@ -797,10 +817,11 @@ class UploadAction(Action):
          excel_path = None

          # Check if user provided a specific excel_metadata_path
-         if self.params.get('excel_metadata_path'):
-             excel_path = pathlib_cwd / self.params['excel_metadata_path']
+         excel_metadata_path = self.params.get('excel_metadata_path')
+         if excel_metadata_path:
+             excel_path = pathlib_cwd / excel_metadata_path
              if not excel_path.exists():
-                 self.run.log_message_with_code('EXCEL_FILE_NOT_FOUND_PATH', str(excel_path))
+                 self.run.log_message_with_code('EXCEL_FILE_NOT_FOUND_PATH')
                  return {}
          else:
              # Look for default meta.xlsx or meta.xls
@@ -810,7 +831,7 @@ class UploadAction(Action):
                  return {}

          try:
-             self.run.log_message_with_code('EXCEL_FILE_VALIDATION_STARTED', str(excel_path))
+             self.run.log_message_with_code('EXCEL_FILE_VALIDATION_STARTED')

              # Prepare Excel file with security validation
              excel_stream = self._prepare_excel_file(excel_path)
@@ -852,8 +873,15 @@ class UploadAction(Action):
          result: Dict[str, Any] = {}

          # Setup path object with path and storage.
-         storage = self.client.get_storage(self.params['storage'])
-         pathlib_cwd = get_pathlib(storage, self.params['path'])
+         storage_id = self.params.get('storage')
+         if storage_id is None:
+             raise ActionError('Storage parameter is required')
+         storage = self.client.get_storage(storage_id)
+
+         path = self.params.get('path')
+         if path is None:
+             raise ActionError('Path parameter is required')
+         pathlib_cwd = get_pathlib(storage, path)

          # Read excel metadata if configured or default file exists
          excel_metadata: Dict[str, Dict[str, Any]] = {}
@@ -881,7 +909,7 @@ class UploadAction(Action):
          organized_files = self._organize_files(pathlib_cwd, file_specification_template, excel_metadata)

          # Initialize uploader.
-         uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files)
+         uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files, self.params)

          # Get organized files from the uploader (plugin developer's custom implementation)
          # or use the default organization method if uploader doesn't provide valid files
@@ -896,7 +924,11 @@ class UploadAction(Action):
          if not organized_files:
              self.run.log_message_with_code('NO_FILES_FOUND')
              raise ActionError('Upload is aborted due to missing files.')
-         uploaded_files = self._upload_files(organized_files)
+         # Choose upload method based on async parameter
+         if self.params.get('use_async_upload', True):
+             uploaded_files = self.run_async(self._upload_files_async(organized_files, 10))
+         else:
+             uploaded_files = self._upload_files(organized_files)
          result['uploaded_files_count'] = len(uploaded_files)

          # Generate data units for the uploaded data.
@@ -904,7 +936,7 @@ class UploadAction(Action):
              self.run.log_message_with_code('NO_FILES_UPLOADED')
              raise ActionError('Upload is aborted due to no uploaded files.')
          generated_data_units = self._generate_data_units(
-             uploaded_files, self.params.get('creating_data_unit_batch_size')
+             uploaded_files, self.params.get('creating_data_unit_batch_size', 1)
          )
          result['generated_data_units_count'] = len(generated_data_units)

@@ -929,7 +961,9 @@ class UploadAction(Action):
          # Initialize progress
          self.run.set_progress(0, 2, category='analyze_collection')

-         collection_id = self.params['data_collection']
+         collection_id = self.params.get('data_collection')
+         if collection_id is None:
+             raise ActionError('Data collection parameter is required')
          self.run.set_progress(1, 2, category='analyze_collection')

          collection = self.run.client.get_data_collection(collection_id)
@@ -949,7 +983,9 @@ class UploadAction(Action):
          self.run.log_message_with_code('UPLOADING_DATA_FILES')

          client = self.run.client
-         collection_id = self.params['data_collection']
+         collection_id = self.params.get('data_collection')
+         if collection_id is None:
+             raise ActionError('Data collection parameter is required')
          upload_result = []
          current_progress = 0
          success_count = 0
@@ -980,6 +1016,129 @@ class UploadAction(Action):

          return upload_result

+     def run_async(self, coro: Awaitable[T]) -> T:
+         """Run async coroutine safely using asyncio.run().
+
+         This method properly manages event loop lifecycle and prevents
+         resource exhaustion from repeated event loop creation.
+
+         Args:
+             coro: The coroutine to execute
+
+         Returns:
+             The result of the coroutine execution
+
+         Raises:
+             RuntimeError: If called from within an existing event loop
+         """
+         import concurrent.futures
+
+         def _run_in_thread():
+             """Run the coroutine in a separate thread to avoid event loop conflicts."""
+             return asyncio.run(coro)
+
+         # Check if we're already in an event loop
+         try:
+             # If this doesn't raise, we're in an event loop
+             asyncio.get_running_loop()
+             # Run in thread pool to avoid "RuntimeError: cannot be called from a running event loop"
+             with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                 future = executor.submit(_run_in_thread)
+                 return future.result()
+         except RuntimeError:
+             # No event loop running, safe to use asyncio.run directly
+             return asyncio.run(coro)
+
+     async def _upload_files_async(
+         self, organized_files: List[Dict[str, Any]], max_concurrent: int = 10
+     ) -> List[Dict[str, Any]]:
+         """Upload files to synapse-backend asynchronously with concurrency control."""
+         # Initialize progress
+         organized_files_count = len(organized_files)
+         self.run.set_progress(0, organized_files_count, category='upload_data_files')
+         self.run.log_message_with_code('UPLOADING_DATA_FILES')
+
+         client = self.run.client
+         collection_id = self.params.get('data_collection')
+         if collection_id is None:
+             raise ActionError('Data collection parameter is required')
+         upload_result = []
+         success_count = 0
+         failed_count = 0
+
+         # Initialize metrics
+         self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
+
+         # Control concurrency with semaphore
+         semaphore = asyncio.Semaphore(max_concurrent)
+
+         async def upload_single_file(organized_file):
+             async with semaphore:
+                 loop = asyncio.get_event_loop()
+                 try:
+                     # Determine if chunked upload should be used based on file size
+                     use_chunked_upload = self._requires_chunked_upload(organized_file)
+                     # Run sync upload_data_file in thread pool
+                     uploaded_data_file = await loop.run_in_executor(
+                         None, lambda: client.upload_data_file(organized_file, collection_id, use_chunked_upload)
+                     )
+                     self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
+                     return {'status': 'success', 'result': uploaded_data_file}
+                 except ClientError as e:
+                     # Handle API client errors (network, authentication, server errors)
+                     self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                     self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Client error: {str(e)}')
+                     return {'status': 'failed', 'error': str(e), 'error_type': 'client_error', 'retryable': True}
+                 except (OSError, IOError) as e:
+                     # Handle file system errors (file not found, permissions, disk full)
+                     self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                     self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'File system error: {str(e)}')
+                     return {'status': 'failed', 'error': str(e), 'error_type': 'file_error', 'retryable': False}
+                 except MemoryError as e:
+                     # Handle out of memory errors (large files)
+                     self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                     self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Memory error (file too large): {str(e)}')
+                     return {'status': 'failed', 'error': str(e), 'error_type': 'memory_error', 'retryable': False}
+                 except asyncio.TimeoutError as e:
+                     # Handle timeout errors (slow network, large files)
+                     self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                     self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Upload timeout: {str(e)}')
+                     return {'status': 'failed', 'error': str(e), 'error_type': 'timeout_error', 'retryable': True}
+                 except ValueError as e:
+                     # Handle data validation errors (invalid file format, metadata issues)
+                     self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                     self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Data validation error: {str(e)}')
+                     return {'status': 'failed', 'error': str(e), 'error_type': 'validation_error', 'retryable': False}
+                 except Exception as e:
+                     # Handle any remaining unexpected errors
+                     self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                     self.run.log_message_with_code('FILE_UPLOAD_FAILED', f'Unexpected error: {str(e)}')
+                     return {'status': 'failed', 'error': str(e), 'error_type': 'unknown_error', 'retryable': False}
+
+         # Create tasks for all files
+         tasks = [upload_single_file(organized_file) for organized_file in organized_files]
+
+         # Process files with progress updates
+         current_progress = 0
+         for completed_task in asyncio.as_completed(tasks):
+             result = await completed_task
+             current_progress += 1
+
+             if result['status'] == 'success':
+                 success_count += 1
+                 upload_result.append(result['result'])
+             else:
+                 failed_count += 1
+
+             # Update metrics and progress
+             self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
+             self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')
+
+         # Finish progress
+         self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
+
+         return upload_result
+
      def _generate_data_units(self, uploaded_files: List[Dict[str, Any]], batch_size: int) -> List[Dict[str, Any]]:
          """Generate data units for the uploaded data.

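
The run_async() helper added above encodes a general pattern worth noting: asyncio.run() raises "cannot be called from a running event loop" when a loop is already active (for example in a notebook or inside another async framework), so in that case the coroutine is handed to a fresh loop on a worker thread instead. A self-contained sketch of the same pattern, independent of the SDK:

    import asyncio
    import concurrent.futures


    def run_async(coro):
        """Run `coro` to completion whether or not a loop is already running."""
        try:
            asyncio.get_running_loop()  # raises RuntimeError if no loop is running
        except RuntimeError:
            return asyncio.run(coro)  # no active loop: run directly
        # A loop is running here: hand the coroutine to a fresh loop on a worker thread.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
            return executor.submit(asyncio.run, coro).result()


    async def demo():
        await asyncio.sleep(0.1)
        return 'done'


    print(run_async(demo()))  # prints 'done' from plain sync code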
@@ -1073,14 +1232,14 @@ class UploadAction(Action):
              return organized_files

          self.run.log_message_with_code('TYPE_STRUCTURE_DETECTED')
-         self.run.log_message_with_code('FILE_ORGANIZATION_STARTED', str(directory))
+         self.run.log_message_with_code('FILE_ORGANIZATION_STARTED')

          # Collect and process files in a single pass
          dataset_files = {}
          required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]

          # Get recursive setting from params
-         is_recursive = self.params.get('is_recursive', False)
+         is_recursive = self.params.get('is_recursive', True)

          # Process all files from all type directories
          for spec_name, dir_path in type_dirs.items():
@@ -3,5 +3,27 @@ actions:
      entrypoint: plugin.upload.Uploader
      options:
        supported_data_type: image # A primary data type of synapse backend collection. (e.g. 'image', 'text', 'video', 'pcd', 'audio')
-       ui_schema: |
-         Dumped FormKit Schema for upload plugin custom options
+       ui_schema: # UI schema for the input of extra params
+         - $formkit: "radio"
+           name: "file_format"
+           label: "File Format"
+           help: "Select the file format for upload processing"
+           required: false
+           value: "original"
+           options:
+             - label: "Keep Original"
+               value: "original"
+             - label: "Convert to JPEG"
+               value: "jpeg"
+             - label: "Convert to PNG"
+               value: "png"
+         - $formkit: "checkbox"
+           name: "include_metadata"
+           label: "Include Metadata"
+           help: "Include file metadata during upload"
+           value: true
+         - $formkit: "text"
+           name: "custom_tag"
+           label: "Custom Tag"
+           help: "Add a custom tag to uploaded files"
+           placeholder: "Enter custom tag"
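
When the frontend renders this FormKit schema, the submitted values presumably land in the action's extra_params keyed by each field's name, for example (hypothetical values):

    extra_params = {
        'file_format': 'jpeg',        # 'original', 'jpeg', or 'png'
        'include_metadata': True,
        'custom_tag': 'batch-2024-q3',
    }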
@@ -1,5 +1,5 @@
  from pathlib import Path
- from typing import List
+ from typing import Dict, List


  class Uploader:
@@ -9,7 +9,9 @@ class Uploader:
      their own file organization logic.
      """

-     def __init__(self, run, path: Path, file_specification: List = None, organized_files: List = None):
+     def __init__(
+         self, run, path: Path, file_specification: List = None, organized_files: List = None, extra_params: Dict = None
+     ):
          """Initialize the plugin upload action class.

          Args:
@@ -17,11 +19,15 @@ class Uploader:
              path: Path object pointing to the upload target directory.
              file_specification: List of specifications that define the structure of files to be uploaded.
                  Each specification contains details like file name, type, and requirements.
+             organized_files: List of pre-organized files based on the default logic.
+                 Each item is a dictionary with 'files' and 'meta' keys.
+             extra_params: Additional parameters for customization.
          """
          self.run = run
          self.path = path
          self.file_specification = file_specification
          self.organized_files = organized_files
+         self.extra_params = extra_params

      def handle_upload_files(self) -> List:
          """Customize the organization of files for upload.