synapse-sdk 2025.9.1-py3-none-any.whl → 2025.9.4-py3-none-any.whl

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release.


This version of synapse-sdk might be problematic.

Files changed (81)
  1. synapse_sdk/devtools/docs/docs/api/clients/annotation-mixin.md +378 -0
  2. synapse_sdk/devtools/docs/docs/api/clients/backend.md +368 -1
  3. synapse_sdk/devtools/docs/docs/api/clients/core-mixin.md +477 -0
  4. synapse_sdk/devtools/docs/docs/api/clients/data-collection-mixin.md +422 -0
  5. synapse_sdk/devtools/docs/docs/api/clients/hitl-mixin.md +554 -0
  6. synapse_sdk/devtools/docs/docs/api/clients/index.md +391 -0
  7. synapse_sdk/devtools/docs/docs/api/clients/integration-mixin.md +571 -0
  8. synapse_sdk/devtools/docs/docs/api/clients/ml-mixin.md +578 -0
  9. synapse_sdk/devtools/docs/docs/plugins/developing-upload-template.md +1463 -0
  10. synapse_sdk/devtools/docs/docs/plugins/export-plugins.md +161 -34
  11. synapse_sdk/devtools/docs/docs/plugins/upload-plugins.md +1497 -213
  12. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/annotation-mixin.md +289 -0
  13. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/backend.md +378 -11
  14. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/core-mixin.md +417 -0
  15. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/data-collection-mixin.md +356 -0
  16. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/hitl-mixin.md +192 -0
  17. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/index.md +391 -0
  18. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/integration-mixin.md +479 -0
  19. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/ml-mixin.md +284 -0
  20. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/developing-upload-template.md +1463 -0
  21. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/export-plugins.md +161 -34
  22. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/upload-plugins.md +1752 -572
  23. synapse_sdk/devtools/docs/sidebars.ts +7 -0
  24. synapse_sdk/plugins/README.md +1 -2
  25. synapse_sdk/plugins/categories/base.py +7 -0
  26. synapse_sdk/plugins/categories/export/actions/__init__.py +3 -0
  27. synapse_sdk/plugins/categories/export/actions/export/__init__.py +28 -0
  28. synapse_sdk/plugins/categories/export/actions/export/action.py +160 -0
  29. synapse_sdk/plugins/categories/export/actions/export/enums.py +113 -0
  30. synapse_sdk/plugins/categories/export/actions/export/exceptions.py +53 -0
  31. synapse_sdk/plugins/categories/export/actions/export/models.py +74 -0
  32. synapse_sdk/plugins/categories/export/actions/export/run.py +195 -0
  33. synapse_sdk/plugins/categories/export/actions/export/utils.py +187 -0
  34. synapse_sdk/plugins/categories/export/templates/plugin/__init__.py +1 -1
  35. synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +1 -2
  36. synapse_sdk/plugins/categories/upload/actions/upload/action.py +154 -531
  37. synapse_sdk/plugins/categories/upload/actions/upload/context.py +185 -0
  38. synapse_sdk/plugins/categories/upload/actions/upload/factory.py +143 -0
  39. synapse_sdk/plugins/categories/upload/actions/upload/models.py +66 -29
  40. synapse_sdk/plugins/categories/upload/actions/upload/orchestrator.py +182 -0
  41. synapse_sdk/plugins/categories/upload/actions/upload/registry.py +113 -0
  42. synapse_sdk/plugins/categories/upload/actions/upload/steps/__init__.py +1 -0
  43. synapse_sdk/plugins/categories/upload/actions/upload/steps/base.py +106 -0
  44. synapse_sdk/plugins/categories/upload/actions/upload/steps/cleanup.py +62 -0
  45. synapse_sdk/plugins/categories/upload/actions/upload/steps/collection.py +62 -0
  46. synapse_sdk/plugins/categories/upload/actions/upload/steps/generate.py +80 -0
  47. synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py +66 -0
  48. synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py +101 -0
  49. synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py +89 -0
  50. synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py +96 -0
  51. synapse_sdk/plugins/categories/upload/actions/upload/steps/validate.py +61 -0
  52. synapse_sdk/plugins/categories/upload/actions/upload/strategies/__init__.py +1 -0
  53. synapse_sdk/plugins/categories/upload/actions/upload/strategies/base.py +86 -0
  54. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/__init__.py +1 -0
  55. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/batch.py +39 -0
  56. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/single.py +34 -0
  57. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/__init__.py +1 -0
  58. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py +233 -0
  59. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py +253 -0
  60. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/__init__.py +1 -0
  61. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/excel.py +174 -0
  62. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/none.py +16 -0
  63. synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/__init__.py +1 -0
  64. synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/async_upload.py +109 -0
  65. synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/sync.py +43 -0
  66. synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/__init__.py +1 -0
  67. synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py +45 -0
  68. synapse_sdk/plugins/categories/upload/actions/upload/utils.py +194 -83
  69. synapse_sdk/plugins/categories/upload/templates/config.yaml +4 -0
  70. synapse_sdk/plugins/categories/upload/templates/plugin/__init__.py +269 -0
  71. synapse_sdk/plugins/categories/upload/templates/plugin/upload.py +71 -27
  72. synapse_sdk/plugins/models.py +7 -0
  73. synapse_sdk/shared/__init__.py +21 -0
  74. {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.4.dist-info}/METADATA +2 -1
  75. {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.4.dist-info}/RECORD +79 -28
  76. synapse_sdk/plugins/categories/export/actions/export.py +0 -385
  77. synapse_sdk/plugins/categories/export/enums.py +0 -7
  78. {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.4.dist-info}/WHEEL +0 -0
  79. {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.4.dist-info}/entry_points.txt +0 -0
  80. {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.4.dist-info}/licenses/LICENSE +0 -0
  81. {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.4.dist-info}/top_level.txt +0 -0
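
The bulk of this diff is a rewrite of the upload action: the monolithic `UploadAction.start()` from 2025.9.1 (file discovery, Excel metadata parsing, sync/async upload, and data unit generation all inline) is split into pluggable strategies plus an ordered sequence of workflow steps driven by a registry. As a rough mental model of that pattern only, not the SDK's actual classes, here is a minimal sketch; all names and signatures in it are illustrative assumptions:

# Minimal step-registry workflow sketch (illustrative only; the real
# synapse-sdk classes add progress weights, strategy injection, and rollback).
from dataclasses import dataclass, field
from typing import Any, Dict, List


@dataclass
class StepResult:
    success: bool
    error: str = ''
    data: Dict[str, Any] = field(default_factory=dict)


class Step:
    name = 'step'

    def execute(self, context: Dict[str, Any]) -> Dict[str, Any]:
        raise NotImplementedError

    def safe_execute(self, context: Dict[str, Any]) -> StepResult:
        # Convert any step failure into a result object instead of
        # crashing the loop that drives the workflow.
        try:
            return StepResult(success=True, data=self.execute(context))
        except Exception as e:
            return StepResult(success=False, error=str(e))


class ValidateFilesStep(Step):
    name = 'validate_files'

    def execute(self, context):
        if not context.get('organized_files'):
            raise ValueError('no files to validate')
        return {'validated_count': len(context['organized_files'])}


class StepRegistry:
    def __init__(self) -> None:
        self._steps: List[Step] = []

    def register(self, step: Step) -> None:
        self._steps.append(step)  # execution order == registration order

    def get_steps(self) -> List[Step]:
        return list(self._steps)


def run_workflow(registry: StepRegistry, context: Dict[str, Any]) -> Dict[str, Any]:
    # Each step reads from and writes to the shared context dict.
    for step in registry.get_steps():
        result = step.safe_execute(context)
        if not result.success:
            raise RuntimeError(f"Step '{step.name}' failed: {result.error}")
        context.update(result.data)
    return context


registry = StepRegistry()
registry.register(ValidateFilesStep())
print(run_workflow(registry, {'organized_files': [{'files': {}, 'meta': {}}]}))

The payoff of this shape, visible in the diff below, is that `start()` shrinks to context setup plus orchestration, and behaviors such as sync vs. async upload become strategy objects selected at runtime rather than branches inside one method.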
@@ -1,49 +1,41 @@
- import asyncio
- import os
- import shutil
- from datetime import datetime
- from io import BytesIO
- from pathlib import Path
- from typing import Any, Awaitable, Dict, List, Optional, TypeVar
-
- from openpyxl import load_workbook
- from openpyxl.utils.exceptions import InvalidFileException
-
- from synapse_sdk.clients.exceptions import ClientError
- from synapse_sdk.clients.utils import get_batched_list
- from synapse_sdk.clients.validators.collections import FileSpecificationValidator
+ from typing import Any, Dict
+
  from synapse_sdk.plugins.categories.base import Action
  from synapse_sdk.plugins.categories.decorators import register_action
- from synapse_sdk.plugins.categories.upload.actions.upload.models import UploadParams
  from synapse_sdk.plugins.enums import PluginCategory, RunMethod
  from synapse_sdk.plugins.exceptions import ActionError

- from .enums import LogCode, UploadStatus
- from .exceptions import ExcelParsingError, ExcelSecurityError
+ from .context import UploadContext
+ from .factory import StrategyFactory
+ from .models import UploadParams
+ from .orchestrator import UploadOrchestrator
+ from .registry import StepRegistry
  from .run import UploadRun
- from .utils import ExcelMetadataUtils, ExcelSecurityConfig
-
- T = TypeVar('T')
+ from .steps.cleanup import CleanupStep
+ from .steps.collection import AnalyzeCollectionStep
+ from .steps.generate import GenerateDataUnitsStep
+ from .steps.initialize import InitializeStep
+ from .steps.metadata import ProcessMetadataStep
+ from .steps.organize import OrganizeFilesStep
+ from .steps.upload import UploadFilesStep
+ from .steps.validate import ValidateFilesStep
+ from .utils import ExcelSecurityConfig


  @register_action
  class UploadAction(Action):
-     """Main upload action for processing and uploading files to storage.
+     """Upload action for processing and uploading files to storage.

-     Handles file upload operations including validation, file discovery,
-     Excel metadata processing, and data unit generation. Supports both
-     synchronous and asynchronous upload processing with comprehensive
-     progress tracking and error handling.
+     This implementation uses Strategy and Facade patterns to provide a clean,
+     extensible architecture for upload operations. The monolithic legacy
+     implementation has been refactored into pluggable strategies and workflow steps.

      Features:
-     - Recursive directory scanning
-     - Excel metadata validation and processing
-     - Batch processing for large file sets
-     - Type-based file organization
-     - Security validation for Excel files
-     - Progress tracking with detailed metrics
-     - Comprehensive error logging
+     - Strategy pattern for pluggable behaviors (validation, file discovery, etc.)
+     - Facade pattern with UploadOrchestrator for simplified workflow management
+     - Step-based workflow with automatic rollback on failures
+     - Comprehensive error handling and progress tracking
+     - Easy extensibility for new strategies and workflow steps

      Class Attributes:
          name (str): Action identifier ('upload')
@@ -60,11 +52,11 @@ class UploadAction(Action):
          ... 'name': 'Data Upload',
          ... 'path': '/data/files',
          ... 'storage': 1,
-         ... 'collection': 5
+         ... 'data_collection': 5
          ... },
          ... plugin_config=config
          ... )
-         >>> result = action.run_action()
+         >>> result = action.start()
      """

      name = 'upload'
@@ -97,527 +89,158 @@ class UploadAction(Action):
      }

      def __init__(self, *args, **kwargs):
+         """Initialize the upload action."""
          super().__init__(*args, **kwargs)
-         self.excel_config = ExcelSecurityConfig()
-         self.excel_utils = ExcelMetadataUtils(self.excel_config)
-
-     def get_uploader(self, path, file_specification, organized_files, params: Dict = {}):
-         """Get uploader from entrypoint."""
-         return self.entrypoint(
-             self.run, path, file_specification, organized_files, extra_params=params.get('extra_params')
-         )
-
-     def _discover_files_recursive(self, dir_path: Path) -> List[Path]:
-         return [file_path for file_path in dir_path.rglob('*') if file_path.is_file()]
-
-     def _discover_files_non_recursive(self, dir_path: Path) -> List[Path]:
-         return [file_path for file_path in dir_path.glob('*') if file_path.is_file()]
-
-     def _validate_excel_security(self, excel_path: Path) -> None:
-         file_size = excel_path.stat().st_size
-         if file_size > self.excel_config.MAX_FILE_SIZE_BYTES:
-             raise ExcelSecurityError(
-                 f'Excel file too large: {file_size} bytes (max: {self.excel_config.MAX_FILE_SIZE_BYTES})'
-             )
-
-         estimated_memory = file_size * 3
-         if estimated_memory > self.excel_config.MAX_MEMORY_USAGE_BYTES:
-             raise ExcelSecurityError(
-                 f'Excel file may consume too much memory: ~{estimated_memory} bytes '
-                 f'(max: {self.excel_config.MAX_MEMORY_USAGE_BYTES})'
-             )
-
-     def _prepare_excel_file(self, excel_path: Path) -> BytesIO:
-         self._validate_excel_security(excel_path)
-         excel_bytes = excel_path.read_bytes()
-         return BytesIO(excel_bytes)
-
-     def _process_excel_headers(self, headers: tuple) -> tuple:
-         if len(headers) < 2:
-             raise ExcelParsingError('Excel file must have at least 2 columns (file name and metadata)')
-         self._validate_excel_content(headers, 0)
-         return headers
-
-     def _process_excel_data_row(self, row: tuple, headers: tuple) -> Optional[Dict[str, Any]]:
-         if not row[0] or str(row[0]).strip() == '':
-             return None
-
-         file_name = str(row[0]).strip()
-         if not self.excel_utils.is_valid_filename_length(file_name):
-             self.run.log_message_with_code(LogCode.FILENAME_TOO_LONG, file_name[:50])
-             return None
-
-         file_metadata: Dict[str, Any] = {}
-         for i, value in enumerate(row[1:], start=1):
-             if value is not None and i < len(headers):
-                 header_value = headers[i]
-                 column_name = str(header_value).strip() if header_value is not None else f'column_{i}'
-
-                 column_name = self.excel_utils.validate_and_truncate_string(
-                     column_name, self.excel_config.MAX_COLUMN_NAME_LENGTH
-                 )
-                 str_value = self.excel_utils.validate_and_truncate_string(
-                     str(value), self.excel_config.MAX_METADATA_VALUE_LENGTH
-                 )
-                 file_metadata[column_name] = str_value
-
-         return {file_name: file_metadata} if file_metadata else None
-
-     def _process_excel_worksheet(self, worksheet) -> Dict[str, Dict[str, Any]]:
-         if worksheet is None:
-             raise ExcelParsingError('Excel file has no active worksheet')
-
-         metadata_dict: Dict[str, Dict[str, Any]] = {}
-         headers: Optional[tuple] = None
-         data_row_count = 0
-         validation_interval = getattr(self.excel_config, 'VALIDATION_CHECK_INTERVAL', 1000)
-
-         for row_idx, row in enumerate(worksheet.iter_rows(values_only=True)):
-             if not row or all(cell is None or str(cell).strip() == '' for cell in row):
-                 continue
-
-             if row_idx == 0:
-                 headers = self._process_excel_headers(row)
-                 continue
-
-             if headers is None:
-                 raise ExcelParsingError('Excel file missing header row')
-
-             data_row_count += 1
-
-             if data_row_count % validation_interval == 0:
-                 self._validate_excel_content(headers, data_row_count)
-
-             row_result = self._process_excel_data_row(row, headers)
-             if row_result:
-                 metadata_dict.update(row_result)
-
-         self._validate_excel_content(headers or (), data_row_count)
-
-         return metadata_dict
-
-     def _validate_excel_content(self, headers: tuple, row_count: int) -> None:
-         if len(headers) > self.excel_config.MAX_COLUMNS:
-             raise ExcelParsingError(f'Too many columns: {len(headers)} (max: {self.excel_config.MAX_COLUMNS})')
-
-         if row_count > self.excel_config.MAX_ROWS:
-             raise ExcelParsingError(f'Too many rows: {row_count} (max: {self.excel_config.MAX_ROWS})')
-
-     def _find_excel_metadata_file(self, pathlib_cwd: Path) -> Optional[Path]:
-         for extension in ['.xlsx', '.xls']:
-             excel_path = pathlib_cwd / f'meta{extension}'
-             if excel_path.exists() and excel_path.is_file():
-                 return excel_path
-         return None
-
-     def _read_excel_metadata(self, pathlib_cwd: Path) -> Dict[str, Dict[str, Any]]:
-         excel_path = None
-
-         excel_metadata_path = self.params.get('excel_metadata_path')
-         if excel_metadata_path:
-             excel_path = pathlib_cwd / excel_metadata_path
-             if not excel_path.exists():
-                 self.run.log_message_with_code(LogCode.EXCEL_FILE_NOT_FOUND_PATH)
-                 return {}
-         else:
-             excel_path = self._find_excel_metadata_file(pathlib_cwd)
-             if not excel_path:
-                 return {}

-         try:
-             self.run.log_message_with_code(LogCode.EXCEL_FILE_VALIDATION_STARTED)
-
-             excel_stream = self._prepare_excel_file(excel_path)
-
-             workbook = load_workbook(excel_stream, read_only=True, data_only=True)
-             try:
-                 self.run.log_message_with_code(LogCode.EXCEL_WORKBOOK_LOADED)
-                 return self._process_excel_worksheet(workbook.active)
-             finally:
-                 workbook.close()
-
-         except ExcelSecurityError as e:
-             self.run.log_message_with_code(LogCode.EXCEL_SECURITY_VALIDATION_FAILED, str(e))
-             raise
-         except ExcelParsingError as e:
-             self.run.log_message_with_code(LogCode.EXCEL_PARSING_FAILED, str(e))
-             raise
-         except InvalidFileException as e:
-             self.run.log_message_with_code(LogCode.EXCEL_INVALID_FILE_FORMAT, str(e))
-             raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
-         except MemoryError:
-             self.run.log_message_with_code(LogCode.EXCEL_FILE_TOO_LARGE)
-             raise ExcelSecurityError('Excel file exceeds memory limits')
-         except (OSError, IOError) as e:
-             self.run.log_message_with_code(LogCode.EXCEL_FILE_ACCESS_ERROR, str(e))
-             raise ExcelParsingError(f'File access error: {str(e)}')
-         except Exception as e:
-             self.run.log_message_with_code(LogCode.EXCEL_UNEXPECTED_ERROR, str(e))
-             raise ExcelParsingError(f'Unexpected error: {str(e)}')
+         # Initialize Excel configuration from config.yaml
+         self.excel_config = ExcelSecurityConfig.from_action_config(self.config)
+         self.strategy_factory = StrategyFactory()
+         self.step_registry = StepRegistry()
+         self._configure_workflow()
+
+     def _configure_workflow(self) -> None:
+         """Configure workflow steps based on parameters.
+
+         Registers all workflow steps in the correct order. Steps can be
+         dynamically added, removed, or reordered for different use cases.
+         """
+         # Register steps in execution order
+         self.step_registry.register(InitializeStep())
+         self.step_registry.register(ProcessMetadataStep())
+         self.step_registry.register(AnalyzeCollectionStep())
+         self.step_registry.register(OrganizeFilesStep())
+         self.step_registry.register(ValidateFilesStep())
+         self.step_registry.register(UploadFilesStep())
+         self.step_registry.register(GenerateDataUnitsStep())
+         self.step_registry.register(CleanupStep())

      def start(self) -> Dict[str, Any]:
-         result: Dict[str, Any] = {}
+         """Execute upload workflow with uploader integration.

-         storage_id = self.params.get('storage')
-         if storage_id is None:
-             raise ActionError('Storage parameter is required')
-         storage = self.client.get_storage(storage_id)
+         This method integrates the essential uploader mechanism with the new
+         strategy pattern architecture while maintaining backward compatibility.

-         path = self.params.get('path')
-         if path is None:
-             raise ActionError('Path parameter is required')
-         pathlib_cwd = get_pathlib(storage, path)
+         Returns:
+             Dict[str, Any]: Upload result with file counts, success status, and metrics

-         excel_metadata: Dict[str, Dict[str, Any]] = {}
+         Raises:
+             ActionError: If upload workflow fails
+         """
          try:
-             excel_metadata = self._read_excel_metadata(pathlib_cwd)
-             if excel_metadata:
-                 self.run.log_message_with_code(LogCode.EXCEL_METADATA_LOADED, len(excel_metadata))
-         except ExcelSecurityError as e:
-             self.run.log_message_with_code(LogCode.EXCEL_SECURITY_VIOLATION, str(e))
-             return result
-         except ExcelParsingError as e:
-             if self.params.get('excel_metadata_path'):
-                 self.run.log_message_with_code(LogCode.EXCEL_PARSING_ERROR, str(e))
-                 return result
-             else:
-                 self.run.log_message_with_code(LogCode.EXCEL_PARSING_ERROR, str(e))
-                 excel_metadata = {}
-
-         file_specification_template = self._analyze_collection()
-         organized_files = self._organize_files(pathlib_cwd, file_specification_template, excel_metadata)
-
-         uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files)
-
-         organized_files = uploader.handle_upload_files()
-
-         if not self._validate_organized_files(organized_files, file_specification_template):
-             self.run.log_message_with_code(LogCode.VALIDATION_FAILED)
-             raise ActionError('Upload is aborted due to validation errors.')
-
-         if not organized_files:
-             self.run.log_message_with_code(LogCode.NO_FILES_FOUND)
-             raise ActionError('Upload is aborted due to missing files.')
-
-         if self.params.get('use_async_upload', True):
-             uploaded_files = self.run_async(self._upload_files_async(organized_files, 10))
-         else:
-             uploaded_files = self._upload_files(organized_files)
-         result['uploaded_files_count'] = len(uploaded_files)
-
-         if not uploaded_files:
-             self.run.log_message_with_code(LogCode.NO_FILES_UPLOADED)
-             raise ActionError('Upload is aborted due to no uploaded files.')
-         generated_data_units = self._generate_data_units(
-             uploaded_files, self.params.get('creating_data_unit_batch_size', 1)
-         )
-         result['generated_data_units_count'] = len(generated_data_units)
+             # Ensure params is not None
+             params = self.params or {}

-         if not generated_data_units:
-             self.run.log_message_with_code(LogCode.NO_DATA_UNITS_GENERATED)
-             raise ActionError('Upload is aborted due to no generated data units.')
+             # Create upload context for sharing state between steps
+             context = UploadContext(params, self.run, self.client)

-         self._cleanup_temp_directory()
+             # Configure strategies based on parameters with context
+             strategies = self._configure_strategies(context)

-         self.run.log_message_with_code(LogCode.IMPORT_COMPLETED)
-         return result
+             # Create orchestrator but run it with uploader integration
+             orchestrator = UploadOrchestrator(context, self.step_registry, strategies)

-     def _analyze_collection(self) -> Dict[str, Any]:
-         self.run.set_progress(0, 2, category='analyze_collection')
+             # Execute the workflow steps, but intercept after organize step
+             result = self._execute_with_uploader_integration(orchestrator, context)

-         collection_id = self.params.get('data_collection')
-         if collection_id is None:
-             raise ActionError('Data collection parameter is required')
-         self.run.set_progress(1, 2, category='analyze_collection')
-
-         collection = self.run.client.get_data_collection(collection_id)
-         self.run.set_progress(2, 2, category='analyze_collection')
-
-         return collection['file_specifications']
-
-     def _upload_files(self, organized_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-         organized_files_count = len(organized_files)
-         self.run.set_progress(0, organized_files_count, category='upload_data_files')
-         self.run.log_message_with_code(LogCode.UPLOADING_DATA_FILES)
-
-         client = self.run.client
-         collection_id = self.params.get('data_collection')
-         if collection_id is None:
-             raise ActionError('Data collection parameter is required')
-         upload_result = []
-         current_progress = 0
-         success_count = 0
-         failed_count = 0
-
-         self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
-
-         for organized_file in organized_files:
-             try:
-                 use_chunked_upload = self._requires_chunked_upload(organized_file)
-                 uploaded_data_file = client.upload_data_file(organized_file, collection_id, use_chunked_upload)
-                 self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
-                 success_count += 1
-                 upload_result.append(uploaded_data_file)
-             except Exception as e:
-                 self.run.log_data_file(organized_file, UploadStatus.FAILED)
-                 self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, str(e))
-                 failed_count += 1
+             return result

-             current_progress += 1
-             self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
-             self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')
+         except Exception as e:
+             # Log the error and re-raise as ActionError
+             if self.run:
+                 self.run.log_message(f'Upload workflow failed: {str(e)}')
+             raise ActionError(f'Upload failed: {str(e)}')
+
+     def _execute_with_uploader_integration(self, orchestrator, context) -> Dict[str, Any]:
+         """Execute workflow with proper uploader integration."""
+         # Inject strategies into context before executing steps
+         orchestrator._inject_strategies_into_context()
+
+         # Run initial steps up to file organization
+         steps = orchestrator.step_registry.get_steps()
+
+         # Execute steps one by one until we reach the organization step
+         for i, step in enumerate(steps):
+             if step.name in ['initialize', 'process_metadata', 'analyze_collection', 'organize_files']:
+                 try:
+                     result = step.safe_execute(context)
+                     context.update(result)
+                     if not result.success:
+                         raise Exception(f"Step '{step.name}' failed: {result.error}")
+                 except Exception as e:
+                     raise ActionError(f"Failed at step '{step.name}': {str(e)}")

-         self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
+         # Get organized files from context
+         organized_files = context.get('organized_files', [])
+         file_specification_template = context.get('file_specification_template', {})
+         pathlib_cwd = context.get('pathlib_cwd')

-         return upload_result
+         if not organized_files or not file_specification_template or not pathlib_cwd:
+             raise ActionError('Required data not available from workflow steps')

-     def run_async(self, coro: Awaitable[T]) -> T:
-         import concurrent.futures
+         # CRITICAL: Integrate with existing uploader mechanism
+         uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files, self.params)
+         organized_files = uploader.handle_upload_files()

-         def _run_in_thread():
-             return asyncio.run(coro)
+         # Update context with processed files
+         context.set('organized_files', organized_files)

-         try:
-             asyncio.get_running_loop()
-             with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
-                 future = executor.submit(_run_in_thread)
-                 return future.result()
-         except RuntimeError:
-             return asyncio.run(coro)
-
-     async def _upload_files_async(
-         self, organized_files: List[Dict[str, Any]], max_concurrent: int = 10
-     ) -> List[Dict[str, Any]]:
-         organized_files_count = len(organized_files)
-         self.run.set_progress(0, organized_files_count, category='upload_data_files')
-         self.run.log_message_with_code(LogCode.UPLOADING_DATA_FILES)
-
-         client = self.run.client
-         collection_id = self.params.get('data_collection')
-         if collection_id is None:
-             raise ActionError('Data collection parameter is required')
-         upload_result = []
-         success_count = 0
-         failed_count = 0
-
-         self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
-
-         semaphore = asyncio.Semaphore(max_concurrent)
-
-         async def upload_single_file(organized_file):
-             async with semaphore:
-                 loop = asyncio.get_event_loop()
+         # Execute remaining steps
+         for step in steps:
+             if step.name in ['validate_files', 'upload_files', 'generate_data_units', 'cleanup']:
                  try:
-                     use_chunked_upload = self._requires_chunked_upload(organized_file)
-                     uploaded_data_file = await loop.run_in_executor(
-                         None, lambda: client.upload_data_file(organized_file, collection_id, use_chunked_upload)
-                     )
-                     self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
-                     return {'status': 'success', 'result': uploaded_data_file}
-                 except ClientError as e:
-                     self.run.log_data_file(organized_file, UploadStatus.FAILED)
-                     self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Client error: {str(e)}')
-                     return {'status': 'failed', 'error': str(e), 'error_type': 'client_error', 'retryable': True}
-                 except (OSError, IOError) as e:
-                     self.run.log_data_file(organized_file, UploadStatus.FAILED)
-                     self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'File system error: {str(e)}')
-                     return {'status': 'failed', 'error': str(e), 'error_type': 'file_error', 'retryable': False}
-                 except MemoryError as e:
-                     self.run.log_data_file(organized_file, UploadStatus.FAILED)
-                     self.run.log_message_with_code(
-                         LogCode.FILE_UPLOAD_FAILED, f'Memory error (file too large): {str(e)}'
-                     )
-                     return {'status': 'failed', 'error': str(e), 'error_type': 'memory_error', 'retryable': False}
-                 except asyncio.TimeoutError as e:
-                     self.run.log_data_file(organized_file, UploadStatus.FAILED)
-                     self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Upload timeout: {str(e)}')
-                     return {'status': 'failed', 'error': str(e), 'error_type': 'timeout_error', 'retryable': True}
-                 except ValueError as e:
-                     self.run.log_data_file(organized_file, UploadStatus.FAILED)
-                     self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Data validation error: {str(e)}')
-                     return {'status': 'failed', 'error': str(e), 'error_type': 'validation_error', 'retryable': False}
+                     result = step.safe_execute(context)
+                     context.update(result)
+                     if not result.success:
+                         raise Exception(f"Step '{step.name}' failed: {result.error}")
                  except Exception as e:
-                     self.run.log_data_file(organized_file, UploadStatus.FAILED)
-                     self.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Unexpected error: {str(e)}')
-                     return {'status': 'failed', 'error': str(e), 'error_type': 'unknown_error', 'retryable': False}
-
-         tasks = [upload_single_file(organized_file) for organized_file in organized_files]
-
-         current_progress = 0
-         for completed_task in asyncio.as_completed(tasks):
-             result = await completed_task
-             current_progress += 1
-
-             if result['status'] == 'success':
-                 success_count += 1
-                 upload_result.append(result['result'])
-             else:
-                 failed_count += 1
-
-             self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
-             self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')
-
-         self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
-
-         return upload_result
-
-     def _generate_data_units(self, uploaded_files: List[Dict[str, Any]], batch_size: int) -> List[Dict[str, Any]]:
-         upload_result_count = len(uploaded_files)
-         self.run.set_progress(0, upload_result_count, category='generate_data_units')
-         self.run.log_message_with_code(LogCode.GENERATING_DATA_UNITS)
-
-         client = self.run.client
-         generated_data_units = []
-         current_progress = 0
-         success_count = 0
-         failed_count = 0
-
-         batches = get_batched_list(uploaded_files, batch_size)
-         batches_count = len(batches)
-
-         self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')
-
-         for batch in batches:
-             try:
-                 created_data_units = client.create_data_units(batch)
-                 success_count += len(created_data_units)
-                 generated_data_units.append(created_data_units)
-                 for created_data_unit in created_data_units:
-                     self.run.log_data_unit(
-                         created_data_unit['id'], UploadStatus.SUCCESS, data_unit_meta=created_data_unit.get('meta')
-                     )
-             except Exception as e:
-                 failed_count += len(batch)
-                 self.run.log_message_with_code(LogCode.DATA_UNIT_BATCH_FAILED, str(e))
-                 for _ in batch:
-                     self.run.log_data_unit(None, UploadStatus.FAILED, data_unit_meta=None)
-
-             current_progress += 1
-             self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')
-             self.run.set_progress(current_progress, batches_count, category='generate_data_units')
-
-         self.run.set_progress(upload_result_count, upload_result_count, category='generate_data_units')
-
-         return sum(generated_data_units, [])
-
-     def _validate_organized_files(
-         self, organized_files: List[Dict[str, Any]], file_specification_template: Dict[str, Any]
-     ) -> bool:
-         validator = FileSpecificationValidator(file_specification_template, organized_files)
-         return validator.validate()
-
-     def _organize_files(
-         self,
-         directory: Path,
-         file_specification: List[Dict[str, Any]],
-         excel_metadata: Optional[Dict[str, Dict[str, Any]]] = None,
-     ) -> List[Dict[str, Any]]:
-         organized_files: List[Dict[str, Any]] = []
-
-         type_dirs: Dict[str, Path] = {}
-
-         for spec in file_specification:
-             spec_name = spec['name']
-             spec_dir = directory / spec_name
-             if spec_dir.exists() and spec_dir.is_dir():
-                 type_dirs[spec_name] = spec_dir
-
-         if type_dirs:
-             self.run.log_message_with_code(LogCode.TYPE_DIRECTORIES_FOUND, list(type_dirs.keys()))
-
-         if not type_dirs:
-             self.run.log_message_with_code(LogCode.NO_TYPE_DIRECTORIES)
-             return organized_files
-
-         self.run.log_message_with_code(LogCode.TYPE_STRUCTURE_DETECTED)
-         self.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)
-
-         dataset_files = {}
-         required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]
-
-         is_recursive = self.params.get('is_recursive', True)
-
-         for spec_name, dir_path in type_dirs.items():
-             if is_recursive:
-                 files_list = self._discover_files_recursive(dir_path)
-             else:
-                 files_list = self._discover_files_non_recursive(dir_path)
-
-             for file_path in files_list:
-                 file_name = file_path.stem
-
-                 if file_name not in dataset_files:
-                     dataset_files[file_name] = {}
-
-                 if spec_name not in dataset_files[file_name]:
-                     dataset_files[file_name][spec_name] = file_path
-                 else:
-                     existing_file = dataset_files[file_name][spec_name]
-                     if file_path.stat().st_mtime > existing_file.stat().st_mtime:
-                         dataset_files[file_name][spec_name] = file_path
-
-         if not dataset_files:
-             self.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
-             return organized_files
-
-         self.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(dataset_files))
-
-         for file_name, files_dict in sorted(dataset_files.items()):
-             if all(req in files_dict for req in required_specs):
-                 file_extensions = {}
-                 for file_path in files_dict.values():
-                     ext = file_path.suffix.lower()
-                     if ext:
-                         file_extensions[ext] = file_extensions.get(ext, 0) + 1
-
-                 origin_file_extension = max(file_extensions.items(), key=lambda x: x[1])[0] if file_extensions else ''
-
-                 meta_data: Dict[str, Any] = {
-                     'origin_file_stem': file_name,
-                     'origin_file_extension': origin_file_extension,
-                     'created_at': datetime.now().isoformat(),
-                 }
-
-                 if excel_metadata and file_name in excel_metadata:
-                     meta_data.update(excel_metadata[file_name])
-
-                 organized_files.append({'files': files_dict, 'meta': meta_data})
-             else:
-                 missing = [req for req in required_specs if req not in files_dict]
-                 self.run.log_message_with_code(LogCode.MISSING_REQUIRED_FILES, file_name, ', '.join(missing))
-
-         return organized_files
-
-     def _get_file_size_mb(self, file_path: Path) -> float:
-         return file_path.stat().st_size / (1024 * 1024)
+                     raise ActionError(f"Failed at step '{step.name}': {str(e)}")

-     def _requires_chunked_upload(self, organized_file: Dict[str, Any]) -> bool:
-         max_file_size_mb = self.params.get('max_file_size_mb', 50)
-         for file_path in organized_file.get('files', {}).values():
-             if isinstance(file_path, Path) and self._get_file_size_mb(file_path) > max_file_size_mb:
-                 return True
-         return False
+         # Return the final result from context
+         return context.get_result()

-     def _cleanup_temp_directory(self, temp_path: Optional[Path] = None) -> None:
-         if temp_path is None:
-             try:
-                 temp_path = Path(os.getcwd()) / 'temp'
-             except (FileNotFoundError, OSError):
-                 return
+     def _configure_strategies(self, context=None) -> Dict[str, Any]:
+         """Configure strategies based on parameters.

-         if not temp_path.exists():
-             return
-
-         shutil.rmtree(temp_path, ignore_errors=True)
-         self.run.log_message(f'Cleaned up temporary directory: {temp_path}')
+         Uses the Strategy pattern to create appropriate strategy implementations
+         based on the action parameters. This allows for runtime selection of
+         different behaviors (sync vs async upload, recursive vs flat discovery, etc.).

-     def _update_metrics(self, total_count: int, success_count: int, failed_count: int, category: str):
-         if not self.run:
-             raise ValueError('Run instance not properly initialized')
+         Args:
+             context: UploadContext for strategies that need access to client/run

-         assert isinstance(self.run, UploadRun)
+         Returns:
+             Dict[str, Any]: Dictionary of strategy instances keyed by type
+         """
+         # Ensure params is not None
+         params = self.params or {}

-         metrics = self.run.MetricsRecord(
-             stand_by=total_count - success_count - failed_count, failed=failed_count, success=success_count
+         return {
+             'validation': self.strategy_factory.create_validation_strategy(params, context),
+             'file_discovery': self.strategy_factory.create_file_discovery_strategy(params, context),
+             'metadata': self.strategy_factory.create_metadata_strategy(params, context),
+             'upload': self.strategy_factory.create_upload_strategy(params, context),
+             'data_unit': self.strategy_factory.create_data_unit_strategy(params, context),
+         }
+
+     def get_uploader(self, path, file_specification, organized_files, params: Dict = {}):
+         """Get uploader from entrypoint (compatibility method).
+
+         This method is kept for backward compatibility with existing code
+         that may still call it directly.
+         """
+         return self.entrypoint(
+             self.run, path, file_specification, organized_files, extra_params=params.get('extra_params')
          )
-         self.run.log_metrics(metrics, category)
+
+     def get_workflow_summary(self) -> Dict[str, Any]:
+         """Get summary of configured workflow.
+
+         Returns:
+             Dict[str, Any]: Summary of steps and strategies
+         """
+         return {
+             'steps': [step.name for step in self.step_registry.get_steps()],
+             'step_count': len(self.step_registry),
+             'total_progress_weight': self.step_registry.get_total_progress_weight(),
+             'available_strategies': self.strategy_factory.get_available_strategies(),
+         }
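
The class docstring retains a usage example; condensed, it amounts to the sketch below. The `config` object and all parameter values are placeholders taken from that docstring, not working defaults:

# Usage sketch assembled from the UploadAction docstring example above.
action = UploadAction(
    params={
        'name': 'Data Upload',
        'path': '/data/files',
        'storage': 1,
        'data_collection': 5,  # renamed from 'collection' in this release
    },
    plugin_config=config,  # placeholder: a plugin config loaded elsewhere
)

summary = action.get_workflow_summary()  # step names, counts, strategies
result = action.start()                  # raises ActionError on failure

Note the two caller-facing renames captured in the docstring diff: the `collection` parameter is now `data_collection`, and `run_action()` is now `start()`.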