synapse-sdk 2025.9.5__py3-none-any.whl → 2025.10.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of synapse-sdk might be problematic.
- synapse_sdk/clients/base.py +129 -9
- synapse_sdk/devtools/docs/docs/api/clients/base.md +230 -8
- synapse_sdk/devtools/docs/docs/api/plugins/models.md +58 -3
- synapse_sdk/devtools/docs/docs/plugins/categories/neural-net-plugins/train-action-overview.md +663 -0
- synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/pre-annotation-plugin-overview.md +198 -0
- synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/to-task-action-development.md +1645 -0
- synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/to-task-overview.md +717 -0
- synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/to-task-template-development.md +1380 -0
- synapse_sdk/devtools/docs/docs/plugins/categories/upload-plugins/upload-plugin-action.md +934 -0
- synapse_sdk/devtools/docs/docs/plugins/categories/upload-plugins/upload-plugin-overview.md +585 -0
- synapse_sdk/devtools/docs/docs/plugins/categories/upload-plugins/upload-plugin-template.md +715 -0
- synapse_sdk/devtools/docs/docs/plugins/export-plugins.md +39 -0
- synapse_sdk/devtools/docs/docs/plugins/plugins.md +12 -5
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/base.md +230 -8
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/plugins/models.md +114 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/neural-net-plugins/train-action-overview.md +621 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/pre-annotation-plugin-overview.md +198 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/to-task-action-development.md +1645 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/to-task-overview.md +717 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/to-task-template-development.md +1380 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/upload-plugins/upload-plugin-action.md +934 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/upload-plugins/upload-plugin-overview.md +585 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/upload-plugins/upload-plugin-template.md +715 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/export-plugins.md +39 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current.json +16 -4
- synapse_sdk/devtools/docs/sidebars.ts +45 -1
- synapse_sdk/plugins/README.md +487 -80
- synapse_sdk/plugins/categories/base.py +1 -0
- synapse_sdk/plugins/categories/export/actions/export/action.py +8 -3
- synapse_sdk/plugins/categories/export/actions/export/utils.py +108 -8
- synapse_sdk/plugins/categories/export/templates/config.yaml +18 -0
- synapse_sdk/plugins/categories/export/templates/plugin/export.py +97 -0
- synapse_sdk/plugins/categories/neural_net/actions/train.py +592 -22
- synapse_sdk/plugins/categories/neural_net/actions/tune.py +150 -3
- synapse_sdk/plugins/categories/pre_annotation/actions/__init__.py +4 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/__init__.py +3 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/action.py +10 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/__init__.py +28 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/action.py +145 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/enums.py +269 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/exceptions.py +14 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/factory.py +76 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/models.py +97 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/orchestrator.py +250 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/run.py +64 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/__init__.py +17 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/annotation.py +284 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/base.py +170 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/extraction.py +83 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/metrics.py +87 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/preprocessor.py +127 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/validation.py +143 -0
- synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +2 -1
- synapse_sdk/plugins/categories/upload/actions/upload/action.py +8 -1
- synapse_sdk/plugins/categories/upload/actions/upload/context.py +0 -1
- synapse_sdk/plugins/categories/upload/actions/upload/models.py +134 -94
- synapse_sdk/plugins/categories/upload/actions/upload/steps/cleanup.py +2 -2
- synapse_sdk/plugins/categories/upload/actions/upload/steps/generate.py +6 -2
- synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py +24 -9
- synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py +130 -18
- synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py +147 -37
- synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py +10 -5
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py +31 -6
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py +65 -37
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py +17 -2
- synapse_sdk/plugins/categories/upload/templates/README.md +394 -0
- synapse_sdk/plugins/models.py +62 -0
- synapse_sdk/utils/file/download.py +261 -0
- {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/METADATA +15 -2
- {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/RECORD +74 -43
- synapse_sdk/devtools/docs/docs/plugins/developing-upload-template.md +0 -1463
- synapse_sdk/devtools/docs/docs/plugins/upload-plugins.md +0 -1964
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/developing-upload-template.md +0 -1463
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/upload-plugins.md +0 -2077
- {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/WHEEL +0 -0
- {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/entry_points.txt +0 -0
- {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/licenses/LICENSE +0 -0
- {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/top_level.txt +0 -0
In the reconstructed hunks below, bare `-` markers stand for removed lines whose content the diff view did not render.

synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py

```diff
@@ -29,19 +29,34 @@ class InitializeStep(BaseStep):
         except Exception as e:
             return self.create_error_result(f'Failed to get storage {storage_id}: {str(e)}')
 
-        # Get and validate path
+        # Check if we're in multi-path mode
+        use_single_path = context.get_param('use_single_path', True)
+
+        # Get and validate path (only required in single-path mode)
         path = context.get_param('path')
-        if path is None:
-            return self.create_error_result('Path parameter is required')
+        pathlib_cwd = None
 
-        try:
-            pathlib_cwd = get_pathlib(storage, path)
-            context.set_pathlib_cwd(pathlib_cwd)
-        except Exception as e:
-            return self.create_error_result(f'Failed to get path {path}: {str(e)}')
+        if use_single_path:
+            # Single-path mode: global path is required
+            if path is None:
+                return self.create_error_result('Path parameter is required in single-path mode')
+
+            try:
+                pathlib_cwd = get_pathlib(storage, path)
+                context.set_pathlib_cwd(pathlib_cwd)
+            except Exception as e:
+                return self.create_error_result(f'Failed to get path {path}: {str(e)}')
+        else:
+            # Multi-path mode: global path is optional (each asset has its own path)
+            if path:
+                try:
+                    pathlib_cwd = get_pathlib(storage, path)
+                    context.set_pathlib_cwd(pathlib_cwd)
+                except Exception as e:
+                    return self.create_error_result(f'Failed to get path {path}: {str(e)}')
 
         # Return success with rollback data
-        rollback_data = {'storage_id': storage_id, 'path': path}
+        rollback_data = {'storage_id': storage_id, 'path': path, 'use_single_path': use_single_path}
 
         return self.create_success_result(
             data={'storage': storage, 'pathlib_cwd': pathlib_cwd}, rollback_data=rollback_data
```
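To make the new mode switch concrete, here is a minimal sketch of the two parameter shapes this step distinguishes. The keys `use_single_path`, `path`, `assets`, `is_recursive`, and `storage_id` appear in the diffs on this page; the surrounding dict literals and example values are illustrative only.

```python
# Single-path mode (the default): one global `path` is required.
single_path_params = {
    'storage_id': 42,            # hypothetical storage id
    'use_single_path': True,     # also the default when omitted
    'path': 'datasets/batch_01',
}

# Multi-path mode: the global `path` is optional; each asset names its own.
multi_path_params = {
    'storage_id': 42,
    'use_single_path': False,
    'assets': {
        'image': {'path': 'datasets/batch_01/images', 'is_recursive': True},
        'label': {'path': 'datasets/batch_01/labels', 'is_recursive': False},
    },
}
```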
synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py

```diff
@@ -1,8 +1,11 @@
+import base64
+import tempfile
 from pathlib import Path
 
 from ..context import StepResult, UploadContext
 from ..enums import LogCode
 from ..exceptions import ExcelParsingError, ExcelSecurityError
+from ..models import ExcelMetadataFile
 from .base import BaseStep
 
 
@@ -25,28 +28,48 @@ class ProcessMetadataStep(BaseStep):
             return self.create_success_result(data={'metadata': {}})
 
         excel_metadata = {}
+        temp_file_to_cleanup = None
 
         try:
-            # Check if Excel metadata
-            excel_metadata_path
-
-
-
-
-
-
-
-
-
-
+            # Check if Excel metadata is specified - try both parameters
+            # TODO: Plan to deprecate excel_metadata_path in a few versions (backward compatibility)
+            excel_metadata_path_config = context.get_param('excel_metadata_path')
+            excel_metadata_config = context.get_param('excel_metadata')
+
+            if excel_metadata_path_config:
+                # Traditional path-based approach (will be deprecated in future)
+                excel_path, is_temp = self._resolve_excel_path_from_string(excel_metadata_path_config, context)
+
+                if not excel_path or not excel_path.exists():
+                    context.run.log_message_with_code(LogCode.EXCEL_FILE_NOT_FOUND_PATH)
+                    return self.create_success_result(data={'metadata': {}})
+
+                excel_metadata = metadata_strategy.extract(excel_path)
+
+            elif excel_metadata_config:
+                # Base64 encoded approach
+                excel_path, is_temp = self._resolve_excel_path_from_base64(excel_metadata_config, context)
+
+                if not excel_path or not excel_path.exists():
                     context.run.log_message_with_code(LogCode.EXCEL_FILE_NOT_FOUND_PATH)
                     return self.create_success_result(data={'metadata': {}})
+
+                # Track temp file for cleanup
+                if is_temp:
+                    temp_file_to_cleanup = excel_path
+
                 excel_metadata = metadata_strategy.extract(excel_path)
             else:
                 # Look for default metadata files (meta.xlsx, meta.xls)
-
-                if
-
+                # Only possible in single-path mode where pathlib_cwd is set
+                if context.pathlib_cwd:
+                    excel_path = self._find_excel_metadata_file(context.pathlib_cwd)
+                    if excel_path:
+                        excel_metadata = metadata_strategy.extract(excel_path)
+                else:
+                    context.run.log_message(
+                        'No Excel metadata specified and multi-path mode - skipping metadata processing'
+                    )
 
             # Validate extracted metadata
             if excel_metadata:
@@ -65,9 +88,9 @@ class ProcessMetadataStep(BaseStep):
             return self.create_error_result(f'Excel security violation: {str(e)}')
 
         except ExcelParsingError as e:
-            # If excel_metadata_path was specified, this is an error
+            # If excel_metadata_path or excel_metadata was specified, this is an error
             # If we were just looking for default files, it's not an error
-            if context.get_param('excel_metadata_path'):
+            if context.get_param('excel_metadata_path') or context.get_param('excel_metadata'):
                 context.run.log_message_with_code(LogCode.EXCEL_PARSING_ERROR, str(e))
                 return self.create_error_result(f'Excel parsing error: {str(e)}')
             else:
@@ -77,6 +100,15 @@ class ProcessMetadataStep(BaseStep):
         except Exception as e:
             return self.create_error_result(f'Unexpected error processing metadata: {str(e)}')
 
+        finally:
+            # Clean up temporary file if it was created from base64
+            if temp_file_to_cleanup and temp_file_to_cleanup.exists():
+                try:
+                    temp_file_to_cleanup.unlink()
+                    context.run.log_message(f'Cleaned up temporary Excel file: {temp_file_to_cleanup}')
+                except Exception as e:
+                    context.run.log_message(f'Failed to clean up temporary file {temp_file_to_cleanup}: {str(e)}')
+
     def can_skip(self, context: UploadContext) -> bool:
         """Metadata step can be skipped if no metadata strategy is configured."""
         return 'metadata' not in context.strategies
@@ -86,8 +118,88 @@ class ProcessMetadataStep(BaseStep):
         # Clear any loaded metadata
         context.metadata.clear()
 
+    def _resolve_excel_path_from_string(self, excel_path_str: str, context: UploadContext) -> tuple[Path | None, bool]:
+        """Resolve Excel metadata path from a string path.
+
+        Note: This method supports the excel_metadata_path parameter which will be deprecated
+        in a future version. Consider using _resolve_excel_path_from_base64 instead.
+
+        Args:
+            excel_path_str: File path string to the Excel metadata file
+            context: Upload context for resolving relative paths
+
+        Returns:
+            Tuple of (resolved_path, is_temporary_file)
+            - resolved_path: Path object pointing to the Excel file, or None if resolution failed
+            - is_temporary_file: Always False for path-based approach
+
+        Examples:
+            >>> path, is_temp = self._resolve_excel_path_from_string("/data/meta.xlsx", context)
+        """
+        # TODO: Plan to deprecate this method in a few versions (backward compatibility)
+        # Try absolute path first
+        path = Path(excel_path_str)
+        if path.exists() and path.is_file():
+            return path, False
+
+        # Try relative to cwd (only if pathlib_cwd is set)
+        if context.pathlib_cwd:
+            path = context.pathlib_cwd / excel_path_str
+            return (path, False) if path.exists() else (None, False)
+
+        # In multi-path mode without pathlib_cwd, can only use absolute paths
+        return (None, False)
+
+    def _resolve_excel_path_from_base64(
+        self, excel_config: dict | ExcelMetadataFile, context: UploadContext
+    ) -> tuple[Path | None, bool]:
+        """Resolve Excel metadata path from base64 encoded data.
+
+        Args:
+            excel_config: Either a dict or an ExcelMetadataFile object with base64 data
+            context: Upload context for logging
+
+        Returns:
+            Tuple of (resolved_path, is_temporary_file)
+            - resolved_path: Path object pointing to the temporary Excel file, or None if decoding failed
+            - is_temporary_file: Always True for base64 approach (requires cleanup)
+
+        Examples:
+            >>> config = ExcelMetadataFile(data="UEsDB...", filename="meta.xlsx")
+            >>> path, is_temp = self._resolve_excel_path_from_base64(config, context)
+        """
+        if isinstance(excel_config, dict):
+            excel_config = ExcelMetadataFile(**excel_config)
+
+        try:
+            # Decode base64 data
+            decoded_data = base64.b64decode(excel_config.data, validate=True)
+
+            # Create temp file
+            temp_dir = Path(tempfile.gettempdir())
+            filename = excel_config.filename
+            temp_file = temp_dir / filename
+            temp_file.write_bytes(decoded_data)
+
+            context.run.log_message(f'Decoded base64 Excel metadata to temporary file: {temp_file}')
+            return temp_file, True
+
+        except Exception as e:
+            context.run.log_message(f'Failed to decode base64 Excel metadata: {str(e)}')
+            return None, False
+
     def _find_excel_metadata_file(self, pathlib_cwd: Path) -> Path:
-        """Find default Excel metadata file."""
+        """Find default Excel metadata file.
+
+        Args:
+            pathlib_cwd: Working directory path (must not be None)
+
+        Returns:
+            Path to Excel metadata file, or None if not found
+        """
+        if not pathlib_cwd:
+            return None
+
         # Check .xlsx first as it's more common
         excel_path = pathlib_cwd / 'meta.xlsx'
         if excel_path.exists():
```
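A minimal client-side sketch of producing the new `excel_metadata` parameter, assuming only what the diff shows: `_resolve_excel_path_from_base64` accepts a dict with base64 `data` and a `filename`. The helper name below is hypothetical.

```python
import base64
from pathlib import Path

def build_excel_metadata_param(xlsx_path: str) -> dict:
    """Encode a local Excel file into the {'data', 'filename'} dict shape
    that _resolve_excel_path_from_base64 accepts (hypothetical helper)."""
    raw = Path(xlsx_path).read_bytes()
    return {
        'data': base64.b64encode(raw).decode('ascii'),
        'filename': Path(xlsx_path).name,
    }

params = {
    'excel_metadata': build_excel_metadata_param('meta.xlsx'),
    # 'excel_metadata_path': '/data/meta.xlsx',  # legacy variant, slated for deprecation
}
```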
synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py

```diff
@@ -24,51 +24,152 @@ class OrganizeFilesStep(BaseStep):
             return self.create_error_result('File specifications not available')
 
         try:
-            #
-
-
-
-
-
-                    type_dirs[spec_name] = spec_dir
-
-            if type_dirs:
-                context.run.log_message_with_code(LogCode.TYPE_DIRECTORIES_FOUND, list(type_dirs.keys()))
+            # Check which mode we're in
+            use_single_path = context.get_param('use_single_path', True)
+
+            if use_single_path:
+                # Single path mode: all assets use same base path
+                return self._execute_single_path_mode(context, file_discovery_strategy)
             else:
-
-                return self.
+                # Multi-path mode: each asset has its own path
+                return self._execute_multi_path_mode(context, file_discovery_strategy)
 
-
-
+        except Exception as e:
+            return self.create_error_result(f'File organization failed: {str(e)}')
 
-
-
-
+    def _execute_single_path_mode(self, context: UploadContext, file_discovery_strategy) -> StepResult:
+        """Execute file organization in single path mode (traditional)."""
+        # Create type directories mapping
+        type_dirs = {}
+        for spec in context.file_specifications:
+            spec_name = spec['name']
+            spec_dir = context.pathlib_cwd / spec_name
+            if spec_dir.exists() and spec_dir.is_dir():
+                type_dirs[spec_name] = spec_dir
+
+        if type_dirs:
+            context.run.log_message_with_code(LogCode.TYPE_DIRECTORIES_FOUND, list(type_dirs.keys()))
+        else:
+            context.run.log_message_with_code(LogCode.NO_TYPE_DIRECTORIES)
+            return self.create_success_result(data={'organized_files': []})
+
+        context.run.log_message_with_code(LogCode.TYPE_STRUCTURE_DETECTED)
+        context.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)
+
+        # Discover files in type directories
+        all_files = []
+        is_recursive = context.get_param('is_recursive', True)
+
+        for spec_name, dir_path in type_dirs.items():
+            files_in_dir = file_discovery_strategy.discover(dir_path, is_recursive)
+            all_files.extend(files_in_dir)
+
+        if not all_files:
+            context.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
+            return self.create_success_result(data={'organized_files': []})
+
+        # Organize files using strategy
+        organized_files = file_discovery_strategy.organize(
+            all_files, context.file_specifications, context.metadata or {}, type_dirs
+        )
+
+        if organized_files:
+            context.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(organized_files))
+            context.add_organized_files(organized_files)
+
+        return self.create_success_result(
+            data={'organized_files': organized_files},
+            rollback_data={'files_count': len(organized_files), 'type_dirs': list(type_dirs.keys())},
+        )
+
+    def _execute_multi_path_mode(self, context: UploadContext, file_discovery_strategy) -> StepResult:
+        """Execute file organization in multi-path mode (each asset has own path)."""
+        from synapse_sdk.utils.storage import get_pathlib
+
+        assets = context.get_param('assets', {})
+        if not assets:
+            return self.create_error_result('Multi-path mode requires assets configuration')
+
+        # Validate that all required specs have asset paths
+        required_specs = [spec['name'] for spec in context.file_specifications if spec.get('is_required', False)]
+        missing_required = [spec for spec in required_specs if spec not in assets]
+
+        if missing_required:
+            return self.create_error_result(
+                f'Multi-path mode requires asset paths for required specs: {", ".join(missing_required)}'
+            )
 
-
-
-
+        context.run.log_message(f'Using multi-path mode with {len(assets)} asset configurations')
+        context.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)
+
+        # Collect all files and specs first
+        all_files = []
+        type_dirs = {}
+        specs_with_files = []
+
+        for spec in context.file_specifications:
+            spec_name = spec['name']
+            is_required = spec.get('is_required', False)
+
+            # Skip if no asset configuration for this spec (only allowed for optional specs)
+            if spec_name not in assets:
+                if is_required:
+                    # This should not happen due to validation above, but double-check
+                    return self.create_error_result(f'Required spec {spec_name} missing asset path')
+                context.run.log_message(f'Skipping optional spec {spec_name}: no asset path configured')
+                continue
+
+            asset_config = assets[spec_name]
+
+            # Get the asset path from storage
+            try:
+                asset_path = get_pathlib(context.storage, asset_config.get('path', ''))
+                type_dirs[spec_name] = asset_path
+            except Exception as e:
+                context.run.log_message(f'Error accessing path for {spec_name}: {str(e)}', 'WARNING')
+                continue
+
+            if not asset_path.exists():
+                context.run.log_message(f'Path does not exist for {spec_name}: {asset_config.path}', 'WARNING')
+                continue
+
+            # Discover files for this asset
+            is_recursive = asset_config.get('is_recursive', True)
+            context.run.log_message(
+                f'Discovering files for {spec_name} at {asset_config.get("path", "")} (recursive={is_recursive})'
+            )
 
-
-                context.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
-                return self.create_success_result(data={'organized_files': []})
+            files = file_discovery_strategy.discover(asset_path, is_recursive)
 
-
-
-
-            )
+            if not files:
+                context.run.log_message(f'No files found for {spec_name}', 'WARNING')
+                continue
 
-
-
-
+            all_files.extend(files)
+            specs_with_files.append(spec)
+            context.run.log_message(f'Found {len(files)} files for {spec_name}')
 
-
-
-
+        # Organize all files together to group by dataset_key
+        all_organized_files = []
+        if all_files and specs_with_files:
+            context.run.log_message(f'Organizing {len(all_files)} files across {len(specs_with_files)} specs')
+            context.run.log_message(f'Type directories: {list(type_dirs.keys())}')
+
+            all_organized_files = file_discovery_strategy.organize(
+                all_files, specs_with_files, context.metadata or {}, type_dirs
             )
 
-
-
+        if all_organized_files:
+            context.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(all_organized_files))
+            context.run.log_message(f'Created {len(all_organized_files)} data units from {len(all_files)} files')
+            context.add_organized_files(all_organized_files)
+        else:
+            context.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
+
+        return self.create_success_result(
+            data={'organized_files': all_organized_files},
+            rollback_data={'files_count': len(all_organized_files), 'type_dirs': list(type_dirs.keys())},
+        )
 
     def can_skip(self, context: UploadContext) -> bool:
         """File organization cannot be skipped."""
@@ -82,8 +183,17 @@ class OrganizeFilesStep(BaseStep):
 
     def validate_prerequisites(self, context: UploadContext) -> None:
         """Validate prerequisites for file organization."""
-
-
+        use_single_path = context.get_param('use_single_path', True)
+
+        # In single-path mode, pathlib_cwd is required
+        if use_single_path and not context.pathlib_cwd:
+            raise ValueError('Working directory path not set in single-path mode')
+
+        # In multi-path mode, pathlib_cwd is optional (each asset has its own path)
+        if not use_single_path:
+            assets = context.get_param('assets', {})
+            if not assets:
+                raise ValueError('Multi-path mode requires assets configuration')
 
         if not context.file_specifications:
             raise ValueError('File specifications not available')
```
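A quick illustration of the required-spec validation above, with hypothetical spec names and paths:

```python
file_specifications = [
    {'name': 'image', 'is_required': True},
    {'name': 'label', 'is_required': False},
]

# Passes: the required 'image' spec has an asset path; optional 'label' may be omitted.
assets_ok = {'image': {'path': 'imgs/', 'is_recursive': True}}

# Fails with: Multi-path mode requires asset paths for required specs: image
assets_bad = {'label': {'path': 'lbls/'}}
```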
synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py

```diff
@@ -34,7 +34,9 @@ class UploadFilesStep(BaseStep):
         context.run.log_message_with_code(LogCode.UPLOADING_DATA_FILES)
 
         # Initialize metrics
-
+        initial_metrics = {'stand_by': organized_files_count, 'success': 0, 'failed': 0}
+        context.update_metrics('data_files', initial_metrics)
+        context.run.set_metrics(initial_metrics, category='data_files')
 
         # Create upload configuration
         upload_config = UploadConfig(
@@ -55,10 +57,13 @@ class UploadFilesStep(BaseStep):
         context.run.log_data_file(uploaded_file, UploadStatus.SUCCESS)
 
         # Update final metrics
-
-            '
-
-
+        final_metrics = {
+            'stand_by': 0,
+            'success': len(uploaded_files),
+            'failed': organized_files_count - len(uploaded_files),
+        }
+        context.update_metrics('data_files', final_metrics)
+        context.run.set_metrics(final_metrics, category='data_files')
 
         # Complete progress
         context.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
```
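For context, the two metric snapshots bracket the upload loop; with hypothetical counts the lifecycle looks like this:

```python
organized_files_count = 10

# Before the upload loop: everything is waiting.
initial_metrics = {'stand_by': 10, 'success': 0, 'failed': 0}

# After the loop, if 8 of the 10 files uploaded successfully:
final_metrics = {'stand_by': 0, 'success': 8, 'failed': 2}
```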
synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py

```diff
@@ -10,7 +10,11 @@ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
 
     def discover(self, path: Path, recursive: bool) -> List[Path]:
         """Discover files non-recursively in the given path."""
-
+        # Exclude system files
+        excluded_files = {'.DS_Store', 'Thumbs.db', 'desktop.ini'}
+        return [
+            file_path for file_path in path.glob('*') if file_path.is_file() and file_path.name not in excluded_files
+        ]
 
     def organize(self, files: List[Path], specs: Dict, metadata: Dict, type_dirs: Dict = None) -> List[Dict]:
         """Organize files according to specifications with metadata."""
@@ -39,9 +43,14 @@ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
         # Performance optimization 2: Build metadata index for faster lookups
         metadata_index = self._build_metadata_index(metadata)
 
-        # Group files by
+        # Group files by dataset_key (stem-based matching) - flat discovery (no subdirectories)
+        # Strategy:
+        # 1. Group all files (required + optional) by their file stem
+        # 2. Only create data units for groups that have ALL required files
+        # 3. Optional files are automatically included if they match the stem
         dataset_files = {}
         required_specs = [spec['name'] for spec in specs if spec.get('is_required', False)]
+        optional_specs = [spec['name'] for spec in specs if not spec.get('is_required', False)]
 
         for file_path in files:
             # Determine which type directory this file belongs to
@@ -64,20 +73,36 @@ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
                     # If stat fails, keep existing file
                     pass
 
-        # Create organized files for datasets with
+        # Create organized files ONLY for datasets with ALL required files
+        # Optional files are included automatically if they match the stem
         for file_name, files_dict in sorted(dataset_files.items()):
-            if all
-
+            # Check if all required files are present
+            has_all_required = all(req in files_dict for req in required_specs)
+
+            if has_all_required:
+                # Extract original file stem from actual file paths (more reliable)
+                # Collect stems from all files in the group
+                file_stems = {}
                 file_extensions = {}
+
                 for file_path in files_dict.values():
+                    stem = file_path.stem
                     ext = file_path.suffix.lower()
+
+                    # Count stems (to handle multiple files with slightly different names)
+                    if stem:
+                        file_stems[stem] = file_stems.get(stem, 0) + 1
+
+                    # Count extensions
                     if ext:
                         file_extensions[ext] = file_extensions.get(ext, 0) + 1
 
+                # Use the most common stem (usually they're all the same)
+                original_stem = max(file_stems, key=file_stems.get) if file_stems else file_name
                 origin_file_extension = max(file_extensions, key=file_extensions.get) if file_extensions else ''
 
                 meta_data = {
-                    'origin_file_stem':
+                    'origin_file_stem': original_stem,
                     'origin_file_extension': origin_file_extension,
                     'created_at': datetime.now().isoformat(),
                 }
```