synapse-sdk 2025.9.5-py3-none-any.whl → 2025.10.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (78)
  1. synapse_sdk/clients/base.py +129 -9
  2. synapse_sdk/devtools/docs/docs/api/clients/base.md +230 -8
  3. synapse_sdk/devtools/docs/docs/api/plugins/models.md +58 -3
  4. synapse_sdk/devtools/docs/docs/plugins/categories/neural-net-plugins/train-action-overview.md +663 -0
  5. synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/pre-annotation-plugin-overview.md +198 -0
  6. synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/to-task-action-development.md +1645 -0
  7. synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/to-task-overview.md +717 -0
  8. synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/to-task-template-development.md +1380 -0
  9. synapse_sdk/devtools/docs/docs/plugins/categories/upload-plugins/upload-plugin-action.md +934 -0
  10. synapse_sdk/devtools/docs/docs/plugins/categories/upload-plugins/upload-plugin-overview.md +585 -0
  11. synapse_sdk/devtools/docs/docs/plugins/categories/upload-plugins/upload-plugin-template.md +715 -0
  12. synapse_sdk/devtools/docs/docs/plugins/export-plugins.md +39 -0
  13. synapse_sdk/devtools/docs/docs/plugins/plugins.md +12 -5
  14. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/base.md +230 -8
  15. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/plugins/models.md +114 -0
  16. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/neural-net-plugins/train-action-overview.md +621 -0
  17. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/pre-annotation-plugin-overview.md +198 -0
  18. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/to-task-action-development.md +1645 -0
  19. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/to-task-overview.md +717 -0
  20. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/to-task-template-development.md +1380 -0
  21. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/upload-plugins/upload-plugin-action.md +934 -0
  22. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/upload-plugins/upload-plugin-overview.md +585 -0
  23. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/upload-plugins/upload-plugin-template.md +715 -0
  24. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/export-plugins.md +39 -0
  25. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current.json +16 -4
  26. synapse_sdk/devtools/docs/sidebars.ts +45 -1
  27. synapse_sdk/plugins/README.md +487 -80
  28. synapse_sdk/plugins/categories/base.py +1 -0
  29. synapse_sdk/plugins/categories/export/actions/export/action.py +8 -3
  30. synapse_sdk/plugins/categories/export/actions/export/utils.py +108 -8
  31. synapse_sdk/plugins/categories/export/templates/config.yaml +18 -0
  32. synapse_sdk/plugins/categories/export/templates/plugin/export.py +97 -0
  33. synapse_sdk/plugins/categories/neural_net/actions/train.py +592 -22
  34. synapse_sdk/plugins/categories/neural_net/actions/tune.py +150 -3
  35. synapse_sdk/plugins/categories/pre_annotation/actions/__init__.py +4 -0
  36. synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/__init__.py +3 -0
  37. synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/action.py +10 -0
  38. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/__init__.py +28 -0
  39. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/action.py +145 -0
  40. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/enums.py +269 -0
  41. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/exceptions.py +14 -0
  42. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/factory.py +76 -0
  43. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/models.py +97 -0
  44. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/orchestrator.py +250 -0
  45. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/run.py +64 -0
  46. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/__init__.py +17 -0
  47. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/annotation.py +284 -0
  48. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/base.py +170 -0
  49. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/extraction.py +83 -0
  50. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/metrics.py +87 -0
  51. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/preprocessor.py +127 -0
  52. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/validation.py +143 -0
  53. synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +2 -1
  54. synapse_sdk/plugins/categories/upload/actions/upload/action.py +8 -1
  55. synapse_sdk/plugins/categories/upload/actions/upload/context.py +0 -1
  56. synapse_sdk/plugins/categories/upload/actions/upload/models.py +134 -94
  57. synapse_sdk/plugins/categories/upload/actions/upload/steps/cleanup.py +2 -2
  58. synapse_sdk/plugins/categories/upload/actions/upload/steps/generate.py +6 -2
  59. synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py +24 -9
  60. synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py +130 -18
  61. synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py +147 -37
  62. synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py +10 -5
  63. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py +31 -6
  64. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py +65 -37
  65. synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py +17 -2
  66. synapse_sdk/plugins/categories/upload/templates/README.md +394 -0
  67. synapse_sdk/plugins/models.py +62 -0
  68. synapse_sdk/utils/file/download.py +261 -0
  69. {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/METADATA +15 -2
  70. {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/RECORD +74 -43
  71. synapse_sdk/devtools/docs/docs/plugins/developing-upload-template.md +0 -1463
  72. synapse_sdk/devtools/docs/docs/plugins/upload-plugins.md +0 -1964
  73. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/developing-upload-template.md +0 -1463
  74. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/upload-plugins.md +0 -2077
  75. {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/WHEEL +0 -0
  76. {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/entry_points.txt +0 -0
  77. {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/licenses/LICENSE +0 -0
  78. {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/top_level.txt +0 -0

synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py
@@ -29,19 +29,34 @@ class InitializeStep(BaseStep):
         except Exception as e:
             return self.create_error_result(f'Failed to get storage {storage_id}: {str(e)}')
 
-        # Get and validate path
+        # Check if we're in multi-path mode
+        use_single_path = context.get_param('use_single_path', True)
+
+        # Get and validate path (only required in single-path mode)
         path = context.get_param('path')
-        if path is None:
-            return self.create_error_result('Path parameter is required')
+        pathlib_cwd = None
 
-        try:
-            pathlib_cwd = get_pathlib(storage, path)
-            context.set_pathlib_cwd(pathlib_cwd)
-        except Exception as e:
-            return self.create_error_result(f'Failed to get path {path}: {str(e)}')
+        if use_single_path:
+            # Single-path mode: global path is required
+            if path is None:
+                return self.create_error_result('Path parameter is required in single-path mode')
+
+            try:
+                pathlib_cwd = get_pathlib(storage, path)
+                context.set_pathlib_cwd(pathlib_cwd)
+            except Exception as e:
+                return self.create_error_result(f'Failed to get path {path}: {str(e)}')
+        else:
+            # Multi-path mode: global path is optional (each asset has its own path)
+            if path:
+                try:
+                    pathlib_cwd = get_pathlib(storage, path)
+                    context.set_pathlib_cwd(pathlib_cwd)
+                except Exception as e:
+                    return self.create_error_result(f'Failed to get path {path}: {str(e)}')
 
         # Return success with rollback data
-        rollback_data = {'storage_id': storage_id, 'path': path}
+        rollback_data = {'storage_id': storage_id, 'path': path, 'use_single_path': use_single_path}
 
         return self.create_success_result(
             data={'storage': storage, 'pathlib_cwd': pathlib_cwd}, rollback_data=rollback_data
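
For orientation, a minimal sketch of the run parameters each branch above expects, inferred from this hunk together with the organize-step hunks further down; the concrete values and the storage id are hypothetical, and the authoritative schema lives in the upload action's models.py.

    # Single-path mode (default): a global 'path' is required and resolved into pathlib_cwd.
    single_path_params = {
        'use_single_path': True,
        'storage_id': 1,               # hypothetical storage id
        'path': 'datasets/batch-01',   # hypothetical path inside that storage
    }

    # Multi-path mode: the global 'path' becomes optional; per-spec paths are supplied
    # via 'assets' and consumed later by OrganizeFilesStep (see below).
    multi_path_params = {
        'use_single_path': False,
        'storage_id': 1,
        'assets': {'image': {'path': 'raw/images', 'is_recursive': True}},
    }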

synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py
@@ -1,8 +1,11 @@
+import base64
+import tempfile
 from pathlib import Path
 
 from ..context import StepResult, UploadContext
 from ..enums import LogCode
 from ..exceptions import ExcelParsingError, ExcelSecurityError
+from ..models import ExcelMetadataFile
 from .base import BaseStep
 
 

@@ -25,28 +28,48 @@ class ProcessMetadataStep(BaseStep):
             return self.create_success_result(data={'metadata': {}})
 
         excel_metadata = {}
+        temp_file_to_cleanup = None
 
         try:
-            # Check if Excel metadata path is specified
-            excel_metadata_path = context.get_param('excel_metadata_path')
-            if excel_metadata_path:
-                # Convert string to Path object
-                if isinstance(excel_metadata_path, str):
-                    excel_metadata_path = Path(excel_metadata_path)
-
-                if excel_metadata_path.exists() and excel_metadata_path.is_file():
-                    excel_path = excel_metadata_path
-                else:
-                    excel_path = context.pathlib_cwd / excel_metadata_path
-                    if not excel_path.exists():
+            # Check if Excel metadata is specified - try both parameters
+            # TODO: Plan to deprecate excel_metadata_path in a few versions (backward compatibility)
+            excel_metadata_path_config = context.get_param('excel_metadata_path')
+            excel_metadata_config = context.get_param('excel_metadata')
+
+            if excel_metadata_path_config:
+                # Traditional path-based approach (will be deprecated in future)
+                excel_path, is_temp = self._resolve_excel_path_from_string(excel_metadata_path_config, context)
+
+                if not excel_path or not excel_path.exists():
+                    context.run.log_message_with_code(LogCode.EXCEL_FILE_NOT_FOUND_PATH)
+                    return self.create_success_result(data={'metadata': {}})
+
+                excel_metadata = metadata_strategy.extract(excel_path)
+
+            elif excel_metadata_config:
+                # Base64 encoded approach
+                excel_path, is_temp = self._resolve_excel_path_from_base64(excel_metadata_config, context)
+
+                if not excel_path or not excel_path.exists():
                     context.run.log_message_with_code(LogCode.EXCEL_FILE_NOT_FOUND_PATH)
                     return self.create_success_result(data={'metadata': {}})
+
+                # Track temp file for cleanup
+                if is_temp:
+                    temp_file_to_cleanup = excel_path
+
                 excel_metadata = metadata_strategy.extract(excel_path)
             else:
                 # Look for default metadata files (meta.xlsx, meta.xls)
-                excel_path = self._find_excel_metadata_file(context.pathlib_cwd)
-                if excel_path:
-                    excel_metadata = metadata_strategy.extract(excel_path)
+                # Only possible in single-path mode where pathlib_cwd is set
+                if context.pathlib_cwd:
+                    excel_path = self._find_excel_metadata_file(context.pathlib_cwd)
+                    if excel_path:
+                        excel_metadata = metadata_strategy.extract(excel_path)
+                else:
+                    context.run.log_message(
+                        'No Excel metadata specified and multi-path mode - skipping metadata processing'
+                    )
 
             # Validate extracted metadata
             if excel_metadata:

@@ -65,9 +88,9 @@ class ProcessMetadataStep(BaseStep):
             return self.create_error_result(f'Excel security violation: {str(e)}')
 
         except ExcelParsingError as e:
-            # If excel_metadata_path was specified, this is an error
+            # If excel_metadata_path or excel_metadata was specified, this is an error
             # If we were just looking for default files, it's not an error
-            if context.get_param('excel_metadata_path'):
+            if context.get_param('excel_metadata_path') or context.get_param('excel_metadata'):
                 context.run.log_message_with_code(LogCode.EXCEL_PARSING_ERROR, str(e))
                 return self.create_error_result(f'Excel parsing error: {str(e)}')
             else:

@@ -77,6 +100,15 @@ class ProcessMetadataStep(BaseStep):
         except Exception as e:
             return self.create_error_result(f'Unexpected error processing metadata: {str(e)}')
 
+        finally:
+            # Clean up temporary file if it was created from base64
+            if temp_file_to_cleanup and temp_file_to_cleanup.exists():
+                try:
+                    temp_file_to_cleanup.unlink()
+                    context.run.log_message(f'Cleaned up temporary Excel file: {temp_file_to_cleanup}')
+                except Exception as e:
+                    context.run.log_message(f'Failed to clean up temporary file {temp_file_to_cleanup}: {str(e)}')
+
     def can_skip(self, context: UploadContext) -> bool:
         """Metadata step can be skipped if no metadata strategy is configured."""
         return 'metadata' not in context.strategies

@@ -86,8 +118,88 @@ class ProcessMetadataStep(BaseStep):
         # Clear any loaded metadata
         context.metadata.clear()
 
+    def _resolve_excel_path_from_string(self, excel_path_str: str, context: UploadContext) -> tuple[Path | None, bool]:
+        """Resolve Excel metadata path from a string path.
+
+        Note: This method supports the excel_metadata_path parameter which will be deprecated
+        in a future version. Consider using _resolve_excel_path_from_base64 instead.
+
+        Args:
+            excel_path_str: File path string to the Excel metadata file
+            context: Upload context for resolving relative paths
+
+        Returns:
+            Tuple of (resolved_path, is_temporary_file)
+            - resolved_path: Path object pointing to the Excel file, or None if resolution failed
+            - is_temporary_file: Always False for path-based approach
+
+        Examples:
+            >>> path, is_temp = self._resolve_excel_path_from_string("/data/meta.xlsx", context)
+        """
+        # TODO: Plan to deprecate this method in a few versions (backward compatibility)
+        # Try absolute path first
+        path = Path(excel_path_str)
+        if path.exists() and path.is_file():
+            return path, False
+
+        # Try relative to cwd (only if pathlib_cwd is set)
+        if context.pathlib_cwd:
+            path = context.pathlib_cwd / excel_path_str
+            return (path, False) if path.exists() else (None, False)
+
+        # In multi-path mode without pathlib_cwd, can only use absolute paths
+        return (None, False)
+
+    def _resolve_excel_path_from_base64(
+        self, excel_config: dict | ExcelMetadataFile, context: UploadContext
+    ) -> tuple[Path | None, bool]:
+        """Resolve Excel metadata path from base64 encoded data.
+
+        Args:
+            excel_config: Either a dict or an ExcelMetadataFile object with base64 data
+            context: Upload context for logging
+
+        Returns:
+            Tuple of (resolved_path, is_temporary_file)
+            - resolved_path: Path object pointing to the temporary Excel file, or None if decoding failed
+            - is_temporary_file: Always True for base64 approach (requires cleanup)
+
+        Examples:
+            >>> config = ExcelMetadataFile(data="UEsDB...", filename="meta.xlsx")
+            >>> path, is_temp = self._resolve_excel_path_from_base64(config, context)
+        """
+        if isinstance(excel_config, dict):
+            excel_config = ExcelMetadataFile(**excel_config)
+
+        try:
+            # Decode base64 data
+            decoded_data = base64.b64decode(excel_config.data, validate=True)
+
+            # Create temp file
+            temp_dir = Path(tempfile.gettempdir())
+            filename = excel_config.filename
+            temp_file = temp_dir / filename
+            temp_file.write_bytes(decoded_data)
+
+            context.run.log_message(f'Decoded base64 Excel metadata to temporary file: {temp_file}')
+            return temp_file, True
+
+        except Exception as e:
+            context.run.log_message(f'Failed to decode base64 Excel metadata: {str(e)}')
+            return None, False
+
     def _find_excel_metadata_file(self, pathlib_cwd: Path) -> Path:
-        """Find default Excel metadata file."""
+        """Find default Excel metadata file.
+
+        Args:
+            pathlib_cwd: Working directory path (must not be None)
+
+        Returns:
+            Path to Excel metadata file, or None if not found
+        """
+        if not pathlib_cwd:
+            return None
+
         # Check .xlsx first as it's more common
         excel_path = pathlib_cwd / 'meta.xlsx'
         if excel_path.exists():
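
To illustrate the new base64 path, here is a small client-side sketch (an assumed helper, not part of the SDK) that packages a local meta.xlsx into the 'excel_metadata' parameter consumed by _resolve_excel_path_from_base64; the field names 'data' and 'filename' follow the ExcelMetadataFile example in the docstring above.

    import base64
    from pathlib import Path

    def build_excel_metadata_param(xlsx_path: str) -> dict:
        """Encode a local Excel file for the 'excel_metadata' parameter (hypothetical helper)."""
        raw = Path(xlsx_path).read_bytes()
        return {
            'data': base64.b64encode(raw).decode('ascii'),  # decoded by the step with validate=True
            'filename': Path(xlsx_path).name,               # becomes the temp file name during extraction
        }

    params = {'excel_metadata': build_excel_metadata_param('meta.xlsx')}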

synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py
@@ -24,51 +24,152 @@ class OrganizeFilesStep(BaseStep):
             return self.create_error_result('File specifications not available')
 
         try:
-            # Create type directories mapping
-            type_dirs = {}
-            for spec in context.file_specifications:
-                spec_name = spec['name']
-                spec_dir = context.pathlib_cwd / spec_name
-                if spec_dir.exists() and spec_dir.is_dir():
-                    type_dirs[spec_name] = spec_dir
-
-            if type_dirs:
-                context.run.log_message_with_code(LogCode.TYPE_DIRECTORIES_FOUND, list(type_dirs.keys()))
+            # Check which mode we're in
+            use_single_path = context.get_param('use_single_path', True)
+
+            if use_single_path:
+                # Single path mode: all assets use same base path
+                return self._execute_single_path_mode(context, file_discovery_strategy)
             else:
-                context.run.log_message_with_code(LogCode.NO_TYPE_DIRECTORIES)
-                return self.create_success_result(data={'organized_files': []})
+                # Multi-path mode: each asset has its own path
+                return self._execute_multi_path_mode(context, file_discovery_strategy)
 
-            context.run.log_message_with_code(LogCode.TYPE_STRUCTURE_DETECTED)
-            context.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)
+        except Exception as e:
+            return self.create_error_result(f'File organization failed: {str(e)}')
 
-            # Discover files in type directories
-            all_files = []
-            is_recursive = context.get_param('is_recursive', True)
+    def _execute_single_path_mode(self, context: UploadContext, file_discovery_strategy) -> StepResult:
+        """Execute file organization in single path mode (traditional)."""
+        # Create type directories mapping
+        type_dirs = {}
+        for spec in context.file_specifications:
+            spec_name = spec['name']
+            spec_dir = context.pathlib_cwd / spec_name
+            if spec_dir.exists() and spec_dir.is_dir():
+                type_dirs[spec_name] = spec_dir
+
+        if type_dirs:
+            context.run.log_message_with_code(LogCode.TYPE_DIRECTORIES_FOUND, list(type_dirs.keys()))
+        else:
+            context.run.log_message_with_code(LogCode.NO_TYPE_DIRECTORIES)
+            return self.create_success_result(data={'organized_files': []})
+
+        context.run.log_message_with_code(LogCode.TYPE_STRUCTURE_DETECTED)
+        context.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)
+
+        # Discover files in type directories
+        all_files = []
+        is_recursive = context.get_param('is_recursive', True)
+
+        for spec_name, dir_path in type_dirs.items():
+            files_in_dir = file_discovery_strategy.discover(dir_path, is_recursive)
+            all_files.extend(files_in_dir)
+
+        if not all_files:
+            context.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
+            return self.create_success_result(data={'organized_files': []})
+
+        # Organize files using strategy
+        organized_files = file_discovery_strategy.organize(
+            all_files, context.file_specifications, context.metadata or {}, type_dirs
+        )
+
+        if organized_files:
+            context.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(organized_files))
+            context.add_organized_files(organized_files)
+
+        return self.create_success_result(
+            data={'organized_files': organized_files},
+            rollback_data={'files_count': len(organized_files), 'type_dirs': list(type_dirs.keys())},
+        )
+
+    def _execute_multi_path_mode(self, context: UploadContext, file_discovery_strategy) -> StepResult:
+        """Execute file organization in multi-path mode (each asset has own path)."""
+        from synapse_sdk.utils.storage import get_pathlib
+
+        assets = context.get_param('assets', {})
+        if not assets:
+            return self.create_error_result('Multi-path mode requires assets configuration')
+
+        # Validate that all required specs have asset paths
+        required_specs = [spec['name'] for spec in context.file_specifications if spec.get('is_required', False)]
+        missing_required = [spec for spec in required_specs if spec not in assets]
+
+        if missing_required:
+            return self.create_error_result(
+                f'Multi-path mode requires asset paths for required specs: {", ".join(missing_required)}'
+            )
 
-            for spec_name, dir_path in type_dirs.items():
-                files_in_dir = file_discovery_strategy.discover(dir_path, is_recursive)
-                all_files.extend(files_in_dir)
+        context.run.log_message(f'Using multi-path mode with {len(assets)} asset configurations')
+        context.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)
+
+        # Collect all files and specs first
+        all_files = []
+        type_dirs = {}
+        specs_with_files = []
+
+        for spec in context.file_specifications:
+            spec_name = spec['name']
+            is_required = spec.get('is_required', False)
+
+            # Skip if no asset configuration for this spec (only allowed for optional specs)
+            if spec_name not in assets:
+                if is_required:
+                    # This should not happen due to validation above, but double-check
+                    return self.create_error_result(f'Required spec {spec_name} missing asset path')
+                context.run.log_message(f'Skipping optional spec {spec_name}: no asset path configured')
+                continue
+
+            asset_config = assets[spec_name]
+
+            # Get the asset path from storage
+            try:
+                asset_path = get_pathlib(context.storage, asset_config.get('path', ''))
+                type_dirs[spec_name] = asset_path
+            except Exception as e:
+                context.run.log_message(f'Error accessing path for {spec_name}: {str(e)}', 'WARNING')
+                continue
+
+            if not asset_path.exists():
+                context.run.log_message(f'Path does not exist for {spec_name}: {asset_config.path}', 'WARNING')
+                continue
+
+            # Discover files for this asset
+            is_recursive = asset_config.get('is_recursive', True)
+            context.run.log_message(
+                f'Discovering files for {spec_name} at {asset_config.get("path", "")} (recursive={is_recursive})'
+            )
 
-            if not all_files:
-                context.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
-                return self.create_success_result(data={'organized_files': []})
+            files = file_discovery_strategy.discover(asset_path, is_recursive)
 
-            # Organize files using strategy
-            organized_files = file_discovery_strategy.organize(
-                all_files, context.file_specifications, context.metadata or {}, type_dirs
-            )
+            if not files:
+                context.run.log_message(f'No files found for {spec_name}', 'WARNING')
+                continue
 
-            if organized_files:
-                context.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(organized_files))
-                context.add_organized_files(organized_files)
+            all_files.extend(files)
+            specs_with_files.append(spec)
+            context.run.log_message(f'Found {len(files)} files for {spec_name}')
 
-            return self.create_success_result(
-                data={'organized_files': organized_files},
-                rollback_data={'files_count': len(organized_files), 'type_dirs': list(type_dirs.keys())},
+        # Organize all files together to group by dataset_key
+        all_organized_files = []
+        if all_files and specs_with_files:
+            context.run.log_message(f'Organizing {len(all_files)} files across {len(specs_with_files)} specs')
+            context.run.log_message(f'Type directories: {list(type_dirs.keys())}')
+
+            all_organized_files = file_discovery_strategy.organize(
+                all_files, specs_with_files, context.metadata or {}, type_dirs
             )
 
-        except Exception as e:
-            return self.create_error_result(f'File organization failed: {str(e)}')
+        if all_organized_files:
+            context.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(all_organized_files))
+            context.run.log_message(f'Created {len(all_organized_files)} data units from {len(all_files)} files')
+            context.add_organized_files(all_organized_files)
+        else:
+            context.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
+
+        return self.create_success_result(
+            data={'organized_files': all_organized_files},
+            rollback_data={'files_count': len(all_organized_files), 'type_dirs': list(type_dirs.keys())},
+        )
 
     def can_skip(self, context: UploadContext) -> bool:
        """File organization cannot be skipped."""

@@ -82,8 +183,17 @@ class OrganizeFilesStep(BaseStep):
 
     def validate_prerequisites(self, context: UploadContext) -> None:
         """Validate prerequisites for file organization."""
-        if not context.pathlib_cwd:
-            raise ValueError('Working directory path not set')
+        use_single_path = context.get_param('use_single_path', True)
+
+        # In single-path mode, pathlib_cwd is required
+        if use_single_path and not context.pathlib_cwd:
+            raise ValueError('Working directory path not set in single-path mode')
+
+        # In multi-path mode, pathlib_cwd is optional (each asset has its own path)
+        if not use_single_path:
+            assets = context.get_param('assets', {})
+            if not assets:
+                raise ValueError('Multi-path mode requires assets configuration')
 
         if not context.file_specifications:
             raise ValueError('File specifications not available')
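
A hedged sketch of what a multi-path 'assets' configuration might look like, based on _execute_multi_path_mode and validate_prerequisites above: every spec marked is_required in the file specifications must have an entry, optional specs may be omitted, and each entry carries its own storage path and recursion flag. The spec names and paths below are hypothetical.

    params = {
        'use_single_path': False,
        'assets': {
            # Required specs must all appear here, otherwise the step returns an error.
            'image': {'path': 'raw/images', 'is_recursive': True},
            'label': {'path': 'raw/labels', 'is_recursive': False},
            # Optional specs with no entry are skipped with a log message.
        },
    }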

synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py
@@ -34,7 +34,9 @@ class UploadFilesStep(BaseStep):
         context.run.log_message_with_code(LogCode.UPLOADING_DATA_FILES)
 
         # Initialize metrics
-        context.update_metrics('data_files', {'stand_by': organized_files_count, 'success': 0, 'failed': 0})
+        initial_metrics = {'stand_by': organized_files_count, 'success': 0, 'failed': 0}
+        context.update_metrics('data_files', initial_metrics)
+        context.run.set_metrics(initial_metrics, category='data_files')
 
         # Create upload configuration
         upload_config = UploadConfig(

@@ -55,10 +57,13 @@ class UploadFilesStep(BaseStep):
             context.run.log_data_file(uploaded_file, UploadStatus.SUCCESS)
 
         # Update final metrics
-        context.update_metrics(
-            'data_files',
-            {'stand_by': 0, 'success': len(uploaded_files), 'failed': organized_files_count - len(uploaded_files)},
-        )
+        final_metrics = {
+            'stand_by': 0,
+            'success': len(uploaded_files),
+            'failed': organized_files_count - len(uploaded_files),
+        }
+        context.update_metrics('data_files', final_metrics)
+        context.run.set_metrics(final_metrics, category='data_files')
 
         # Complete progress
         context.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
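
For reference, the metric payloads that are now mirrored to run.set_metrics in the hunks above follow this shape; the counts here are hypothetical.

    organized_files_count, uploaded = 10, 8
    initial_metrics = {'stand_by': organized_files_count, 'success': 0, 'failed': 0}
    final_metrics = {'stand_by': 0, 'success': uploaded, 'failed': organized_files_count - uploaded}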

synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py
@@ -10,7 +10,11 @@ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
 
     def discover(self, path: Path, recursive: bool) -> List[Path]:
         """Discover files non-recursively in the given path."""
-        return [file_path for file_path in path.glob('*') if file_path.is_file()]
+        # Exclude system files
+        excluded_files = {'.DS_Store', 'Thumbs.db', 'desktop.ini'}
+        return [
+            file_path for file_path in path.glob('*') if file_path.is_file() and file_path.name not in excluded_files
+        ]
 
     def organize(self, files: List[Path], specs: Dict, metadata: Dict, type_dirs: Dict = None) -> List[Dict]:
         """Organize files according to specifications with metadata."""

@@ -39,9 +43,14 @@ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
         # Performance optimization 2: Build metadata index for faster lookups
         metadata_index = self._build_metadata_index(metadata)
 
-        # Group files by dataset (flat discovery)
+        # Group files by dataset_key (stem-based matching) - flat discovery (no subdirectories)
+        # Strategy:
+        # 1. Group all files (required + optional) by their file stem
+        # 2. Only create data units for groups that have ALL required files
+        # 3. Optional files are automatically included if they match the stem
         dataset_files = {}
         required_specs = [spec['name'] for spec in specs if spec.get('is_required', False)]
+        optional_specs = [spec['name'] for spec in specs if not spec.get('is_required', False)]
 
         for file_path in files:
             # Determine which type directory this file belongs to

@@ -64,20 +73,36 @@ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
                 # If stat fails, keep existing file
                 pass
 
-        # Create organized files for datasets with all required files
+        # Create organized files ONLY for datasets with ALL required files
+        # Optional files are included automatically if they match the stem
         for file_name, files_dict in sorted(dataset_files.items()):
-            if all(req in files_dict for req in required_specs):
-                # Calculate most common file extension (optimized)
+            # Check if all required files are present
+            has_all_required = all(req in files_dict for req in required_specs)
+
+            if has_all_required:
+                # Extract original file stem from actual file paths (more reliable)
+                # Collect stems from all files in the group
+                file_stems = {}
                 file_extensions = {}
+
                 for file_path in files_dict.values():
+                    stem = file_path.stem
                     ext = file_path.suffix.lower()
+
+                    # Count stems (to handle multiple files with slightly different names)
+                    if stem:
+                        file_stems[stem] = file_stems.get(stem, 0) + 1
+
+                    # Count extensions
                     if ext:
                         file_extensions[ext] = file_extensions.get(ext, 0) + 1
 
+                # Use the most common stem (usually they're all the same)
+                original_stem = max(file_stems, key=file_stems.get) if file_stems else file_name
                 origin_file_extension = max(file_extensions, key=file_extensions.get) if file_extensions else ''
 
                 meta_data = {
-                    'origin_file_stem': file_name,
+                    'origin_file_stem': original_stem,
                     'origin_file_extension': origin_file_extension,
                     'created_at': datetime.now().isoformat(),
                 }
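
A standalone illustration (not SDK code) of the stem-based grouping rule this strategy now documents: files from each spec directory are keyed by Path.stem, and a data unit is created only when every required spec has a file with that stem, with optional files riding along when their stem matches. The spec names and file names below are hypothetical.

    from pathlib import Path

    files_by_spec = {
        'image': [Path('image/0001.jpg'), Path('image/0002.jpg')],  # required
        'label': [Path('label/0001.json')],                         # required
        'meta': [Path('meta/0001.txt')],                            # optional
    }
    required = {'image', 'label'}

    # Group every file by its stem, then keep only groups that cover all required specs.
    groups = {}
    for spec, paths in files_by_spec.items():
        for p in paths:
            groups.setdefault(p.stem, {})[spec] = p

    data_units = [g for g in groups.values() if required.issubset(g)]
    # '0001' yields a data unit (image + label + meta); '0002' is dropped because no label matches.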