synapse-sdk 2025.10.3__py3-none-any.whl → 2025.10.5__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.



Files changed (29)
  1. synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/pre-annotation-plugin-overview.md +198 -0
  2. synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/to-task-action-development.md +1645 -0
  3. synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/to-task-overview.md +717 -0
  4. synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/to-task-template-development.md +1380 -0
  5. synapse_sdk/devtools/docs/docs/plugins/categories/upload-plugins/upload-plugin-overview.md +25 -0
  6. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/pre-annotation-plugin-overview.md +198 -0
  7. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/to-task-action-development.md +1645 -0
  8. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/to-task-overview.md +717 -0
  9. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/to-task-template-development.md +1380 -0
  10. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/upload-plugins/upload-plugin-overview.md +25 -0
  11. synapse_sdk/devtools/docs/sidebars.ts +14 -0
  12. synapse_sdk/plugins/categories/export/actions/export/action.py +8 -3
  13. synapse_sdk/plugins/categories/export/actions/export/utils.py +108 -8
  14. synapse_sdk/plugins/categories/upload/actions/upload/action.py +8 -1
  15. synapse_sdk/plugins/categories/upload/actions/upload/context.py +0 -1
  16. synapse_sdk/plugins/categories/upload/actions/upload/steps/generate.py +6 -2
  17. synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py +24 -9
  18. synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py +27 -7
  19. synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py +45 -12
  20. synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py +10 -5
  21. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py +31 -6
  22. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py +65 -37
  23. synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py +17 -2
  24. {synapse_sdk-2025.10.3.dist-info → synapse_sdk-2025.10.5.dist-info}/METADATA +1 -1
  25. {synapse_sdk-2025.10.3.dist-info → synapse_sdk-2025.10.5.dist-info}/RECORD +29 -21
  26. {synapse_sdk-2025.10.3.dist-info → synapse_sdk-2025.10.5.dist-info}/WHEEL +0 -0
  27. {synapse_sdk-2025.10.3.dist-info → synapse_sdk-2025.10.5.dist-info}/entry_points.txt +0 -0
  28. {synapse_sdk-2025.10.3.dist-info → synapse_sdk-2025.10.5.dist-info}/licenses/LICENSE +0 -0
  29. {synapse_sdk-2025.10.3.dist-info → synapse_sdk-2025.10.5.dist-info}/top_level.txt +0 -0
@@ -97,6 +97,31 @@ sidebar_position: 1
  }
  ```

+ **Optional file specifications:**
+
+ In multi-path mode, file specifications from the data collection's file specification template can be marked as optional:
+
+ - **Required specs** (`is_required: true`): an asset path must be present in the `assets` parameter
+ - **Optional specs** (`is_required: false`): may be omitted from `assets` - the system skips them
+
+ Example with an optional spec omitted:
+
+ ```json
+ {
+ "name": "Multi-source upload",
+ "use_single_path": false,
+ "assets": {
+ "pcd_1": {"path": "/sensors/lidar", "is_recursive": false},
+ "image_1": {"path": "/cameras/front", "is_recursive": true}
+ // "json_meta_1" is optional and omitted
+ },
+ "storage": 1,
+ "data_collection": 5
+ }
+ ```
+
+ System log: `"Skipping optional spec json_meta_1: no asset path configured"`
+
  ## Basic Usage

  ### CLI Usage
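
For contrast with the multi-path example above, here is a minimal single-path sketch. The field names (`name`, `use_single_path`, `path`, `storage`, `data_collection`) come from the example and the upload steps in this diff, but the exact parameter schema belongs to the plugin, so treat this only as an illustration of the default mode, where a single global `path` is required and per-asset paths are not used.

```python
# Illustrative sketch only (not taken from the package docs):
# with use_single_path=True (the default), InitializeStep requires a global "path"
# and resolves it against the configured storage.
params = {
    "name": "Single-source upload",
    "use_single_path": True,
    "path": "/dataset/root",      # required in single-path mode
    "storage": 1,
    "data_collection": 5,
}
```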
@@ -54,6 +54,20 @@ const sidebars: SidebarsConfig = {
  'plugins/categories/upload-plugins/upload-plugin-template',
  ],
  },
+ {
+ type: 'category',
+ label: 'Pre-annotation Plugins',
+ link: {
+ type: 'doc',
+ id: 'plugins/categories/pre-annotation-plugins/pre-annotation-plugin-overview',
+ },
+ items: [
+ 'plugins/categories/pre-annotation-plugins/pre-annotation-plugin-overview',
+ 'plugins/categories/pre-annotation-plugins/to-task-overview',
+ 'plugins/categories/pre-annotation-plugins/to-task-action-development',
+ 'plugins/categories/pre-annotation-plugins/to-task-template-development',
+ ],
+ },
  ],
  },
  {
@@ -95,11 +95,11 @@ class ExportAction(Action):
  PydanticCustomError: If data retrieval fails
  """
  try:
- result_list = handler.get_results(self.client, filters)
+ result_list = handler.get_results(self.client, filters, run=self.run)
  results = result_list[0]
  count = result_list[1]
  except ClientError:
- raise PydanticCustomError('client_error', _('Unable to get Ground Truth dataset.'))
+ raise PydanticCustomError('client_error', _('Unable to get dataset.'))
  return results, count

  def start(self) -> Dict[str, Any]:
@@ -116,7 +116,12 @@ class ExportAction(Action):
  """
  self.run.log_message_with_code(LogCode.EXPORT_STARTED)

- filters = {'expand': 'data', **self.params['filter']}
+ # Get expand setting from config, default to True (expand data)
+ filters = {**self.params['filter']}
+ data_expand = self.config.get('data_expand', True)
+ if data_expand:
+ filters['expand'] = 'data'
+
  target = self.params['target']
  handler = TargetHandlerFactory.get_handler(target)

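To make the new `data_expand` behavior concrete, here is a small standalone sketch that mirrors the filter-building logic added above. How `self.config` is populated is outside this diff, so the plain dicts below are only illustrative.

```python
# Illustrative sketch of the data_expand logic shown in the diff above.
def build_filters(filter_params: dict, config: dict) -> dict:
    filters = {**filter_params}
    if config.get('data_expand', True):  # default True keeps the previous behavior
        filters['expand'] = 'data'
    return filters

print(build_filters({'project': 3}, {}))                      # {'project': 3, 'expand': 'data'}
print(build_filters({'project': 3}, {'data_expand': False}))  # {'project': 3}
```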
@@ -1,10 +1,12 @@
  from abc import ABC, abstractmethod
- from typing import Any
+ from typing import Any, Optional
+ import time

  from pydantic_core import PydanticCustomError

  from synapse_sdk.clients.exceptions import ClientError
  from synapse_sdk.i18n import gettext as _
+ from synapse_sdk.shared.enums import Context


  class ExportTargetHandler(ABC):
@@ -15,6 +17,103 @@ class ExportTargetHandler(ABC):
  of methods to validate filters, retrieve results, and process collections of results.
  """

+ # TODO: This is a temporary workaround and needs improvement in the future
+ def _get_results_chunked(self, list_method, filters, chunk_size=100, max_retries=3, retry_delay=1, run=None):
+ """
+ Retrieve results in chunks to avoid memory and response size limits.
+
+ Args:
+ list_method: The client method to call (e.g., client.list_assignments)
+ filters (dict): The filter criteria to apply
+ chunk_size (int): Number of items to fetch per chunk
+ max_retries (int): Maximum number of retries for failed requests
+ retry_delay (int): Delay in seconds between retries
+
+ Returns:
+ tuple: A tuple containing the results generator and the total count
+ """
+ filters = filters.copy()
+ filters['page_size'] = chunk_size
+
+ page = 1
+ results = []
+ total_count = 0
+
+ try:
+ while True:
+ filters['page'] = page
+
+ # Retry logic for handling temporary server issues
+ for attempt in range(max_retries + 1):
+ try:
+ response = list_method(params=filters, list_all=False)
+ break
+ except ClientError as e:
+ error_msg = str(e)
+
+ # Use log_dev_event for better debugging and monitoring
+ if run:
+ run.log_dev_event(
+ 'Chunked data retrieval error',
+ {
+ 'page': page,
+ 'attempt': attempt + 1,
+ 'error_message': error_msg,
+ 'chunk_size': chunk_size,
+ },
+ level=Context.WARNING,
+ )
+
+ # Check for JSON decode errors specifically
+ if 'Expecting value' in error_msg or 'JSONDecodeError' in error_msg:
+ if run:
+ run.log_dev_event(
+ 'JSON parsing error - skipping page',
+ {'page': page, 'error_type': 'JSON_DECODE_ERROR', 'error_details': error_msg},
+ level=Context.DANGER,
+ )
+ # Skip this page and continue with next
+ page += 1
+ break
+ elif attempt < max_retries and ('503' in error_msg or 'connection' in error_msg.lower()):
+ retry_delay_seconds = retry_delay * (2**attempt)
+ if run:
+ run.log_dev_event(
+ 'Server issue - retrying with backoff',
+ {
+ 'page': page,
+ 'retry_attempt': attempt + 1,
+ 'max_retries': max_retries,
+ 'retry_delay_seconds': retry_delay_seconds,
+ 'error_type': 'SERVER_ISSUE',
+ },
+ level=Context.INFO,
+ )
+ time.sleep(retry_delay_seconds) # Exponential backoff
+ continue
+ else:
+ raise
+
+ if page == 1:
+ total_count = response['count']
+
+ current_results = response.get('results', [])
+ results.extend(current_results)
+
+ # Check if we've got all results or if there are no more results
+ if len(current_results) < chunk_size or not response.get('next'):
+ break
+
+ page += 1
+
+ # Small delay between pages to avoid overwhelming the server
+ time.sleep(0.1)
+
+ return results, total_count
+ except Exception:
+ # Re-raise the exception to be handled by the calling method
+ raise
+
  @abstractmethod
  def validate_filter(self, value: dict, client: Any):
  """
@@ -33,13 +132,14 @@ class ExportTargetHandler(ABC):
  pass

  @abstractmethod
- def get_results(self, client: Any, filters: dict):
+ def get_results(self, client: Any, filters: dict, run=None):
  """
  Retrieve original data from target sources.

  Args:
  client (Any): The client used to retrieve the results.
  filters (dict): The filter criteria to apply.
+ run: Optional ExportRun instance for logging.

  Returns:
  tuple: A tuple containing the results and the total count of results.
@@ -76,8 +176,8 @@ class AssignmentExportTargetHandler(ExportTargetHandler):
  raise PydanticCustomError('client_error', _('Unable to get Assignment.'))
  return value

- def get_results(self, client: Any, filters: dict):
- return client.list_assignments(params=filters, list_all=True)
+ def get_results(self, client: Any, filters: dict, run=None):
+ return self._get_results_chunked(client.list_assignments, filters, run=run)

  def get_export_item(self, results):
  for result in results:
@@ -104,9 +204,9 @@ class GroundTruthExportTargetHandler(ExportTargetHandler):
  raise PydanticCustomError('client_error', _('Unable to get Ground Truth dataset version.'))
  return value

- def get_results(self, client: Any, filters: dict):
+ def get_results(self, client: Any, filters: dict, run=None):
  filters['ground_truth_dataset_versions'] = filters.pop('ground_truth_dataset_version')
- return client.list_ground_truth_events(params=filters, list_all=True)
+ return self._get_results_chunked(client.list_ground_truth_events, filters, run=run)

  def get_export_item(self, results):
  for result in results:
@@ -134,9 +234,9 @@ class TaskExportTargetHandler(ExportTargetHandler):
  raise PydanticCustomError('client_error', _('Unable to get Task.'))
  return value

- def get_results(self, client: Any, filters: dict):
+ def get_results(self, client: Any, filters: dict, run=None):
  filters['expand'] = ['data_unit', 'assignment', 'workshop']
- return client.list_tasks(params=filters, list_all=True)
+ return self._get_results_chunked(client.list_tasks, filters, run=run)

  def get_export_item(self, results):
  for result in results:
@@ -173,11 +173,18 @@ class UploadAction(Action):
  organized_files = context.get('organized_files', [])
  file_specification_template = context.get('file_specification_template', {})
  pathlib_cwd = context.get('pathlib_cwd')
+ use_single_path = context.get_param('use_single_path', True)

- if not organized_files or not file_specification_template or not pathlib_cwd:
+ # Validate required data based on mode
+ if not organized_files or not file_specification_template:
  raise ActionError('Required data not available from workflow steps')

+ # In single-path mode, pathlib_cwd is required
+ if use_single_path and not pathlib_cwd:
+ raise ActionError('pathlib_cwd is required in single-path mode')
+
  # CRITICAL: Integrate with existing uploader mechanism
+ # In multi-path mode, pathlib_cwd may be None, but uploader should still work
  uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files, self.params)
  organized_files = uploader.handle_upload_files()

@@ -87,7 +87,6 @@ class UploadContext:
  'generated_data_units_count': len(self.data_units),
  'success': len(self.errors) == 0,
  'errors': self.errors,
- 'metrics': self.metrics,
  }

  def has_errors(self) -> bool:
@@ -31,7 +31,9 @@ class GenerateDataUnitsStep(BaseStep):
  context.run.log_message_with_code(LogCode.GENERATING_DATA_UNITS)

  # Initialize metrics
- context.update_metrics('data_units', {'stand_by': upload_result_count, 'success': 0, 'failed': 0})
+ initial_metrics = {'stand_by': upload_result_count, 'success': 0, 'failed': 0}
+ context.update_metrics('data_units', initial_metrics)
+ context.run.set_metrics(initial_metrics, category='data_units')

  # Get batch size from parameters
  batch_size = context.get_param('creating_data_unit_batch_size', 1)
@@ -49,7 +51,9 @@ class GenerateDataUnitsStep(BaseStep):
  )

  # Update final metrics
- context.update_metrics('data_units', {'stand_by': 0, 'success': len(generated_data_units), 'failed': 0})
+ final_metrics = {'stand_by': 0, 'success': len(generated_data_units), 'failed': 0}
+ context.update_metrics('data_units', final_metrics)
+ context.run.set_metrics(final_metrics, category='data_units')

  # Complete progress
  context.run.set_progress(upload_result_count, upload_result_count, category='generate_data_units')
@@ -29,19 +29,34 @@ class InitializeStep(BaseStep):
  except Exception as e:
  return self.create_error_result(f'Failed to get storage {storage_id}: {str(e)}')

- # Get and validate path
+ # Check if we're in multi-path mode
+ use_single_path = context.get_param('use_single_path', True)
+
+ # Get and validate path (only required in single-path mode)
  path = context.get_param('path')
- if path is None:
- return self.create_error_result('Path parameter is required')
+ pathlib_cwd = None

- try:
- pathlib_cwd = get_pathlib(storage, path)
- context.set_pathlib_cwd(pathlib_cwd)
- except Exception as e:
- return self.create_error_result(f'Failed to get path {path}: {str(e)}')
+ if use_single_path:
+ # Single-path mode: global path is required
+ if path is None:
+ return self.create_error_result('Path parameter is required in single-path mode')
+
+ try:
+ pathlib_cwd = get_pathlib(storage, path)
+ context.set_pathlib_cwd(pathlib_cwd)
+ except Exception as e:
+ return self.create_error_result(f'Failed to get path {path}: {str(e)}')
+ else:
+ # Multi-path mode: global path is optional (each asset has its own path)
+ if path:
+ try:
+ pathlib_cwd = get_pathlib(storage, path)
+ context.set_pathlib_cwd(pathlib_cwd)
+ except Exception as e:
+ return self.create_error_result(f'Failed to get path {path}: {str(e)}')

  # Return success with rollback data
- rollback_data = {'storage_id': storage_id, 'path': path}
+ rollback_data = {'storage_id': storage_id, 'path': path, 'use_single_path': use_single_path}

  return self.create_success_result(
  data={'storage': storage, 'pathlib_cwd': pathlib_cwd}, rollback_data=rollback_data
@@ -61,9 +61,15 @@ class ProcessMetadataStep(BaseStep):
  excel_metadata = metadata_strategy.extract(excel_path)
  else:
  # Look for default metadata files (meta.xlsx, meta.xls)
- excel_path = self._find_excel_metadata_file(context.pathlib_cwd)
- if excel_path:
- excel_metadata = metadata_strategy.extract(excel_path)
+ # Only possible in single-path mode where pathlib_cwd is set
+ if context.pathlib_cwd:
+ excel_path = self._find_excel_metadata_file(context.pathlib_cwd)
+ if excel_path:
+ excel_metadata = metadata_strategy.extract(excel_path)
+ else:
+ context.run.log_message(
+ 'No Excel metadata specified and multi-path mode - skipping metadata processing'
+ )

  # Validate extracted metadata
  if excel_metadata:
@@ -136,9 +142,13 @@ class ProcessMetadataStep(BaseStep):
  if path.exists() and path.is_file():
  return path, False

- # Try relative to cwd
- path = context.pathlib_cwd / excel_path_str
- return (path, False) if path.exists() else (None, False)
+ # Try relative to cwd (only if pathlib_cwd is set)
+ if context.pathlib_cwd:
+ path = context.pathlib_cwd / excel_path_str
+ return (path, False) if path.exists() else (None, False)
+
+ # In multi-path mode without pathlib_cwd, can only use absolute paths
+ return (None, False)

  def _resolve_excel_path_from_base64(
  self, excel_config: dict | ExcelMetadataFile, context: UploadContext
@@ -179,7 +189,17 @@ class ProcessMetadataStep(BaseStep):
  return None, False

  def _find_excel_metadata_file(self, pathlib_cwd: Path) -> Path:
- """Find default Excel metadata file."""
+ """Find default Excel metadata file.
+
+ Args:
+ pathlib_cwd: Working directory path (must not be None)
+
+ Returns:
+ Path to Excel metadata file, or None if not found
+ """
+ if not pathlib_cwd:
+ return None
+
  # Check .xlsx first as it's more common
  excel_path = pathlib_cwd / 'meta.xlsx'
  if excel_path.exists():
@@ -90,25 +90,40 @@ class OrganizeFilesStep(BaseStep):
  if not assets:
  return self.create_error_result('Multi-path mode requires assets configuration')

+ # Validate that all required specs have asset paths
+ required_specs = [spec['name'] for spec in context.file_specifications if spec.get('is_required', False)]
+ missing_required = [spec for spec in required_specs if spec not in assets]
+
+ if missing_required:
+ return self.create_error_result(
+ f'Multi-path mode requires asset paths for required specs: {", ".join(missing_required)}'
+ )
+
  context.run.log_message(f'Using multi-path mode with {len(assets)} asset configurations')
  context.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)

- all_organized_files = []
+ # Collect all files and specs first
+ all_files = []
  type_dirs = {}
+ specs_with_files = []

  for spec in context.file_specifications:
  spec_name = spec['name']
+ is_required = spec.get('is_required', False)

- # Skip if no asset configuration for this spec
+ # Skip if no asset configuration for this spec (only allowed for optional specs)
  if spec_name not in assets:
- context.run.log_message(f'Skipping {spec_name}: no asset path configured')
+ if is_required:
+ # This should not happen due to validation above, but double-check
+ return self.create_error_result(f'Required spec {spec_name} missing asset path')
+ context.run.log_message(f'Skipping optional spec {spec_name}: no asset path configured')
  continue

  asset_config = assets[spec_name]

  # Get the asset path from storage
  try:
- asset_path = get_pathlib(context.storage, asset_config.path)
+ asset_path = get_pathlib(context.storage, asset_config.get('path', ''))
  type_dirs[spec_name] = asset_path
  except Exception as e:
  context.run.log_message(f'Error accessing path for {spec_name}: {str(e)}', 'WARNING')
@@ -119,9 +134,9 @@ class OrganizeFilesStep(BaseStep):
  continue

  # Discover files for this asset
- is_recursive = asset_config.is_recursive
+ is_recursive = asset_config.get('is_recursive', True)
  context.run.log_message(
- f'Discovering files for {spec_name} at {asset_config.path} (recursive={is_recursive})'
+ f'Discovering files for {spec_name} at {asset_config.get("path", "")} (recursive={is_recursive})'
  )

  files = file_discovery_strategy.discover(asset_path, is_recursive)
@@ -130,14 +145,23 @@ class OrganizeFilesStep(BaseStep):
  context.run.log_message(f'No files found for {spec_name}', 'WARNING')
  continue

- # Organize files for this specific spec
- organized = file_discovery_strategy.organize(files, [spec], context.metadata or {}, {spec_name: asset_path})
+ all_files.extend(files)
+ specs_with_files.append(spec)
+ context.run.log_message(f'Found {len(files)} files for {spec_name}')

- all_organized_files.extend(organized)
- context.run.log_message(f'Found {len(organized)} files for {spec_name}')
+ # Organize all files together to group by dataset_key
+ all_organized_files = []
+ if all_files and specs_with_files:
+ context.run.log_message(f'Organizing {len(all_files)} files across {len(specs_with_files)} specs')
+ context.run.log_message(f'Type directories: {list(type_dirs.keys())}')
+
+ all_organized_files = file_discovery_strategy.organize(
+ all_files, specs_with_files, context.metadata or {}, type_dirs
+ )

  if all_organized_files:
  context.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(all_organized_files))
+ context.run.log_message(f'Created {len(all_organized_files)} data units from {len(all_files)} files')
  context.add_organized_files(all_organized_files)
  else:
  context.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
@@ -159,8 +183,17 @@ class OrganizeFilesStep(BaseStep):

  def validate_prerequisites(self, context: UploadContext) -> None:
  """Validate prerequisites for file organization."""
- if not context.pathlib_cwd:
- raise ValueError('Working directory path not set')
+ use_single_path = context.get_param('use_single_path', True)
+
+ # In single-path mode, pathlib_cwd is required
+ if use_single_path and not context.pathlib_cwd:
+ raise ValueError('Working directory path not set in single-path mode')
+
+ # In multi-path mode, pathlib_cwd is optional (each asset has its own path)
+ if not use_single_path:
+ assets = context.get_param('assets', {})
+ if not assets:
+ raise ValueError('Multi-path mode requires assets configuration')

  if not context.file_specifications:
  raise ValueError('File specifications not available')
@@ -34,7 +34,9 @@ class UploadFilesStep(BaseStep):
  context.run.log_message_with_code(LogCode.UPLOADING_DATA_FILES)

  # Initialize metrics
- context.update_metrics('data_files', {'stand_by': organized_files_count, 'success': 0, 'failed': 0})
+ initial_metrics = {'stand_by': organized_files_count, 'success': 0, 'failed': 0}
+ context.update_metrics('data_files', initial_metrics)
+ context.run.set_metrics(initial_metrics, category='data_files')

  # Create upload configuration
  upload_config = UploadConfig(
@@ -55,10 +57,13 @@ class UploadFilesStep(BaseStep):
  context.run.log_data_file(uploaded_file, UploadStatus.SUCCESS)

  # Update final metrics
- context.update_metrics(
- 'data_files',
- {'stand_by': 0, 'success': len(uploaded_files), 'failed': organized_files_count - len(uploaded_files)},
- )
+ final_metrics = {
+ 'stand_by': 0,
+ 'success': len(uploaded_files),
+ 'failed': organized_files_count - len(uploaded_files),
+ }
+ context.update_metrics('data_files', final_metrics)
+ context.run.set_metrics(final_metrics, category='data_files')

  # Complete progress
  context.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
@@ -10,7 +10,11 @@ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):

  def discover(self, path: Path, recursive: bool) -> List[Path]:
  """Discover files non-recursively in the given path."""
- return [file_path for file_path in path.glob('*') if file_path.is_file()]
+ # Exclude system files
+ excluded_files = {'.DS_Store', 'Thumbs.db', 'desktop.ini'}
+ return [
+ file_path for file_path in path.glob('*') if file_path.is_file() and file_path.name not in excluded_files
+ ]

  def organize(self, files: List[Path], specs: Dict, metadata: Dict, type_dirs: Dict = None) -> List[Dict]:
  """Organize files according to specifications with metadata."""
@@ -39,9 +43,14 @@ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
  # Performance optimization 2: Build metadata index for faster lookups
  metadata_index = self._build_metadata_index(metadata)

- # Group files by dataset (flat discovery)
+ # Group files by dataset_key (stem-based matching) - flat discovery (no subdirectories)
+ # Strategy:
+ # 1. Group all files (required + optional) by their file stem
+ # 2. Only create data units for groups that have ALL required files
+ # 3. Optional files are automatically included if they match the stem
  dataset_files = {}
  required_specs = [spec['name'] for spec in specs if spec.get('is_required', False)]
+ optional_specs = [spec['name'] for spec in specs if not spec.get('is_required', False)]

  for file_path in files:
  # Determine which type directory this file belongs to
@@ -64,20 +73,36 @@ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
  # If stat fails, keep existing file
  pass

- # Create organized files for datasets with all required files
+ # Create organized files ONLY for datasets with ALL required files
+ # Optional files are included automatically if they match the stem
  for file_name, files_dict in sorted(dataset_files.items()):
- if all(req in files_dict for req in required_specs):
- # Calculate most common file extension (optimized)
+ # Check if all required files are present
+ has_all_required = all(req in files_dict for req in required_specs)
+
+ if has_all_required:
+ # Extract original file stem from actual file paths (more reliable)
+ # Collect stems from all files in the group
+ file_stems = {}
  file_extensions = {}
+
  for file_path in files_dict.values():
+ stem = file_path.stem
  ext = file_path.suffix.lower()
+
+ # Count stems (to handle multiple files with slightly different names)
+ if stem:
+ file_stems[stem] = file_stems.get(stem, 0) + 1
+
+ # Count extensions
  if ext:
  file_extensions[ext] = file_extensions.get(ext, 0) + 1

+ # Use the most common stem (usually they're all the same)
+ original_stem = max(file_stems, key=file_stems.get) if file_stems else file_name
  origin_file_extension = max(file_extensions, key=file_extensions.get) if file_extensions else ''

  meta_data = {
- 'origin_file_stem': file_name,
+ 'origin_file_stem': original_stem,
  'origin_file_extension': origin_file_extension,
  'created_at': datetime.now().isoformat(),
  }
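
To illustrate the stem-based grouping described in the comments above, here is a toy sketch. The file names and spec names are invented for illustration, and the real implementation also consults type directories and metadata, which this omits.

```python
from pathlib import Path

# Toy example: files sharing the stem "0001" form one candidate data unit.
# The unit is kept only if every required spec is present; optional specs
# (e.g. "json_meta_1") are included when a matching file exists.
files_by_spec = {
    'pcd_1': Path('/sensors/lidar/0001.pcd'),    # required
    'image_1': Path('/cameras/front/0001.jpg'),  # required
}
required_specs = ['pcd_1', 'image_1']

has_all_required = all(spec in files_by_spec for spec in required_specs)
print(has_all_required)  # True -> a data unit is created for stem "0001"
```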