synapse-sdk 1.0.0b24__py3-none-any.whl → 2025.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of synapse-sdk might be problematic.

Files changed (83)
  1. synapse_sdk/clients/agent/ray.py +50 -0
  2. synapse_sdk/devtools/docs/docs/api/clients/annotation-mixin.md +378 -0
  3. synapse_sdk/devtools/docs/docs/api/clients/backend.md +368 -1
  4. synapse_sdk/devtools/docs/docs/api/clients/core-mixin.md +477 -0
  5. synapse_sdk/devtools/docs/docs/api/clients/data-collection-mixin.md +422 -0
  6. synapse_sdk/devtools/docs/docs/api/clients/hitl-mixin.md +554 -0
  7. synapse_sdk/devtools/docs/docs/api/clients/index.md +391 -0
  8. synapse_sdk/devtools/docs/docs/api/clients/integration-mixin.md +571 -0
  9. synapse_sdk/devtools/docs/docs/api/clients/ml-mixin.md +578 -0
  10. synapse_sdk/devtools/docs/docs/api/clients/ray.md +23 -2
  11. synapse_sdk/devtools/docs/docs/plugins/developing-upload-template.md +1463 -0
  12. synapse_sdk/devtools/docs/docs/plugins/export-plugins.md +161 -34
  13. synapse_sdk/devtools/docs/docs/plugins/upload-plugins.md +1497 -213
  14. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/annotation-mixin.md +289 -0
  15. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/backend.md +378 -11
  16. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/core-mixin.md +417 -0
  17. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/data-collection-mixin.md +356 -0
  18. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/hitl-mixin.md +192 -0
  19. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/index.md +391 -0
  20. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/integration-mixin.md +479 -0
  21. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/ml-mixin.md +284 -0
  22. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/ray.md +23 -2
  23. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/developing-upload-template.md +1463 -0
  24. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/export-plugins.md +161 -34
  25. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/upload-plugins.md +1752 -572
  26. synapse_sdk/devtools/docs/sidebars.ts +7 -0
  27. synapse_sdk/plugins/README.md +1 -2
  28. synapse_sdk/plugins/categories/base.py +23 -0
  29. synapse_sdk/plugins/categories/export/actions/__init__.py +3 -0
  30. synapse_sdk/plugins/categories/export/actions/export/__init__.py +28 -0
  31. synapse_sdk/plugins/categories/export/actions/export/action.py +160 -0
  32. synapse_sdk/plugins/categories/export/actions/export/enums.py +113 -0
  33. synapse_sdk/plugins/categories/export/actions/export/exceptions.py +53 -0
  34. synapse_sdk/plugins/categories/export/actions/export/models.py +74 -0
  35. synapse_sdk/plugins/categories/export/actions/export/run.py +195 -0
  36. synapse_sdk/plugins/categories/export/actions/export/utils.py +187 -0
  37. synapse_sdk/plugins/categories/export/templates/plugin/__init__.py +1 -1
  38. synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +1 -2
  39. synapse_sdk/plugins/categories/upload/actions/upload/action.py +154 -531
  40. synapse_sdk/plugins/categories/upload/actions/upload/context.py +185 -0
  41. synapse_sdk/plugins/categories/upload/actions/upload/factory.py +143 -0
  42. synapse_sdk/plugins/categories/upload/actions/upload/models.py +66 -29
  43. synapse_sdk/plugins/categories/upload/actions/upload/orchestrator.py +182 -0
  44. synapse_sdk/plugins/categories/upload/actions/upload/registry.py +113 -0
  45. synapse_sdk/plugins/categories/upload/actions/upload/steps/__init__.py +1 -0
  46. synapse_sdk/plugins/categories/upload/actions/upload/steps/base.py +106 -0
  47. synapse_sdk/plugins/categories/upload/actions/upload/steps/cleanup.py +62 -0
  48. synapse_sdk/plugins/categories/upload/actions/upload/steps/collection.py +62 -0
  49. synapse_sdk/plugins/categories/upload/actions/upload/steps/generate.py +80 -0
  50. synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py +66 -0
  51. synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py +101 -0
  52. synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py +89 -0
  53. synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py +96 -0
  54. synapse_sdk/plugins/categories/upload/actions/upload/steps/validate.py +61 -0
  55. synapse_sdk/plugins/categories/upload/actions/upload/strategies/__init__.py +1 -0
  56. synapse_sdk/plugins/categories/upload/actions/upload/strategies/base.py +86 -0
  57. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/__init__.py +1 -0
  58. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/batch.py +39 -0
  59. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/single.py +34 -0
  60. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/__init__.py +1 -0
  61. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py +233 -0
  62. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py +238 -0
  63. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/__init__.py +1 -0
  64. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/excel.py +174 -0
  65. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/none.py +16 -0
  66. synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/__init__.py +1 -0
  67. synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/async_upload.py +109 -0
  68. synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/sync.py +43 -0
  69. synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/__init__.py +1 -0
  70. synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py +45 -0
  71. synapse_sdk/plugins/categories/upload/actions/upload/utils.py +194 -83
  72. synapse_sdk/plugins/categories/upload/templates/config.yaml +4 -0
  73. synapse_sdk/plugins/categories/upload/templates/plugin/__init__.py +269 -0
  74. synapse_sdk/plugins/categories/upload/templates/plugin/upload.py +71 -27
  75. synapse_sdk/plugins/models.py +5 -0
  76. {synapse_sdk-1.0.0b24.dist-info → synapse_sdk-2025.9.3.dist-info}/METADATA +3 -2
  77. {synapse_sdk-1.0.0b24.dist-info → synapse_sdk-2025.9.3.dist-info}/RECORD +81 -30
  78. synapse_sdk/plugins/categories/export/actions/export.py +0 -385
  79. synapse_sdk/plugins/categories/export/enums.py +0 -7
  80. {synapse_sdk-1.0.0b24.dist-info → synapse_sdk-2025.9.3.dist-info}/WHEEL +0 -0
  81. {synapse_sdk-1.0.0b24.dist-info → synapse_sdk-2025.9.3.dist-info}/entry_points.txt +0 -0
  82. {synapse_sdk-1.0.0b24.dist-info → synapse_sdk-2025.9.3.dist-info}/licenses/LICENSE +0 -0
  83. {synapse_sdk-1.0.0b24.dist-info → synapse_sdk-2025.9.3.dist-info}/top_level.txt +0 -0
synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py
@@ -0,0 +1,238 @@
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Dict, List
+
+ from ..base import FileDiscoveryStrategy
+
+
+ class RecursiveFileDiscoveryStrategy(FileDiscoveryStrategy):
+     """Recursive file discovery strategy."""
+
+     def discover(self, path: Path, recursive: bool) -> List[Path]:
+         """Discover files recursively in the given path."""
+         return [file_path for file_path in path.rglob('*') if file_path.is_file()]
+
+     def organize(self, files: List[Path], specs: Dict, metadata: Dict, type_dirs: Dict = None) -> List[Dict]:
+         """Organize files according to specifications with metadata."""
+         organized_files = []
+
+         # Use provided type_dirs or create fallback mapping
+         if type_dirs is None:
+             type_dirs = {}
+             for spec in specs:
+                 spec_name = spec['name']
+                 # Fallback: extract spec directory from file paths
+                 for file_path in files:
+                     # Check if this file's path contains the spec_name as a directory
+                     path_parts = file_path.parts
+                     if spec_name in path_parts:
+                         # Find the index of spec_name and reconstruct the path up to that directory
+                         spec_index = path_parts.index(spec_name)
+                         spec_dir = Path(*path_parts[: spec_index + 1])
+                         if spec_dir.exists() and spec_dir.is_dir():
+                             type_dirs[spec_name] = spec_dir
+                             break
+
+         if not type_dirs:
+             return organized_files
+
+         # Performance optimization 1: Path caching - avoid repeated string conversions
+         path_cache = {dir_path: str(dir_path) for dir_path in type_dirs.values()}
+
+         # Performance optimization 2: Build metadata index for faster lookups
+         metadata_index = self._build_metadata_index(metadata)
+
+         # Group files by dataset
+         dataset_files = {}
+         required_specs = [spec['name'] for spec in specs if spec.get('is_required', False)]
+
+         for file_path in files:
+             # Determine which type directory this file belongs to
+             file_path_str = str(file_path)
+             for spec_name, dir_path in type_dirs.items():
+                 dir_path_str = path_cache[dir_path]
+                 if file_path_str.startswith(dir_path_str):
+                     file_name = file_path.stem
+
+                     if file_name not in dataset_files:
+                         dataset_files[file_name] = {}
+
+                     if spec_name not in dataset_files[file_name]:
+                         dataset_files[file_name][spec_name] = file_path
+                     else:
+                         # Keep the most recent file - only stat when needed
+                         existing_file = dataset_files[file_name][spec_name]
+                         try:
+                             if file_path.stat().st_mtime > existing_file.stat().st_mtime:
+                                 dataset_files[file_name][spec_name] = file_path
+                         except (OSError, IOError):
+                             # If stat fails, keep existing file
+                             pass
+
+         # Create organized files for datasets with all required files
+         for file_name, files_dict in sorted(dataset_files.items()):
+             if all(req in files_dict for req in required_specs):
+                 # Calculate most common file extension (optimized)
+                 file_extensions = {}
+                 for file_path in files_dict.values():
+                     ext = file_path.suffix.lower()
+                     if ext:
+                         file_extensions[ext] = file_extensions.get(ext, 0) + 1
+
+                 origin_file_extension = max(file_extensions, key=file_extensions.get) if file_extensions else ''
+
+                 meta_data = {
+                     'origin_file_stem': file_name,
+                     'origin_file_extension': origin_file_extension,
+                     'created_at': datetime.now().isoformat(),
+                 }
+
+                 # Add metadata if available - using optimized index lookup
+                 if metadata_index:
+                     matched_metadata = self._find_matching_metadata_optimized(file_name, files_dict, metadata_index)
+                     if matched_metadata:
+                         meta_data.update(matched_metadata)
+
+                 organized_files.append({'files': files_dict, 'meta': meta_data})
+
+         return organized_files
+
+     def _build_metadata_index(self, metadata: Dict) -> Dict:
+         """Build metadata index for faster lookups."""
+         if not metadata:
+             return {}
+
+         metadata_index = {'exact_stem': {}, 'exact_name': {}, 'stem_lookup': {}, 'partial_paths': {}, 'full_paths': {}}
+
+         for meta_key, meta_value in metadata.items():
+             meta_path = Path(meta_key)
+
+             # Index by stem
+             stem = meta_path.stem
+             if stem:
+                 metadata_index['exact_stem'][stem] = meta_value
+                 metadata_index['stem_lookup'][stem] = meta_value
+
+             # Index by full name
+             name = meta_path.name
+             if name:
+                 metadata_index['exact_name'][name] = meta_value
+
+             # Index for partial path matching
+             metadata_index['partial_paths'][meta_key] = meta_value
+
+             # Index for full path matching
+             metadata_index['full_paths'][meta_key] = meta_value
+
+         return metadata_index
+
+     def _find_matching_metadata_optimized(self, file_name: str, files_dict: Dict, metadata_index: Dict) -> Dict:
+         """Find matching metadata using optimized index lookups."""
+         if not metadata_index:
+             return {}
+
+         # Strategy 1: Exact stem match (O(1) lookup)
+         if file_name in metadata_index['exact_stem']:
+             return metadata_index['exact_stem'][file_name]
+
+         # Strategy 2: Exact filename match with extension (O(1) lookup)
+         sample_file = list(files_dict.values())[0] if files_dict else None
+         if sample_file:
+             full_filename = f'{file_name}{sample_file.suffix}'
+             if full_filename in metadata_index['exact_name']:
+                 return metadata_index['exact_name'][full_filename]
+
+             # Try sample file name
+             sample_filename = sample_file.name
+             if sample_filename in metadata_index['exact_name']:
+                 return metadata_index['exact_name'][sample_filename]
+
+         # Strategy 3: Stem lookup (already optimized above)
+         # This is covered by exact_stem lookup
+
+         # Strategy 4 & 5: Partial and full path matching (fallback to original logic for complex cases)
+         if sample_file:
+             file_path_str = str(sample_file)
+             file_path_posix = sample_file.as_posix()
+
+             # Check partial paths
+             for meta_key in metadata_index['partial_paths']:
+                 if (
+                     meta_key in file_path_str
+                     or meta_key in file_path_posix
+                     or file_path_str in meta_key
+                     or file_path_posix in meta_key
+                 ):
+                     return metadata_index['partial_paths'][meta_key]
+
+         return {}
+
+     def _find_matching_metadata(self, file_name: str, files_dict: Dict, metadata: Dict) -> Dict:
+         """Find matching metadata using comprehensive pattern matching.
+
+         Matching priority:
+         1. Exact stem match (highest priority)
+         2. Exact filename match (with extension)
+         3. Metadata key stem matches file stem
+         4. Partial path matching
+         5. Full path matching
+         """
+         if not metadata:
+             return {}
+
+         # Get sample file for extension and path information
+         sample_file = list(files_dict.values())[0] if files_dict else None
+
+         # Strategy 1: Exact stem match (highest priority)
+         if file_name in metadata:
+             return metadata[file_name]
+
+         # Strategy 2: Exact filename match (with extension)
+         if sample_file:
+             full_filename = f'{file_name}{sample_file.suffix}'
+             if full_filename in metadata:
+                 return metadata[full_filename]
+
+             # Also try with sample file name
+             sample_filename = sample_file.name
+             if sample_filename in metadata:
+                 return metadata[sample_filename]
+
+         # Strategy 3: Metadata key stem matches file stem
+         for meta_key in metadata.keys():
+             meta_stem = Path(meta_key).stem
+             if meta_stem == file_name:
+                 return metadata[meta_key]
+
+         # Strategy 4: Partial path matching
+         if sample_file:
+             file_path_parts = sample_file.parts
+             for meta_key in metadata.keys():
+                 meta_path = Path(meta_key)
+                 # Check if any part of the metadata key matches our file path parts
+                 for part in file_path_parts:
+                     if part in str(meta_path) or str(meta_path) in part:
+                         # Additional validation: ensure it's a reasonable match
+                         if meta_path.stem == file_name or meta_path.name == sample_file.name or part == meta_path.stem:
+                             return metadata[meta_key]
+
+         # Strategy 5: Full path matching
+         if sample_file:
+             full_path_str = str(sample_file)
+             full_path_posix = sample_file.as_posix()
+
+             for meta_key in metadata.keys():
+                 # Direct path match
+                 if meta_key == full_path_str or meta_key == full_path_posix:
+                     return metadata[meta_key]
+
+                 # Relative path match (check if meta_key is contained in our path)
+                 if meta_key in full_path_str or meta_key in full_path_posix:
+                     return metadata[meta_key]
+
+                 # Reverse match (check if our path is contained in meta_key)
+                 if full_path_str in meta_key or full_path_posix in meta_key:
+                     return metadata[meta_key]
+
+         # No match found
+         return {}
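For orientation, here is a minimal, hypothetical sketch of how this discovery strategy might be driven. It assumes the class is importable from the module path shown in the file list above, that it takes a no-argument constructor, that `specs` is a list of dicts with `name`/`is_required` keys (the shape `organize()` reads), and that files live under directories named after each spec; none of this is taken from official SDK documentation.

```python
# Hypothetical usage sketch for RecursiveFileDiscoveryStrategy (import path and
# constructor are assumptions based on the new file layout shown in this diff).
from pathlib import Path

from synapse_sdk.plugins.categories.upload.actions.upload.strategies.file_discovery.recursive import (
    RecursiveFileDiscoveryStrategy,
)

strategy = RecursiveFileDiscoveryStrategy()

# Expected layout (assumed): /data/my_dataset/image/sample_001.jpg, /data/my_dataset/label/sample_001.json, ...
root = Path('/data/my_dataset')  # hypothetical directory
files = strategy.discover(root, recursive=True)  # rglob('*') is used regardless of the flag

# Specs follow the shape organize() reads: dicts with 'name' and 'is_required'.
specs = [
    {'name': 'image', 'is_required': True},
    {'name': 'label', 'is_required': False},
]

# Metadata keys may be file stems, names, or paths; matched values are merged into each item's 'meta'.
metadata = {'sample_001': {'scene': 'night'}}

organized = strategy.organize(files, specs, metadata)
for item in organized:
    print(item['meta']['origin_file_stem'], sorted(item['files']))
```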
synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/__init__.py
@@ -0,0 +1 @@
+ # Metadata strategy implementations
synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/excel.py
@@ -0,0 +1,174 @@
+ from io import BytesIO
+ from pathlib import Path
+ from typing import Any, Dict, Optional
+
+ from openpyxl import load_workbook
+ from openpyxl.utils.exceptions import InvalidFileException
+
+ from ...exceptions import ExcelParsingError, ExcelSecurityError
+ from ...utils import ExcelMetadataUtils, ExcelSecurityConfig
+ from ..base import MetadataStrategy, ValidationResult
+
+
+ class ExcelMetadataStrategy(MetadataStrategy):
+     """Excel metadata extraction strategy."""
+
+     def __init__(self):
+         self.excel_config = ExcelSecurityConfig()
+         self.excel_utils = ExcelMetadataUtils(self.excel_config)
+
+     def extract(self, source_path: Path) -> Dict[str, Dict[str, Any]]:
+         """Extract metadata from Excel file."""
+         try:
+             excel_stream = self._prepare_excel_file(source_path)
+             workbook = load_workbook(excel_stream, read_only=True, data_only=True)
+             try:
+                 return self._process_excel_worksheet(workbook.active)
+             finally:
+                 workbook.close()
+
+         except ExcelSecurityError:
+             raise
+         except ExcelParsingError:
+             raise
+         except InvalidFileException as e:
+             raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
+         except MemoryError:
+             raise ExcelSecurityError('Excel file exceeds memory limits')
+         except (OSError, IOError) as e:
+             raise ExcelParsingError(f'File access error: {str(e)}')
+         except Exception as e:
+             # Handle ZIP file errors (Excel files are ZIP archives)
+             if 'zip file' in str(e).lower():
+                 raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
+             raise ExcelParsingError(f'Unexpected error: {str(e)}')
+
+     def validate(self, metadata: Dict) -> ValidationResult:
+         """Validate extracted metadata."""
+         errors = []
+
+         if not isinstance(metadata, dict):
+             errors.append('Metadata must be a dictionary')
+             return ValidationResult(valid=False, errors=errors)
+
+         # Validate each file's metadata
+         for file_name, file_metadata in metadata.items():
+             if not isinstance(file_metadata, dict):
+                 errors.append(f"Metadata for file '{file_name}' must be a dictionary")
+                 continue
+
+             # Check filename length
+             if not self.excel_utils.is_valid_filename_length(file_name):
+                 errors.append(f"Filename '{file_name}' exceeds maximum length")
+
+         return ValidationResult(valid=len(errors) == 0, errors=errors)
+
+     def _validate_excel_security(self, excel_path: Path) -> None:
+         """Validate Excel file security constraints."""
+         file_size = excel_path.stat().st_size
+         if file_size > self.excel_config.MAX_FILE_SIZE_BYTES:
+             raise ExcelSecurityError(
+                 f'Excel file too large: {file_size} bytes (max: {self.excel_config.MAX_FILE_SIZE_BYTES})'
+             )
+
+         estimated_memory = file_size * 3
+         if estimated_memory > self.excel_config.MAX_MEMORY_USAGE_BYTES:
+             raise ExcelSecurityError(
+                 f'Excel file may consume too much memory: ~{estimated_memory} bytes '
+                 f'(max: {self.excel_config.MAX_MEMORY_USAGE_BYTES})'
+             )
+
+     def _prepare_excel_file(self, excel_path: Path) -> BytesIO:
+         """Prepare Excel file for processing."""
+         self._validate_excel_security(excel_path)
+         excel_bytes = excel_path.read_bytes()
+         return BytesIO(excel_bytes)
+
+     def _process_excel_headers(self, headers: tuple) -> tuple:
+         """Process Excel headers."""
+         if len(headers) < 2:
+             raise ExcelParsingError('Excel file must have at least 2 columns (file name and metadata)')
+
+         # Validate first column header (filename column)
+         first_header = str(headers[0]).strip().lower() if headers[0] else ''
+         valid_filename_headers = ['filename', 'file_name']
+
+         if first_header not in valid_filename_headers:
+             raise ExcelParsingError(
+                 f'First column header must be "filename" or "file_name", got: "{headers[0]}". '
+                 f'Valid options: {", ".join(valid_filename_headers)}'
+             )
+
+         self._validate_excel_content(headers, 0)
+         return headers
+
+     def _process_excel_data_row(self, row: tuple, headers: tuple) -> Optional[Dict[str, Any]]:
+         """Process a single Excel data row."""
+         if not row[0] or str(row[0]).strip() == '':
+             return None
+
+         file_name = str(row[0]).strip()
+         if not self.excel_utils.is_valid_filename_length(file_name):
+             return None
+
+         file_metadata: Dict[str, Any] = {}
+         for i, value in enumerate(row[1:], start=1):
+             if i < len(headers):  # Include empty strings, exclude only None values
+                 header_value = headers[i]
+                 column_name = str(header_value).strip() if header_value is not None else f'column_{i}'
+
+                 column_name = self.excel_utils.validate_and_truncate_string(
+                     column_name, self.excel_config.MAX_COLUMN_NAME_LENGTH
+                 )
+
+                 # Convert None to empty string, otherwise convert to string
+                 value_str = '' if value is None else str(value)
+                 str_value = self.excel_utils.validate_and_truncate_string(
+                     value_str, self.excel_config.MAX_METADATA_VALUE_LENGTH
+                 )
+                 file_metadata[column_name] = str_value
+
+         return {file_name: file_metadata} if file_metadata else None
+
+     def _process_excel_worksheet(self, worksheet) -> Dict[str, Dict[str, Any]]:
+         """Process Excel worksheet."""
+         if worksheet is None:
+             raise ExcelParsingError('Excel file has no active worksheet')
+
+         metadata_dict: Dict[str, Dict[str, Any]] = {}
+         headers: Optional[tuple] = None
+         data_row_count = 0
+         validation_interval = getattr(self.excel_config, 'VALIDATION_CHECK_INTERVAL', 1000)
+
+         for row_idx, row in enumerate(worksheet.iter_rows(values_only=True)):
+             # Quick check: if row is empty or first cell is empty, skip
+             if not row or not row[0] or str(row[0]).strip() == '':
+                 continue
+
+             if row_idx == 0:
+                 headers = self._process_excel_headers(row)
+                 continue
+
+             if headers is None:
+                 raise ExcelParsingError('Excel file missing header row')
+
+             data_row_count += 1
+
+             if data_row_count % validation_interval == 0:
+                 self._validate_excel_content(headers, data_row_count)
+
+             row_result = self._process_excel_data_row(row, headers)
+             if row_result:
+                 metadata_dict.update(row_result)
+
+         self._validate_excel_content(headers or (), data_row_count)
+
+         return metadata_dict
+
+     def _validate_excel_content(self, headers: tuple, row_count: int) -> None:
+         """Validate Excel content constraints."""
+         if len(headers) > self.excel_config.MAX_COLUMNS:
+             raise ExcelParsingError(f'Too many columns: {len(headers)} (max: {self.excel_config.MAX_COLUMNS})')
+
+         if row_count > self.excel_config.MAX_ROWS:
+             raise ExcelParsingError(f'Too many rows: {row_count} (max: {self.excel_config.MAX_ROWS})')
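As a rough illustration of the sheet layout this strategy accepts (first column header must be `filename` or `file_name`; remaining columns become per-file metadata keys), here is a hedged sketch that builds a tiny workbook with openpyxl and runs it through `extract`. The import path mirrors the new module layout in this diff and is an assumption, not documented API.

```python
# Hypothetical sketch: build a minimal metadata workbook and extract it.
from pathlib import Path
from tempfile import TemporaryDirectory

from openpyxl import Workbook

from synapse_sdk.plugins.categories.upload.actions.upload.strategies.metadata.excel import (
    ExcelMetadataStrategy,  # assumed import path, taken from the file layout above
)

with TemporaryDirectory() as tmp:
    xlsx_path = Path(tmp) / 'metadata.xlsx'

    wb = Workbook()
    ws = wb.active
    ws.append(['filename', 'scene', 'camera'])       # header row: first column must be filename/file_name
    ws.append(['sample_001.jpg', 'night', 'front'])  # one metadata row per file
    wb.save(xlsx_path)

    strategy = ExcelMetadataStrategy()
    metadata = strategy.extract(xlsx_path)
    # Expected shape: {'sample_001.jpg': {'scene': 'night', 'camera': 'front'}} (values are stringified)
    print(metadata)

    result = strategy.validate(metadata)
    print(result.valid)
```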
synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/none.py
@@ -0,0 +1,16 @@
+ from pathlib import Path
+ from typing import Any, Dict
+
+ from ..base import MetadataStrategy, ValidationResult
+
+
+ class NoneMetadataStrategy(MetadataStrategy):
+     """Metadata strategy that returns no metadata."""
+
+     def extract(self, source_path: Path) -> Dict[str, Dict[str, Any]]:
+         """Return empty metadata."""
+         return {}
+
+     def validate(self, metadata: Dict) -> ValidationResult:
+         """Always validates successfully for empty metadata."""
+         return ValidationResult(valid=True)
synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/__init__.py
@@ -0,0 +1 @@
+ # Upload strategy implementations
synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/async_upload.py
@@ -0,0 +1,109 @@
+ import asyncio
+ from pathlib import Path
+ from typing import Any, Dict, List
+
+ from synapse_sdk.clients.exceptions import ClientError
+
+ from ...enums import LogCode, UploadStatus
+ from ..base import UploadConfig, UploadStrategy
+
+
+ class AsyncUploadStrategy(UploadStrategy):
+     """Asynchronous upload strategy."""
+
+     def __init__(self, context):
+         self.context = context
+
+     def upload(self, files: List[Dict], config: UploadConfig) -> List[Dict]:
+         """Upload files asynchronously."""
+         # Use the run_async helper from the action
+         return self._run_async_upload(files, config)
+
+     def _run_async_upload(self, files: List[Dict], config: UploadConfig) -> List[Dict]:
+         """Run async upload using asyncio."""
+         import concurrent.futures
+
+         def _run_in_thread():
+             return asyncio.run(self._upload_files_async(files, config))
+
+         try:
+             # Check if we're already in an event loop
+             asyncio.get_running_loop()
+             # If we are, run in a separate thread
+             with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                 future = executor.submit(_run_in_thread)
+                 return future.result()
+         except RuntimeError:
+             # No event loop running, safe to run directly
+             return asyncio.run(self._upload_files_async(files, config))
+
+     async def _upload_files_async(self, organized_files: List[Dict], config: UploadConfig) -> List[Dict]:
+         """Upload files asynchronously with concurrency control."""
+         client = self.context.client
+         collection_id = self.context.get_param('data_collection')
+         upload_result = []
+
+         semaphore = asyncio.Semaphore(config.max_concurrent)
+
+         async def upload_single_file(organized_file):
+             async with semaphore:
+                 loop = asyncio.get_event_loop()
+                 try:
+                     use_chunked_upload = self._requires_chunked_upload(organized_file, config)
+                     uploaded_data_file = await loop.run_in_executor(
+                         None, lambda: client.upload_data_file(organized_file, collection_id, use_chunked_upload)
+                     )
+                     self.context.run.log_data_file(organized_file, UploadStatus.SUCCESS)
+                     return {'status': 'success', 'result': uploaded_data_file}
+                 except ClientError as e:
+                     self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
+                     self.context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Client error: {str(e)}')
+                     return {'status': 'failed', 'error': str(e), 'error_type': 'client_error', 'retryable': True}
+                 except (OSError, IOError) as e:
+                     self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
+                     self.context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'File system error: {str(e)}')
+                     return {'status': 'failed', 'error': str(e), 'error_type': 'file_error', 'retryable': False}
+                 except MemoryError as e:
+                     self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
+                     self.context.run.log_message_with_code(
+                         LogCode.FILE_UPLOAD_FAILED, f'Memory error (file too large): {str(e)}'
+                     )
+                     return {'status': 'failed', 'error': str(e), 'error_type': 'memory_error', 'retryable': False}
+                 except asyncio.TimeoutError as e:
+                     self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
+                     self.context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Upload timeout: {str(e)}')
+                     return {'status': 'failed', 'error': str(e), 'error_type': 'timeout_error', 'retryable': True}
+                 except ValueError as e:
+                     self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
+                     self.context.run.log_message_with_code(
+                         LogCode.FILE_UPLOAD_FAILED, f'Data validation error: {str(e)}'
+                     )
+                     return {'status': 'failed', 'error': str(e), 'error_type': 'validation_error', 'retryable': False}
+                 except Exception as e:
+                     self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
+                     self.context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Unexpected error: {str(e)}')
+                     return {'status': 'failed', 'error': str(e), 'error_type': 'unknown_error', 'retryable': False}
+
+         # Create tasks for all files
+         tasks = [upload_single_file(organized_file) for organized_file in organized_files]
+
+         # Process completed tasks
+         for completed_task in asyncio.as_completed(tasks):
+             result = await completed_task
+             if result['status'] == 'success':
+                 upload_result.append(result['result'])
+             # Failed uploads are logged but don't stop the process
+
+         return upload_result
+
+     def _requires_chunked_upload(self, organized_file: Dict[str, Any], config: UploadConfig) -> bool:
+         """Determine if chunked upload is required based on file sizes."""
+         max_file_size_mb = config.chunked_threshold_mb
+         for file_path in organized_file.get('files', {}).values():
+             if isinstance(file_path, Path) and self._get_file_size_mb(file_path) > max_file_size_mb:
+                 return True
+         return False
+
+     def _get_file_size_mb(self, file_path: Path) -> float:
+         """Get file size in megabytes."""
+         return file_path.stat().st_size / (1024 * 1024)
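The loop-detection dance in `_run_async_upload` is a generic pattern: `asyncio.run()` raises if a loop is already running in the current thread, so the strategy falls back to a dedicated worker thread in that case. A standalone sketch of the same idea, independent of the SDK (all names here are illustrative):

```python
# Generic sketch of the "run a coroutine whether or not a loop is already running"
# pattern used by AsyncUploadStrategy._run_async_upload. Not SDK code.
import asyncio
import concurrent.futures


async def _do_work() -> str:
    await asyncio.sleep(0.01)  # stand-in for the real upload coroutine
    return 'done'


def run_coroutine_anywhere() -> str:
    try:
        # If this succeeds, a loop is already running (e.g. inside a notebook or actor),
        # so asyncio.run() would raise; run the coroutine on a separate thread instead.
        asyncio.get_running_loop()
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
            return executor.submit(lambda: asyncio.run(_do_work())).result()
    except RuntimeError:
        # No loop running in this thread: safe to start one directly.
        return asyncio.run(_do_work())


if __name__ == '__main__':
    print(run_coroutine_anywhere())
```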
synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/sync.py
@@ -0,0 +1,43 @@
+ from pathlib import Path
+ from typing import Any, Dict, List
+
+ from ...enums import LogCode, UploadStatus
+ from ..base import UploadConfig, UploadStrategy
+
+
+ class SyncUploadStrategy(UploadStrategy):
+     """Synchronous upload strategy."""
+
+     def __init__(self, context):
+         self.context = context
+
+     def upload(self, files: List[Dict], config: UploadConfig) -> List[Dict]:
+         """Upload files synchronously."""
+         uploaded_files = []
+         client = self.context.client
+         collection_id = self.context.get_param('data_collection')
+
+         for organized_file in files:
+             try:
+                 use_chunked_upload = self._requires_chunked_upload(organized_file, config)
+                 uploaded_data_file = client.upload_data_file(organized_file, collection_id, use_chunked_upload)
+                 self.context.run.log_data_file(organized_file, UploadStatus.SUCCESS)
+                 uploaded_files.append(uploaded_data_file)
+             except Exception as e:
+                 self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
+                 self.context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, str(e))
+                 # Continue with other files instead of failing completely
+
+         return uploaded_files
+
+     def _requires_chunked_upload(self, organized_file: Dict[str, Any], config: UploadConfig) -> bool:
+         """Determine if chunked upload is required based on file sizes."""
+         max_file_size_mb = config.chunked_threshold_mb
+         for file_path in organized_file.get('files', {}).values():
+             if isinstance(file_path, Path) and self._get_file_size_mb(file_path) > max_file_size_mb:
+                 return True
+         return False
+
+     def _get_file_size_mb(self, file_path: Path) -> float:
+         """Get file size in megabytes."""
+         return file_path.stat().st_size / (1024 * 1024)
synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/__init__.py
@@ -0,0 +1 @@
+ # Validation strategy implementations
synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py
@@ -0,0 +1,45 @@
+ from typing import Dict, List
+
+ from synapse_sdk.clients.validators.collections import FileSpecificationValidator
+
+ from ..base import ValidationResult, ValidationStrategy
+
+
+ class DefaultValidationStrategy(ValidationStrategy):
+     """Default validation strategy for upload operations."""
+
+     def validate_params(self, params: Dict) -> ValidationResult:
+         """Validate action parameters."""
+         errors = []
+
+         # Check required parameters
+         required_params = ['storage', 'data_collection', 'path', 'name']
+         for param in required_params:
+             if param not in params:
+                 errors.append(f'Missing required parameter: {param}')
+
+         # Check parameter types
+         if 'storage' in params and not isinstance(params['storage'], int):
+             errors.append("Parameter 'storage' must be an integer")
+
+         if 'data_collection' in params and not isinstance(params['data_collection'], int):
+             errors.append("Parameter 'data_collection' must be an integer")
+
+         if 'is_recursive' in params and not isinstance(params['is_recursive'], bool):
+             errors.append("Parameter 'is_recursive' must be a boolean")
+
+         return ValidationResult(valid=len(errors) == 0, errors=errors)
+
+     def validate_files(self, files: List[Dict], specs: Dict) -> ValidationResult:
+         """Validate organized files against specifications."""
+         try:
+             validator = FileSpecificationValidator(specs, files)
+             is_valid = validator.validate()
+
+             if is_valid:
+                 return ValidationResult(valid=True)
+             else:
+                 return ValidationResult(valid=False, errors=['File specification validation failed'])
+
+         except Exception as e:
+             return ValidationResult(valid=False, errors=[f'Validation error: {str(e)}'])
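A quick, hedged sketch of how `validate_params` might be exercised directly. The import path and the no-argument constructor are assumptions based on the new file layout in this diff; the parameter keys shown are exactly the ones the strategy checks.

```python
# Hypothetical usage sketch for DefaultValidationStrategy.validate_params.
from synapse_sdk.plugins.categories.upload.actions.upload.strategies.validation.default import (
    DefaultValidationStrategy,  # assumed import path, taken from the file list above
)

strategy = DefaultValidationStrategy()

result = strategy.validate_params({
    'storage': 1,
    'data_collection': 42,
    'path': 'datasets/batch_01',
    'name': 'batch 01 upload',
    'is_recursive': True,
})
print(result.valid)  # True when all required keys are present and correctly typed

bad = strategy.validate_params({'storage': 'not-an-int'})
print(bad.valid, bad.errors)  # False, plus missing-parameter and type errors
```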