synapse-sdk 2025.9.1__py3-none-any.whl → 2025.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synapse-sdk might be problematic.
- synapse_sdk/devtools/docs/docs/api/clients/annotation-mixin.md +378 -0
- synapse_sdk/devtools/docs/docs/api/clients/backend.md +368 -1
- synapse_sdk/devtools/docs/docs/api/clients/core-mixin.md +477 -0
- synapse_sdk/devtools/docs/docs/api/clients/data-collection-mixin.md +422 -0
- synapse_sdk/devtools/docs/docs/api/clients/hitl-mixin.md +554 -0
- synapse_sdk/devtools/docs/docs/api/clients/index.md +391 -0
- synapse_sdk/devtools/docs/docs/api/clients/integration-mixin.md +571 -0
- synapse_sdk/devtools/docs/docs/api/clients/ml-mixin.md +578 -0
- synapse_sdk/devtools/docs/docs/plugins/developing-upload-template.md +1463 -0
- synapse_sdk/devtools/docs/docs/plugins/export-plugins.md +161 -34
- synapse_sdk/devtools/docs/docs/plugins/upload-plugins.md +1497 -213
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/annotation-mixin.md +289 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/backend.md +378 -11
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/core-mixin.md +417 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/data-collection-mixin.md +356 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/hitl-mixin.md +192 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/index.md +391 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/integration-mixin.md +479 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/ml-mixin.md +284 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/developing-upload-template.md +1463 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/export-plugins.md +161 -34
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/upload-plugins.md +1752 -572
- synapse_sdk/devtools/docs/sidebars.ts +7 -0
- synapse_sdk/plugins/README.md +1 -2
- synapse_sdk/plugins/categories/base.py +23 -0
- synapse_sdk/plugins/categories/export/actions/__init__.py +3 -0
- synapse_sdk/plugins/categories/export/actions/export/__init__.py +28 -0
- synapse_sdk/plugins/categories/export/actions/export/action.py +160 -0
- synapse_sdk/plugins/categories/export/actions/export/enums.py +113 -0
- synapse_sdk/plugins/categories/export/actions/export/exceptions.py +53 -0
- synapse_sdk/plugins/categories/export/actions/export/models.py +74 -0
- synapse_sdk/plugins/categories/export/actions/export/run.py +195 -0
- synapse_sdk/plugins/categories/export/actions/export/utils.py +187 -0
- synapse_sdk/plugins/categories/export/templates/plugin/__init__.py +1 -1
- synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +1 -2
- synapse_sdk/plugins/categories/upload/actions/upload/action.py +154 -531
- synapse_sdk/plugins/categories/upload/actions/upload/context.py +185 -0
- synapse_sdk/plugins/categories/upload/actions/upload/factory.py +143 -0
- synapse_sdk/plugins/categories/upload/actions/upload/models.py +66 -29
- synapse_sdk/plugins/categories/upload/actions/upload/orchestrator.py +182 -0
- synapse_sdk/plugins/categories/upload/actions/upload/registry.py +113 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/base.py +106 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/cleanup.py +62 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/collection.py +62 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/generate.py +80 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py +66 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py +101 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py +89 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py +96 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/validate.py +61 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/base.py +86 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/batch.py +39 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/single.py +34 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py +233 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py +238 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/excel.py +174 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/none.py +16 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/async_upload.py +109 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/sync.py +43 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py +45 -0
- synapse_sdk/plugins/categories/upload/actions/upload/utils.py +194 -83
- synapse_sdk/plugins/categories/upload/templates/config.yaml +4 -0
- synapse_sdk/plugins/categories/upload/templates/plugin/__init__.py +269 -0
- synapse_sdk/plugins/categories/upload/templates/plugin/upload.py +71 -27
- synapse_sdk/plugins/models.py +5 -0
- {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/METADATA +2 -1
- {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/RECORD +78 -27
- synapse_sdk/plugins/categories/export/actions/export.py +0 -385
- synapse_sdk/plugins/categories/export/enums.py +0 -7
- {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/WHEEL +0 -0
- {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/entry_points.txt +0 -0
- {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/licenses/LICENSE +0 -0
- {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/top_level.txt +0 -0

synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py
@@ -0,0 +1,238 @@

```python
from datetime import datetime
from pathlib import Path
from typing import Dict, List

from ..base import FileDiscoveryStrategy


class RecursiveFileDiscoveryStrategy(FileDiscoveryStrategy):
    """Recursive file discovery strategy."""

    def discover(self, path: Path, recursive: bool) -> List[Path]:
        """Discover files recursively in the given path."""
        return [file_path for file_path in path.rglob('*') if file_path.is_file()]

    def organize(self, files: List[Path], specs: Dict, metadata: Dict, type_dirs: Dict = None) -> List[Dict]:
        """Organize files according to specifications with metadata."""
        organized_files = []

        # Use provided type_dirs or create fallback mapping
        if type_dirs is None:
            type_dirs = {}
            for spec in specs:
                spec_name = spec['name']
                # Fallback: extract spec directory from file paths
                for file_path in files:
                    # Check if this file's path contains the spec_name as a directory
                    path_parts = file_path.parts
                    if spec_name in path_parts:
                        # Find the index of spec_name and reconstruct the path up to that directory
                        spec_index = path_parts.index(spec_name)
                        spec_dir = Path(*path_parts[: spec_index + 1])
                        if spec_dir.exists() and spec_dir.is_dir():
                            type_dirs[spec_name] = spec_dir
                            break

        if not type_dirs:
            return organized_files

        # Performance optimization 1: Path caching - avoid repeated string conversions
        path_cache = {dir_path: str(dir_path) for dir_path in type_dirs.values()}

        # Performance optimization 2: Build metadata index for faster lookups
        metadata_index = self._build_metadata_index(metadata)

        # Group files by dataset
        dataset_files = {}
        required_specs = [spec['name'] for spec in specs if spec.get('is_required', False)]

        for file_path in files:
            # Determine which type directory this file belongs to
            file_path_str = str(file_path)
            for spec_name, dir_path in type_dirs.items():
                dir_path_str = path_cache[dir_path]
                if file_path_str.startswith(dir_path_str):
                    file_name = file_path.stem

                    if file_name not in dataset_files:
                        dataset_files[file_name] = {}

                    if spec_name not in dataset_files[file_name]:
                        dataset_files[file_name][spec_name] = file_path
                    else:
                        # Keep the most recent file - only stat when needed
                        existing_file = dataset_files[file_name][spec_name]
                        try:
                            if file_path.stat().st_mtime > existing_file.stat().st_mtime:
                                dataset_files[file_name][spec_name] = file_path
                        except (OSError, IOError):
                            # If stat fails, keep existing file
                            pass

        # Create organized files for datasets with all required files
        for file_name, files_dict in sorted(dataset_files.items()):
            if all(req in files_dict for req in required_specs):
                # Calculate most common file extension (optimized)
                file_extensions = {}
                for file_path in files_dict.values():
                    ext = file_path.suffix.lower()
                    if ext:
                        file_extensions[ext] = file_extensions.get(ext, 0) + 1

                origin_file_extension = max(file_extensions, key=file_extensions.get) if file_extensions else ''

                meta_data = {
                    'origin_file_stem': file_name,
                    'origin_file_extension': origin_file_extension,
                    'created_at': datetime.now().isoformat(),
                }

                # Add metadata if available - using optimized index lookup
                if metadata_index:
                    matched_metadata = self._find_matching_metadata_optimized(file_name, files_dict, metadata_index)
                    if matched_metadata:
                        meta_data.update(matched_metadata)

                organized_files.append({'files': files_dict, 'meta': meta_data})

        return organized_files

    def _build_metadata_index(self, metadata: Dict) -> Dict:
        """Build metadata index for faster lookups."""
        if not metadata:
            return {}

        metadata_index = {'exact_stem': {}, 'exact_name': {}, 'stem_lookup': {}, 'partial_paths': {}, 'full_paths': {}}

        for meta_key, meta_value in metadata.items():
            meta_path = Path(meta_key)

            # Index by stem
            stem = meta_path.stem
            if stem:
                metadata_index['exact_stem'][stem] = meta_value
                metadata_index['stem_lookup'][stem] = meta_value

            # Index by full name
            name = meta_path.name
            if name:
                metadata_index['exact_name'][name] = meta_value

            # Index for partial path matching
            metadata_index['partial_paths'][meta_key] = meta_value

            # Index for full path matching
            metadata_index['full_paths'][meta_key] = meta_value

        return metadata_index

    def _find_matching_metadata_optimized(self, file_name: str, files_dict: Dict, metadata_index: Dict) -> Dict:
        """Find matching metadata using optimized index lookups."""
        if not metadata_index:
            return {}

        # Strategy 1: Exact stem match (O(1) lookup)
        if file_name in metadata_index['exact_stem']:
            return metadata_index['exact_stem'][file_name]

        # Strategy 2: Exact filename match with extension (O(1) lookup)
        sample_file = list(files_dict.values())[0] if files_dict else None
        if sample_file:
            full_filename = f'{file_name}{sample_file.suffix}'
            if full_filename in metadata_index['exact_name']:
                return metadata_index['exact_name'][full_filename]

            # Try sample file name
            sample_filename = sample_file.name
            if sample_filename in metadata_index['exact_name']:
                return metadata_index['exact_name'][sample_filename]

        # Strategy 3: Stem lookup (already optimized above)
        # This is covered by exact_stem lookup

        # Strategy 4 & 5: Partial and full path matching (fallback to original logic for complex cases)
        if sample_file:
            file_path_str = str(sample_file)
            file_path_posix = sample_file.as_posix()

            # Check partial paths
            for meta_key in metadata_index['partial_paths']:
                if (
                    meta_key in file_path_str
                    or meta_key in file_path_posix
                    or file_path_str in meta_key
                    or file_path_posix in meta_key
                ):
                    return metadata_index['partial_paths'][meta_key]

        return {}

    def _find_matching_metadata(self, file_name: str, files_dict: Dict, metadata: Dict) -> Dict:
        """Find matching metadata using comprehensive pattern matching.

        Matching priority:
        1. Exact stem match (highest priority)
        2. Exact filename match (with extension)
        3. Metadata key stem matches file stem
        4. Partial path matching
        5. Full path matching
        """
        if not metadata:
            return {}

        # Get sample file for extension and path information
        sample_file = list(files_dict.values())[0] if files_dict else None

        # Strategy 1: Exact stem match (highest priority)
        if file_name in metadata:
            return metadata[file_name]

        # Strategy 2: Exact filename match (with extension)
        if sample_file:
            full_filename = f'{file_name}{sample_file.suffix}'
            if full_filename in metadata:
                return metadata[full_filename]

            # Also try with sample file name
            sample_filename = sample_file.name
            if sample_filename in metadata:
                return metadata[sample_filename]

        # Strategy 3: Metadata key stem matches file stem
        for meta_key in metadata.keys():
            meta_stem = Path(meta_key).stem
            if meta_stem == file_name:
                return metadata[meta_key]

        # Strategy 4: Partial path matching
        if sample_file:
            file_path_parts = sample_file.parts
            for meta_key in metadata.keys():
                meta_path = Path(meta_key)
                # Check if any part of the metadata key matches our file path parts
                for part in file_path_parts:
                    if part in str(meta_path) or str(meta_path) in part:
                        # Additional validation: ensure it's a reasonable match
                        if meta_path.stem == file_name or meta_path.name == sample_file.name or part == meta_path.stem:
                            return metadata[meta_key]

        # Strategy 5: Full path matching
        if sample_file:
            full_path_str = str(sample_file)
            full_path_posix = sample_file.as_posix()

            for meta_key in metadata.keys():
                # Direct path match
                if meta_key == full_path_str or meta_key == full_path_posix:
                    return metadata[meta_key]

                # Relative path match (check if meta_key is contained in our path)
                if meta_key in full_path_str or meta_key in full_path_posix:
                    return metadata[meta_key]

                # Reverse match (check if our path is contained in meta_key)
                if full_path_str in meta_key or full_path_posix in meta_key:
                    return metadata[meta_key]

        # No match found
        return {}
```
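
For orientation, a minimal usage sketch follows, assuming the strategy can be instantiated without arguments and that each spec maps to a subdirectory named after it; the paths, spec names, and import module path (taken from the file list above) are illustrative, not prescribed by the SDK docs:

```python
from pathlib import Path

from synapse_sdk.plugins.categories.upload.actions.upload.strategies.file_discovery.recursive import (
    RecursiveFileDiscoveryStrategy,
)

strategy = RecursiveFileDiscoveryStrategy()
root = Path('/data/my_dataset')  # illustrative dataset root with image/ and label/ subdirectories

# Spec entries use the same keys organize() reads: 'name' and 'is_required'.
specs = [
    {'name': 'image', 'is_required': True},
    {'name': 'label', 'is_required': False},
]

files = strategy.discover(root, recursive=True)           # every file under root, via rglob
organized = strategy.organize(files, specs, metadata={})  # groups same-stem files per spec

for item in organized:
    print(item['meta']['origin_file_stem'], sorted(item['files']))
```

Each entry returned by organize() is a {'files': ..., 'meta': ...} pair, which is the shape the upload strategies further down consume.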

synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/__init__.py
@@ -0,0 +1 @@

```python
# Metadata strategy implementations
```

synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/excel.py
@@ -0,0 +1,174 @@

```python
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, Optional

from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException

from ...exceptions import ExcelParsingError, ExcelSecurityError
from ...utils import ExcelMetadataUtils, ExcelSecurityConfig
from ..base import MetadataStrategy, ValidationResult


class ExcelMetadataStrategy(MetadataStrategy):
    """Excel metadata extraction strategy."""

    def __init__(self):
        self.excel_config = ExcelSecurityConfig()
        self.excel_utils = ExcelMetadataUtils(self.excel_config)

    def extract(self, source_path: Path) -> Dict[str, Dict[str, Any]]:
        """Extract metadata from Excel file."""
        try:
            excel_stream = self._prepare_excel_file(source_path)
            workbook = load_workbook(excel_stream, read_only=True, data_only=True)
            try:
                return self._process_excel_worksheet(workbook.active)
            finally:
                workbook.close()

        except ExcelSecurityError:
            raise
        except ExcelParsingError:
            raise
        except InvalidFileException as e:
            raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
        except MemoryError:
            raise ExcelSecurityError('Excel file exceeds memory limits')
        except (OSError, IOError) as e:
            raise ExcelParsingError(f'File access error: {str(e)}')
        except Exception as e:
            # Handle ZIP file errors (Excel files are ZIP archives)
            if 'zip file' in str(e).lower():
                raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
            raise ExcelParsingError(f'Unexpected error: {str(e)}')

    def validate(self, metadata: Dict) -> ValidationResult:
        """Validate extracted metadata."""
        errors = []

        if not isinstance(metadata, dict):
            errors.append('Metadata must be a dictionary')
            return ValidationResult(valid=False, errors=errors)

        # Validate each file's metadata
        for file_name, file_metadata in metadata.items():
            if not isinstance(file_metadata, dict):
                errors.append(f"Metadata for file '{file_name}' must be a dictionary")
                continue

            # Check filename length
            if not self.excel_utils.is_valid_filename_length(file_name):
                errors.append(f"Filename '{file_name}' exceeds maximum length")

        return ValidationResult(valid=len(errors) == 0, errors=errors)

    def _validate_excel_security(self, excel_path: Path) -> None:
        """Validate Excel file security constraints."""
        file_size = excel_path.stat().st_size
        if file_size > self.excel_config.MAX_FILE_SIZE_BYTES:
            raise ExcelSecurityError(
                f'Excel file too large: {file_size} bytes (max: {self.excel_config.MAX_FILE_SIZE_BYTES})'
            )

        estimated_memory = file_size * 3
        if estimated_memory > self.excel_config.MAX_MEMORY_USAGE_BYTES:
            raise ExcelSecurityError(
                f'Excel file may consume too much memory: ~{estimated_memory} bytes '
                f'(max: {self.excel_config.MAX_MEMORY_USAGE_BYTES})'
            )

    def _prepare_excel_file(self, excel_path: Path) -> BytesIO:
        """Prepare Excel file for processing."""
        self._validate_excel_security(excel_path)
        excel_bytes = excel_path.read_bytes()
        return BytesIO(excel_bytes)

    def _process_excel_headers(self, headers: tuple) -> tuple:
        """Process Excel headers."""
        if len(headers) < 2:
            raise ExcelParsingError('Excel file must have at least 2 columns (file name and metadata)')

        # Validate first column header (filename column)
        first_header = str(headers[0]).strip().lower() if headers[0] else ''
        valid_filename_headers = ['filename', 'file_name']

        if first_header not in valid_filename_headers:
            raise ExcelParsingError(
                f'First column header must be "filename" or "file_name", got: "{headers[0]}". '
                f'Valid options: {", ".join(valid_filename_headers)}'
            )

        self._validate_excel_content(headers, 0)
        return headers

    def _process_excel_data_row(self, row: tuple, headers: tuple) -> Optional[Dict[str, Any]]:
        """Process a single Excel data row."""
        if not row[0] or str(row[0]).strip() == '':
            return None

        file_name = str(row[0]).strip()
        if not self.excel_utils.is_valid_filename_length(file_name):
            return None

        file_metadata: Dict[str, Any] = {}
        for i, value in enumerate(row[1:], start=1):
            if i < len(headers):  # Include empty strings, exclude only None values
                header_value = headers[i]
                column_name = str(header_value).strip() if header_value is not None else f'column_{i}'

                column_name = self.excel_utils.validate_and_truncate_string(
                    column_name, self.excel_config.MAX_COLUMN_NAME_LENGTH
                )

                # Convert None to empty string, otherwise convert to string
                value_str = '' if value is None else str(value)
                str_value = self.excel_utils.validate_and_truncate_string(
                    value_str, self.excel_config.MAX_METADATA_VALUE_LENGTH
                )
                file_metadata[column_name] = str_value

        return {file_name: file_metadata} if file_metadata else None

    def _process_excel_worksheet(self, worksheet) -> Dict[str, Dict[str, Any]]:
        """Process Excel worksheet."""
        if worksheet is None:
            raise ExcelParsingError('Excel file has no active worksheet')

        metadata_dict: Dict[str, Dict[str, Any]] = {}
        headers: Optional[tuple] = None
        data_row_count = 0
        validation_interval = getattr(self.excel_config, 'VALIDATION_CHECK_INTERVAL', 1000)

        for row_idx, row in enumerate(worksheet.iter_rows(values_only=True)):
            # Quick check: if row is empty or first cell is empty, skip
            if not row or not row[0] or str(row[0]).strip() == '':
                continue

            if row_idx == 0:
                headers = self._process_excel_headers(row)
                continue

            if headers is None:
                raise ExcelParsingError('Excel file missing header row')

            data_row_count += 1

            if data_row_count % validation_interval == 0:
                self._validate_excel_content(headers, data_row_count)

            row_result = self._process_excel_data_row(row, headers)
            if row_result:
                metadata_dict.update(row_result)

        self._validate_excel_content(headers or (), data_row_count)

        return metadata_dict

    def _validate_excel_content(self, headers: tuple, row_count: int) -> None:
        """Validate Excel content constraints."""
        if len(headers) > self.excel_config.MAX_COLUMNS:
            raise ExcelParsingError(f'Too many columns: {len(headers)} (max: {self.excel_config.MAX_COLUMNS})')

        if row_count > self.excel_config.MAX_ROWS:
            raise ExcelParsingError(f'Too many rows: {row_count} (max: {self.excel_config.MAX_ROWS})')
```
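
A short usage sketch, assuming a spreadsheet whose first column header is "filename" (as the parser requires) and whose remaining columns hold per-file attributes; the path, column names, and the assumption that ValidationResult exposes valid/errors attributes are illustrative:

```python
from pathlib import Path

from synapse_sdk.plugins.categories.upload.actions.upload.strategies.metadata.excel import (
    ExcelMetadataStrategy,
)

strategy = ExcelMetadataStrategy()

# extract() enforces the size/memory limits above and raises ExcelSecurityError /
# ExcelParsingError for oversized or malformed workbooks.
metadata = strategy.extract(Path('/data/my_dataset/meta.xlsx'))

result = strategy.validate(metadata)
if result.valid:
    # e.g. {'img_0001.jpg': {'camera': 'front', 'split': 'train'}, ...}
    print(f'{len(metadata)} metadata rows loaded')
else:
    print('Metadata problems:', result.errors)
```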

synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/none.py
@@ -0,0 +1,16 @@

```python
from pathlib import Path
from typing import Any, Dict

from ..base import MetadataStrategy, ValidationResult


class NoneMetadataStrategy(MetadataStrategy):
    """Metadata strategy that returns no metadata."""

    def extract(self, source_path: Path) -> Dict[str, Dict[str, Any]]:
        """Return empty metadata."""
        return {}

    def validate(self, metadata: Dict) -> ValidationResult:
        """Always validates successfully for empty metadata."""
        return ValidationResult(valid=True)
```

synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/__init__.py
@@ -0,0 +1 @@

```python
# Upload strategy implementations
```

synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/async_upload.py
@@ -0,0 +1,109 @@

```python
import asyncio
from pathlib import Path
from typing import Any, Dict, List

from synapse_sdk.clients.exceptions import ClientError

from ...enums import LogCode, UploadStatus
from ..base import UploadConfig, UploadStrategy


class AsyncUploadStrategy(UploadStrategy):
    """Asynchronous upload strategy."""

    def __init__(self, context):
        self.context = context

    def upload(self, files: List[Dict], config: UploadConfig) -> List[Dict]:
        """Upload files asynchronously."""
        # Use the run_async helper from the action
        return self._run_async_upload(files, config)

    def _run_async_upload(self, files: List[Dict], config: UploadConfig) -> List[Dict]:
        """Run async upload using asyncio."""
        import concurrent.futures

        def _run_in_thread():
            return asyncio.run(self._upload_files_async(files, config))

        try:
            # Check if we're already in an event loop
            asyncio.get_running_loop()
            # If we are, run in a separate thread
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                future = executor.submit(_run_in_thread)
                return future.result()
        except RuntimeError:
            # No event loop running, safe to run directly
            return asyncio.run(self._upload_files_async(files, config))

    async def _upload_files_async(self, organized_files: List[Dict], config: UploadConfig) -> List[Dict]:
        """Upload files asynchronously with concurrency control."""
        client = self.context.client
        collection_id = self.context.get_param('data_collection')
        upload_result = []

        semaphore = asyncio.Semaphore(config.max_concurrent)

        async def upload_single_file(organized_file):
            async with semaphore:
                loop = asyncio.get_event_loop()
                try:
                    use_chunked_upload = self._requires_chunked_upload(organized_file, config)
                    uploaded_data_file = await loop.run_in_executor(
                        None, lambda: client.upload_data_file(organized_file, collection_id, use_chunked_upload)
                    )
                    self.context.run.log_data_file(organized_file, UploadStatus.SUCCESS)
                    return {'status': 'success', 'result': uploaded_data_file}
                except ClientError as e:
                    self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
                    self.context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Client error: {str(e)}')
                    return {'status': 'failed', 'error': str(e), 'error_type': 'client_error', 'retryable': True}
                except (OSError, IOError) as e:
                    self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
                    self.context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'File system error: {str(e)}')
                    return {'status': 'failed', 'error': str(e), 'error_type': 'file_error', 'retryable': False}
                except MemoryError as e:
                    self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
                    self.context.run.log_message_with_code(
                        LogCode.FILE_UPLOAD_FAILED, f'Memory error (file too large): {str(e)}'
                    )
                    return {'status': 'failed', 'error': str(e), 'error_type': 'memory_error', 'retryable': False}
                except asyncio.TimeoutError as e:
                    self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
                    self.context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Upload timeout: {str(e)}')
                    return {'status': 'failed', 'error': str(e), 'error_type': 'timeout_error', 'retryable': True}
                except ValueError as e:
                    self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
                    self.context.run.log_message_with_code(
                        LogCode.FILE_UPLOAD_FAILED, f'Data validation error: {str(e)}'
                    )
                    return {'status': 'failed', 'error': str(e), 'error_type': 'validation_error', 'retryable': False}
                except Exception as e:
                    self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
                    self.context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Unexpected error: {str(e)}')
                    return {'status': 'failed', 'error': str(e), 'error_type': 'unknown_error', 'retryable': False}

        # Create tasks for all files
        tasks = [upload_single_file(organized_file) for organized_file in organized_files]

        # Process completed tasks
        for completed_task in asyncio.as_completed(tasks):
            result = await completed_task
            if result['status'] == 'success':
                upload_result.append(result['result'])
            # Failed uploads are logged but don't stop the process

        return upload_result

    def _requires_chunked_upload(self, organized_file: Dict[str, Any], config: UploadConfig) -> bool:
        """Determine if chunked upload is required based on file sizes."""
        max_file_size_mb = config.chunked_threshold_mb
        for file_path in organized_file.get('files', {}).values():
            if isinstance(file_path, Path) and self._get_file_size_mb(file_path) > max_file_size_mb:
                return True
        return False

    def _get_file_size_mb(self, file_path: Path) -> float:
        """Get file size in megabytes."""
        return file_path.stat().st_size / (1024 * 1024)
```
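
To make the context contract these upload strategies rely on more concrete, here is a rough, self-contained sketch using stand-in objects; the attribute names (client, run, get_param, max_concurrent, chunked_threshold_mb) are read off the code above, while the stub values are purely illustrative of the real context and UploadConfig objects built elsewhere in the action:

```python
from types import SimpleNamespace

from synapse_sdk.plugins.categories.upload.actions.upload.strategies.upload.async_upload import (
    AsyncUploadStrategy,
)

# Stand-ins for the real backend client, run logger, context, and UploadConfig.
fake_client = SimpleNamespace(
    upload_data_file=lambda organized_file, collection_id, chunked: {'id': 1}
)
fake_run = SimpleNamespace(
    log_data_file=lambda organized_file, status: None,
    log_message_with_code=lambda code, message: None,
)
fake_context = SimpleNamespace(
    client=fake_client,
    run=fake_run,
    get_param=lambda name: {'data_collection': 42}[name],
)
fake_config = SimpleNamespace(max_concurrent=4, chunked_threshold_mb=100)

strategy = AsyncUploadStrategy(fake_context)
results = strategy.upload([{'files': {}, 'meta': {}}], fake_config)
print(results)  # [{'id': 1}] when the single upload succeeds
```

The event-loop guard in _run_async_upload is what lets the same strategy be called both from plain synchronous code and from inside an already running loop (for example a notebook), where a bare asyncio.run() would otherwise raise.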

synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/sync.py
@@ -0,0 +1,43 @@

```python
from pathlib import Path
from typing import Any, Dict, List

from ...enums import LogCode, UploadStatus
from ..base import UploadConfig, UploadStrategy


class SyncUploadStrategy(UploadStrategy):
    """Synchronous upload strategy."""

    def __init__(self, context):
        self.context = context

    def upload(self, files: List[Dict], config: UploadConfig) -> List[Dict]:
        """Upload files synchronously."""
        uploaded_files = []
        client = self.context.client
        collection_id = self.context.get_param('data_collection')

        for organized_file in files:
            try:
                use_chunked_upload = self._requires_chunked_upload(organized_file, config)
                uploaded_data_file = client.upload_data_file(organized_file, collection_id, use_chunked_upload)
                self.context.run.log_data_file(organized_file, UploadStatus.SUCCESS)
                uploaded_files.append(uploaded_data_file)
            except Exception as e:
                self.context.run.log_data_file(organized_file, UploadStatus.FAILED)
                self.context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, str(e))
                # Continue with other files instead of failing completely

        return uploaded_files

    def _requires_chunked_upload(self, organized_file: Dict[str, Any], config: UploadConfig) -> bool:
        """Determine if chunked upload is required based on file sizes."""
        max_file_size_mb = config.chunked_threshold_mb
        for file_path in organized_file.get('files', {}).values():
            if isinstance(file_path, Path) and self._get_file_size_mb(file_path) > max_file_size_mb:
                return True
        return False

    def _get_file_size_mb(self, file_path: Path) -> float:
        """Get file size in megabytes."""
        return file_path.stat().st_size / (1024 * 1024)
```

synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/__init__.py
@@ -0,0 +1 @@

```python
# Validation strategy implementations
```

synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py
@@ -0,0 +1,45 @@

```python
from typing import Dict, List

from synapse_sdk.clients.validators.collections import FileSpecificationValidator

from ..base import ValidationResult, ValidationStrategy


class DefaultValidationStrategy(ValidationStrategy):
    """Default validation strategy for upload operations."""

    def validate_params(self, params: Dict) -> ValidationResult:
        """Validate action parameters."""
        errors = []

        # Check required parameters
        required_params = ['storage', 'data_collection', 'path', 'name']
        for param in required_params:
            if param not in params:
                errors.append(f'Missing required parameter: {param}')

        # Check parameter types
        if 'storage' in params and not isinstance(params['storage'], int):
            errors.append("Parameter 'storage' must be an integer")

        if 'data_collection' in params and not isinstance(params['data_collection'], int):
            errors.append("Parameter 'data_collection' must be an integer")

        if 'is_recursive' in params and not isinstance(params['is_recursive'], bool):
            errors.append("Parameter 'is_recursive' must be a boolean")

        return ValidationResult(valid=len(errors) == 0, errors=errors)

    def validate_files(self, files: List[Dict], specs: Dict) -> ValidationResult:
        """Validate organized files against specifications."""
        try:
            validator = FileSpecificationValidator(specs, files)
            is_valid = validator.validate()

            if is_valid:
                return ValidationResult(valid=True)
            else:
                return ValidationResult(valid=False, errors=['File specification validation failed'])

        except Exception as e:
            return ValidationResult(valid=False, errors=[f'Validation error: {str(e)}'])
```
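
Finally, a small sketch of how these parameter checks behave, assuming the base ValidationStrategy needs no constructor arguments and that ValidationResult exposes valid and errors attributes; the parameter values are illustrative only:

```python
from synapse_sdk.plugins.categories.upload.actions.upload.strategies.validation.default import (
    DefaultValidationStrategy,
)

params = {
    'storage': 3,              # must be an int
    'data_collection': 42,     # must be an int
    'path': '/data/my_dataset',
    'name': 'september-import',
    'is_recursive': True,      # optional, but must be a bool when present
}

result = DefaultValidationStrategy().validate_params(params)
print(result.valid)   # True: all required keys present with valid types
print(result.errors)  # [] here, or e.g. ["Parameter 'storage' must be an integer"] on bad input
```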