synapse-sdk 2025.9.1__py3-none-any.whl → 2025.9.3__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Warning: this version of synapse-sdk has been flagged as potentially problematic.
- synapse_sdk/devtools/docs/docs/api/clients/annotation-mixin.md +378 -0
- synapse_sdk/devtools/docs/docs/api/clients/backend.md +368 -1
- synapse_sdk/devtools/docs/docs/api/clients/core-mixin.md +477 -0
- synapse_sdk/devtools/docs/docs/api/clients/data-collection-mixin.md +422 -0
- synapse_sdk/devtools/docs/docs/api/clients/hitl-mixin.md +554 -0
- synapse_sdk/devtools/docs/docs/api/clients/index.md +391 -0
- synapse_sdk/devtools/docs/docs/api/clients/integration-mixin.md +571 -0
- synapse_sdk/devtools/docs/docs/api/clients/ml-mixin.md +578 -0
- synapse_sdk/devtools/docs/docs/plugins/developing-upload-template.md +1463 -0
- synapse_sdk/devtools/docs/docs/plugins/export-plugins.md +161 -34
- synapse_sdk/devtools/docs/docs/plugins/upload-plugins.md +1497 -213
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/annotation-mixin.md +289 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/backend.md +378 -11
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/core-mixin.md +417 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/data-collection-mixin.md +356 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/hitl-mixin.md +192 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/index.md +391 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/integration-mixin.md +479 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/ml-mixin.md +284 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/developing-upload-template.md +1463 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/export-plugins.md +161 -34
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/upload-plugins.md +1752 -572
- synapse_sdk/devtools/docs/sidebars.ts +7 -0
- synapse_sdk/plugins/README.md +1 -2
- synapse_sdk/plugins/categories/base.py +23 -0
- synapse_sdk/plugins/categories/export/actions/__init__.py +3 -0
- synapse_sdk/plugins/categories/export/actions/export/__init__.py +28 -0
- synapse_sdk/plugins/categories/export/actions/export/action.py +160 -0
- synapse_sdk/plugins/categories/export/actions/export/enums.py +113 -0
- synapse_sdk/plugins/categories/export/actions/export/exceptions.py +53 -0
- synapse_sdk/plugins/categories/export/actions/export/models.py +74 -0
- synapse_sdk/plugins/categories/export/actions/export/run.py +195 -0
- synapse_sdk/plugins/categories/export/actions/export/utils.py +187 -0
- synapse_sdk/plugins/categories/export/templates/plugin/__init__.py +1 -1
- synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +1 -2
- synapse_sdk/plugins/categories/upload/actions/upload/action.py +154 -531
- synapse_sdk/plugins/categories/upload/actions/upload/context.py +185 -0
- synapse_sdk/plugins/categories/upload/actions/upload/factory.py +143 -0
- synapse_sdk/plugins/categories/upload/actions/upload/models.py +66 -29
- synapse_sdk/plugins/categories/upload/actions/upload/orchestrator.py +182 -0
- synapse_sdk/plugins/categories/upload/actions/upload/registry.py +113 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/base.py +106 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/cleanup.py +62 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/collection.py +62 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/generate.py +80 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py +66 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py +101 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py +89 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py +96 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/validate.py +61 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/base.py +86 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/batch.py +39 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/single.py +34 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py +233 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py +238 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/excel.py +174 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/none.py +16 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/async_upload.py +109 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/sync.py +43 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py +45 -0
- synapse_sdk/plugins/categories/upload/actions/upload/utils.py +194 -83
- synapse_sdk/plugins/categories/upload/templates/config.yaml +4 -0
- synapse_sdk/plugins/categories/upload/templates/plugin/__init__.py +269 -0
- synapse_sdk/plugins/categories/upload/templates/plugin/upload.py +71 -27
- synapse_sdk/plugins/models.py +5 -0
- {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/METADATA +2 -1
- {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/RECORD +78 -27
- synapse_sdk/plugins/categories/export/actions/export.py +0 -385
- synapse_sdk/plugins/categories/export/enums.py +0 -7
- {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/WHEEL +0 -0
- {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/entry_points.txt +0 -0
- {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/licenses/LICENSE +0 -0
- {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/top_level.txt +0 -0
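
Taken together, these changes replace the monolithic upload action (action.py drops from 531 lines to 154) with a step pipeline driven by pluggable strategies, coordinated by the new context, factory, registry, and orchestrator modules. The reconstructed hunks for the new step and strategy files follow. As a rough sketch of how such a pipeline composes (the orchestrator itself is not shown in this section, so the names and control flow below are illustrative assumptions, not the SDK's actual implementation):

```python
# Hypothetical sketch of the step-pipeline shape implied by the new modules
# (context.py, orchestrator.py, steps/*); not the SDK's actual orchestrator.
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional


@dataclass
class StepResult:
    """Assumed shape of the StepResult the steps below return."""
    success: bool
    data: Dict[str, Any] = field(default_factory=dict)
    error: Optional[str] = None


class Step:
    """Assumed step contract: name, weight, prerequisites, execute, rollback."""
    name = 'step'
    progress_weight = 0.0

    def validate_prerequisites(self, context: Dict) -> None: ...
    def execute(self, context: Dict) -> StepResult: ...
    def rollback(self, context: Dict) -> None: ...


def run_pipeline(steps: List[Step], context: Dict) -> bool:
    """Run steps in order; on failure, roll back completed steps in reverse."""
    completed: List[Step] = []
    for step in steps:
        step.validate_prerequisites(context)
        result = step.execute(context)
        if not result.success:
            for done in reversed(completed):
                done.rollback(context)
            return False
        completed.append(step)
    return True
```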
synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py (new file, +89 lines):

```diff
@@ -0,0 +1,89 @@
+from ..context import StepResult, UploadContext
+from ..enums import LogCode
+from .base import BaseStep
+
+
+class OrganizeFilesStep(BaseStep):
+    """Organize files according to specifications using file discovery strategy."""
+
+    @property
+    def name(self) -> str:
+        return 'organize_files'
+
+    @property
+    def progress_weight(self) -> float:
+        return 0.15
+
+    def execute(self, context: UploadContext) -> StepResult:
+        """Execute file organization step."""
+        file_discovery_strategy = context.strategies.get('file_discovery')
+        if not file_discovery_strategy:
+            return self.create_error_result('File discovery strategy not found')
+
+        if not context.file_specifications:
+            return self.create_error_result('File specifications not available')
+
+        try:
+            # Create type directories mapping
+            type_dirs = {}
+            for spec in context.file_specifications:
+                spec_name = spec['name']
+                spec_dir = context.pathlib_cwd / spec_name
+                if spec_dir.exists() and spec_dir.is_dir():
+                    type_dirs[spec_name] = spec_dir
+
+            if type_dirs:
+                context.run.log_message_with_code(LogCode.TYPE_DIRECTORIES_FOUND, list(type_dirs.keys()))
+            else:
+                context.run.log_message_with_code(LogCode.NO_TYPE_DIRECTORIES)
+                return self.create_success_result(data={'organized_files': []})
+
+            context.run.log_message_with_code(LogCode.TYPE_STRUCTURE_DETECTED)
+            context.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)
+
+            # Discover files in type directories
+            all_files = []
+            is_recursive = context.get_param('is_recursive', True)
+
+            for spec_name, dir_path in type_dirs.items():
+                files_in_dir = file_discovery_strategy.discover(dir_path, is_recursive)
+                all_files.extend(files_in_dir)
+
+            if not all_files:
+                context.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
+                return self.create_success_result(data={'organized_files': []})
+
+            # Organize files using strategy
+            organized_files = file_discovery_strategy.organize(
+                all_files, context.file_specifications, context.metadata or {}, type_dirs
+            )
+
+            if organized_files:
+                context.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(organized_files))
+                context.add_organized_files(organized_files)
+
+            return self.create_success_result(
+                data={'organized_files': organized_files},
+                rollback_data={'files_count': len(organized_files), 'type_dirs': list(type_dirs.keys())},
+            )
+
+        except Exception as e:
+            return self.create_error_result(f'File organization failed: {str(e)}')
+
+    def can_skip(self, context: UploadContext) -> bool:
+        """File organization cannot be skipped."""
+        return False
+
+    def rollback(self, context: UploadContext) -> None:
+        """Rollback file organization."""
+        # Clear organized files
+        context.organized_files.clear()
+        context.run.log_message('Rolled back file organization')
+
+    def validate_prerequisites(self, context: UploadContext) -> None:
+        """Validate prerequisites for file organization."""
+        if not context.pathlib_cwd:
+            raise ValueError('Working directory path not set')
+
+        if not context.file_specifications:
+            raise ValueError('File specifications not available')
```
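
OrganizeFilesStep leaves the actual grouping to the file-discovery strategy, which pairs files sharing a stem across the per-specification type directories. A worked example of the shapes involved, with hypothetical paths (the entry layout matches what the flat strategy further below produces):

```python
# Illustrative input/output for file organization (hypothetical paths).
# Given type directories named after each file specification:
#
#   work_dir/
#     image/sample_001.jpg
#     label/sample_001.json
#
# organize() pairs files by stem and yields entries shaped like those
# consumed by the later upload step:
organized_file = {
    'files': {
        'image': 'work_dir/image/sample_001.jpg',   # Path objects in practice
        'label': 'work_dir/label/sample_001.json',
    },
    'meta': {
        'origin_file_stem': 'sample_001',
        'origin_file_extension': '.jpg',
        'created_at': '2025-09-03T00:00:00',  # datetime.now().isoformat()
    },
}
```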
synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py (new file, +96 lines):

```diff
@@ -0,0 +1,96 @@
+from synapse_sdk.plugins.exceptions import ActionError
+
+from ..context import StepResult, UploadContext
+from ..enums import LogCode, UploadStatus
+from ..strategies.base import UploadConfig
+from .base import BaseStep
+
+
+class UploadFilesStep(BaseStep):
+    """Upload organized files using upload strategy."""
+
+    @property
+    def name(self) -> str:
+        return 'upload_files'
+
+    @property
+    def progress_weight(self) -> float:
+        return 0.30
+
+    def execute(self, context: UploadContext) -> StepResult:
+        """Execute file upload step."""
+        upload_strategy = context.strategies.get('upload')
+        if not upload_strategy:
+            return self.create_error_result('Upload strategy not found')
+
+        if not context.organized_files:
+            context.run.log_message_with_code(LogCode.NO_FILES_UPLOADED)
+            return self.create_error_result('No organized files to upload')
+
+        try:
+            # Setup progress tracking
+            organized_files_count = len(context.organized_files)
+            context.run.set_progress(0, organized_files_count, category='upload_data_files')
+            context.run.log_message_with_code(LogCode.UPLOADING_DATA_FILES)
+
+            # Initialize metrics
+            context.update_metrics('data_files', {'stand_by': organized_files_count, 'success': 0, 'failed': 0})
+
+            # Create upload configuration
+            upload_config = UploadConfig(
+                use_async=context.get_param('use_async_upload', True),
+                max_concurrent=context.get_param('max_concurrent_uploads', 10),
+                chunked_threshold_mb=context.get_param('max_file_size_mb', 50),
+                batch_size=context.get_param('upload_batch_size', 1),
+            )
+
+            # Execute upload using strategy
+            uploaded_files = upload_strategy.upload(context.organized_files, upload_config)
+
+            # Update context and metrics
+            context.add_uploaded_files(uploaded_files)
+
+            # Log upload results
+            for uploaded_file in uploaded_files:
+                context.run.log_data_file(uploaded_file, UploadStatus.SUCCESS)
+
+            # Update final metrics
+            context.update_metrics(
+                'data_files',
+                {'stand_by': 0, 'success': len(uploaded_files), 'failed': organized_files_count - len(uploaded_files)},
+            )
+
+            # Complete progress
+            context.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
+
+            return self.create_success_result(
+                data={'uploaded_files': uploaded_files},
+                rollback_data={
+                    'uploaded_files_count': len(uploaded_files),
+                    'upload_method': 'async' if upload_config.use_async else 'sync',
+                },
+            )
+
+        except Exception as e:
+            context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, str(e))
+            return self.create_error_result(f'File upload failed: {str(e)}')
+
+    def can_skip(self, context: UploadContext) -> bool:
+        """File upload cannot be skipped."""
+        return False
+
+    def rollback(self, context: UploadContext) -> None:
+        """Rollback file upload."""
+        # In a real implementation, this would delete uploaded files
+        # For now, just clear the uploaded files list and log
+        context.uploaded_files.clear()
+        context.run.log_message('Rolled back file uploads')
+
+    def validate_prerequisites(self, context: UploadContext) -> None:
+        """Validate prerequisites for file upload."""
+        if not context.organized_files:
+            raise ValueError('No organized files available for upload')
+
+        collection_id = context.get_param('data_collection')
+        if collection_id is None:
+            raise ActionError('Data collection parameter is required for upload')
```
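
UploadConfig is filled from action parameters with the defaults shown above. A minimal sketch of that mapping, assuming a plain dict in place of context.get_param():

```python
# Assumed parameter names are taken from the execute() body above.
params = {'use_async_upload': True, 'upload_batch_size': 5}

config = {
    'use_async': params.get('use_async_upload', True),          # picks async vs sync strategy
    'max_concurrent': params.get('max_concurrent_uploads', 10),
    'chunked_threshold_mb': params.get('max_file_size_mb', 50),
    'batch_size': params.get('upload_batch_size', 1),
}
assert config['batch_size'] == 5 and config['max_concurrent'] == 10
```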
synapse_sdk/plugins/categories/upload/actions/upload/steps/validate.py (new file, +61 lines):

```diff
@@ -0,0 +1,61 @@
+from ..context import StepResult, UploadContext
+from ..enums import LogCode
+from .base import BaseStep
+
+
+class ValidateFilesStep(BaseStep):
+    """Validate organized files against specifications."""
+
+    @property
+    def name(self) -> str:
+        return 'validate_files'
+
+    @property
+    def progress_weight(self) -> float:
+        return 0.10
+
+    def execute(self, context: UploadContext) -> StepResult:
+        """Execute file validation step."""
+        validation_strategy = context.strategies.get('validation')
+        if not validation_strategy:
+            return self.create_error_result('Validation strategy not found')
+
+        if not context.organized_files:
+            context.run.log_message_with_code(LogCode.NO_FILES_FOUND)
+            return self.create_error_result('No organized files to validate')
+
+        if not context.file_specifications:
+            return self.create_error_result('File specifications not available')
+
+        try:
+            # Validate organized files against specifications
+            validation_result = validation_strategy.validate_files(context.organized_files, context.file_specifications)
+
+            if not validation_result.valid:
+                context.run.log_message_with_code(LogCode.VALIDATION_FAILED)
+                error_msg = f'File validation failed: {", ".join(validation_result.errors)}'
+                return self.create_error_result(error_msg)
+
+            return self.create_success_result(
+                data={'validation_passed': True}, rollback_data={'validated_files_count': len(context.organized_files)}
+            )
+
+        except Exception as e:
+            return self.create_error_result(f'File validation failed: {str(e)}')
+
+    def can_skip(self, context: UploadContext) -> bool:
+        """File validation cannot be skipped."""
+        return False
+
+    def rollback(self, context: UploadContext) -> None:
+        """Rollback file validation."""
+        # Nothing specific to rollback for validation
+        context.run.log_message('Rolled back file validation')
+
+    def validate_prerequisites(self, context: UploadContext) -> None:
+        """Validate prerequisites for file validation."""
+        if not context.organized_files:
+            raise ValueError('No organized files available for validation')
+
+        if not context.file_specifications:
+            raise ValueError('File specifications not available for validation')
```
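
The step only requires a strategy exposing validate_files() that returns a ValidationResult (defined in strategies/base.py below, where __bool__ mirrors .valid). The shipped implementation lives in strategies/validation/default.py and is not shown in this section; the stub below is a minimal sketch that only checks required specifications are present:

```python
from typing import Dict, List


class ValidationResult:
    """Mirrors the result type from strategies/base.py."""

    def __init__(self, valid: bool, errors: List[str] = None):
        self.valid = valid
        self.errors = errors or []

    def __bool__(self):
        return self.valid


class RequiredSpecsValidation:
    """Hypothetical strategy: every required spec must appear in each entry."""

    def validate_files(self, files: List[Dict], specs: List[Dict]) -> ValidationResult:
        errors = []
        required = [s['name'] for s in specs if s.get('is_required')]
        for entry in files:
            missing = [r for r in required if r not in entry.get('files', {})]
            if missing:
                stem = entry.get('meta', {}).get('origin_file_stem', '?')
                errors.append(f'{stem}: missing {missing}')
        return ValidationResult(not errors, errors)
```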
synapse_sdk/plugins/categories/upload/actions/upload/strategies/__init__.py (new file, +1 line):

```diff
@@ -0,0 +1 @@
+# Strategy pattern implementations for upload actions
```
synapse_sdk/plugins/categories/upload/actions/upload/strategies/base.py (new file, +86 lines):

```diff
@@ -0,0 +1,86 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any, Dict, List
+
+
+class ValidationResult:
+    """Result of validation operations."""
+
+    def __init__(self, valid: bool, errors: List[str] = None):
+        self.valid = valid
+        self.errors = errors or []
+
+    def __bool__(self):
+        return self.valid
+
+
+class ValidationStrategy(ABC):
+    """Strategy interface for validation operations."""
+
+    @abstractmethod
+    def validate_params(self, params: Dict) -> ValidationResult:
+        """Validate action parameters."""
+        pass
+
+    @abstractmethod
+    def validate_files(self, files: List[Dict], specs: Dict) -> ValidationResult:
+        """Validate organized files against specifications."""
+        pass
+
+
+class FileDiscoveryStrategy(ABC):
+    """Strategy interface for file discovery and organization."""
+
+    @abstractmethod
+    def discover(self, path: Path, recursive: bool) -> List[Path]:
+        """Discover files in the given path."""
+        pass
+
+    @abstractmethod
+    def organize(self, files: List[Path], specs: Dict, metadata: Dict, type_dirs: Dict = None) -> List[Dict]:
+        """Organize files according to specifications."""
+        pass
+
+
+class MetadataStrategy(ABC):
+    """Strategy interface for metadata extraction and processing."""
+
+    @abstractmethod
+    def extract(self, source_path: Path) -> Dict[str, Dict[str, Any]]:
+        """Extract metadata from source (e.g., Excel file)."""
+        pass
+
+    @abstractmethod
+    def validate(self, metadata: Dict) -> ValidationResult:
+        """Validate extracted metadata."""
+        pass
+
+
+class UploadConfig:
+    """Configuration for upload operations."""
+
+    def __init__(
+        self, use_async: bool = True, max_concurrent: int = 10, chunked_threshold_mb: int = 50, batch_size: int = 1
+    ):
+        self.use_async = use_async
+        self.max_concurrent = max_concurrent
+        self.chunked_threshold_mb = chunked_threshold_mb
+        self.batch_size = batch_size
+
+
+class UploadStrategy(ABC):
+    """Strategy interface for file upload operations."""
+
+    @abstractmethod
+    def upload(self, files: List[Dict], config: UploadConfig) -> List[Dict]:
+        """Upload files to storage."""
+        pass
+
+
+class DataUnitStrategy(ABC):
+    """Strategy interface for data unit generation."""
+
+    @abstractmethod
+    def generate(self, uploaded_files: List[Dict], batch_size: int) -> List[Dict]:
+        """Generate data units from uploaded files."""
+        pass
```
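
Concrete implementations of these interfaces ship in the sibling packages (flat.py/recursive.py, excel.py/none.py, async_upload.py/sync.py, batch.py/single.py, default.py) and are looked up by the steps under fixed keys in context.strategies. The factory and registry modules are not shown in this section, so the wiring below is an assumption for illustration:

```python
# Hypothetical name-based registry; the real factory.py/registry.py may differ.
from typing import Dict

STRATEGY_REGISTRY: Dict[str, Dict[str, type]] = {
    'file_discovery': {},  # e.g. 'flat' / 'recursive'
    'metadata': {},        # e.g. 'excel' / 'none'
    'upload': {},          # e.g. 'async' / 'sync'
    'data_unit': {},       # e.g. 'batch' / 'single'
    'validation': {},      # e.g. 'default'
}


def register(kind: str, name: str, cls: type) -> None:
    STRATEGY_REGISTRY[kind][name] = cls


def build_strategies(choices: Dict[str, str]) -> Dict[str, object]:
    """Instantiate one strategy per kind, keyed the way the steps look them up."""
    return {kind: STRATEGY_REGISTRY[kind][name]() for kind, name in choices.items()}
```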
synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/__init__.py (new file, +1 line):

```diff
@@ -0,0 +1 @@
+# Data unit strategy implementations
```
synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/batch.py (new file, +39 lines):

```diff
@@ -0,0 +1,39 @@
+from typing import Dict, List
+
+from synapse_sdk.clients.utils import get_batched_list
+
+from ...enums import LogCode, UploadStatus
+from ..base import DataUnitStrategy
+
+
+class BatchDataUnitStrategy(DataUnitStrategy):
+    """Batch data unit generation strategy."""
+
+    def __init__(self, context):
+        self.context = context
+
+    def generate(self, uploaded_files: List[Dict], batch_size: int) -> List[Dict]:
+        """Generate data units in batches."""
+        client = self.context.client
+        generated_data_units = []
+
+        # Use the same batching logic as the legacy implementation
+        batches = get_batched_list(uploaded_files, batch_size)
+
+        for batch in batches:
+            try:
+                created_data_units = client.create_data_units(batch)
+                generated_data_units.extend(created_data_units)
+
+                # Log each created data unit
+                for created_data_unit in created_data_units:
+                    self.context.run.log_data_unit(
+                        created_data_unit['id'], UploadStatus.SUCCESS, data_unit_meta=created_data_unit.get('meta')
+                    )
+            except Exception as e:
+                self.context.run.log_message_with_code(LogCode.DATA_UNIT_BATCH_FAILED, str(e))
+                # Log failed data units
+                for _ in batch:
+                    self.context.run.log_data_unit(None, UploadStatus.FAILED, data_unit_meta=None)
+
+        return generated_data_units
```
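
The batching helper is imported from synapse_sdk.clients.utils; judging from its use here, it partitions the uploaded-file list into consecutive chunks of at most batch_size items, with one create_data_units call per chunk. An assumed pure-Python equivalent:

```python
# Assumption: get_batched_list(items, batch_size) behaves like this helper.
from typing import List


def batched(items: List, batch_size: int) -> List[List]:
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]


assert batched([1, 2, 3, 4, 5], 2) == [[1, 2], [3, 4], [5]]
```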
synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/single.py (new file, +34 lines):

```diff
@@ -0,0 +1,34 @@
+from typing import Dict, List
+
+from ...enums import LogCode, UploadStatus
+from ..base import DataUnitStrategy
+
+
+class SingleDataUnitStrategy(DataUnitStrategy):
+    """Single data unit generation strategy."""
+
+    def __init__(self, context):
+        self.context = context
+
+    def generate(self, uploaded_files: List[Dict], batch_size: int) -> List[Dict]:
+        """Generate data units individually."""
+        client = self.context.client
+        generated_data_units = []
+
+        for uploaded_file in uploaded_files:
+            try:
+                # Create data unit for single file (batch of 1)
+                created_data_units = client.create_data_units([uploaded_file])
+                generated_data_units.extend(created_data_units)
+
+                # Log each created data unit
+                for created_data_unit in created_data_units:
+                    self.context.run.log_data_unit(
+                        created_data_unit['id'], UploadStatus.SUCCESS, data_unit_meta=created_data_unit.get('meta')
+                    )
+            except Exception as e:
+                self.context.run.log_message_with_code(LogCode.DATA_UNIT_BATCH_FAILED, str(e))
+                # Log failed data unit
+                self.context.run.log_data_unit(None, UploadStatus.FAILED, data_unit_meta=None)
+
+        return generated_data_units
```
synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/__init__.py (new file, +1 line):

```diff
@@ -0,0 +1 @@
+# File discovery strategy implementations
```
synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py (new file, +233 lines):

```diff
@@ -0,0 +1,233 @@
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List
+
+from ..base import FileDiscoveryStrategy
+
+
+class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
+    """Non-recursive file discovery strategy."""
+
+    def discover(self, path: Path, recursive: bool) -> List[Path]:
+        """Discover files non-recursively in the given path."""
+        return [file_path for file_path in path.glob('*') if file_path.is_file()]
+
+    def organize(self, files: List[Path], specs: Dict, metadata: Dict, type_dirs: Dict = None) -> List[Dict]:
+        """Organize files according to specifications with metadata."""
+        organized_files = []
+
+        # Use provided type_dirs or create fallback mapping
+        if type_dirs is None:
+            type_dirs = {}
+            for spec in specs:
+                spec_name = spec['name']
+                # Fallback: extract spec directory from file paths
+                for file_path in files:
+                    # Check if this file's path contains the spec_name as a directory
+                    path_parts = file_path.parts
+                    if spec_name in path_parts:
+                        # Find the index of spec_name and reconstruct the path up to that directory
+                        spec_index = path_parts.index(spec_name)
+                        spec_dir = Path(*path_parts[: spec_index + 1])
+                        if spec_dir.exists() and spec_dir.is_dir():
+                            type_dirs[spec_name] = spec_dir
+                            break
+
+        if not type_dirs:
+            return organized_files
+
+        # Performance optimization 2: Build metadata index for faster lookups
+        metadata_index = self._build_metadata_index(metadata)
+
+        # Group files by dataset (flat discovery)
+        dataset_files = {}
+        required_specs = [spec['name'] for spec in specs if spec.get('is_required', False)]
+
+        for file_path in files:
+            # Determine which type directory this file belongs to
+            for spec_name, dir_path in type_dirs.items():
+                if file_path.parent == dir_path:  # Only direct children
+                    file_name = file_path.stem
+
+                    if file_name not in dataset_files:
+                        dataset_files[file_name] = {}
+
+                    if spec_name not in dataset_files[file_name]:
+                        dataset_files[file_name][spec_name] = file_path
+                    else:
+                        # Keep the most recent file - only stat when needed
+                        existing_file = dataset_files[file_name][spec_name]
+                        try:
+                            if file_path.stat().st_mtime > existing_file.stat().st_mtime:
+                                dataset_files[file_name][spec_name] = file_path
+                        except (OSError, IOError):
+                            # If stat fails, keep existing file
+                            pass
+
+        # Create organized files for datasets with all required files
+        for file_name, files_dict in sorted(dataset_files.items()):
+            if all(req in files_dict for req in required_specs):
+                # Calculate most common file extension (optimized)
+                file_extensions = {}
+                for file_path in files_dict.values():
+                    ext = file_path.suffix.lower()
+                    if ext:
+                        file_extensions[ext] = file_extensions.get(ext, 0) + 1
+
+                origin_file_extension = max(file_extensions, key=file_extensions.get) if file_extensions else ''
+
+                meta_data = {
+                    'origin_file_stem': file_name,
+                    'origin_file_extension': origin_file_extension,
+                    'created_at': datetime.now().isoformat(),
+                }
+
+                # Add metadata if available - using optimized index lookup
+                if metadata_index:
+                    matched_metadata = self._find_matching_metadata_optimized(file_name, files_dict, metadata_index)
+                    if matched_metadata:
+                        meta_data.update(matched_metadata)
+
+                organized_files.append({'files': files_dict, 'meta': meta_data})
+
+        return organized_files
+
+    def _build_metadata_index(self, metadata: Dict) -> Dict:
+        """Build metadata index for faster lookups."""
+        if not metadata:
+            return {}
+
+        metadata_index = {'exact_stem': {}, 'exact_name': {}, 'stem_lookup': {}, 'partial_paths': {}, 'full_paths': {}}
+
+        for meta_key, meta_value in metadata.items():
+            meta_path = Path(meta_key)
+
+            # Index by stem
+            stem = meta_path.stem
+            if stem:
+                metadata_index['exact_stem'][stem] = meta_value
+                metadata_index['stem_lookup'][stem] = meta_value
+
+            # Index by full name
+            name = meta_path.name
+            if name:
+                metadata_index['exact_name'][name] = meta_value
+
+            # Index for partial path matching
+            metadata_index['partial_paths'][meta_key] = meta_value
+
+            # Index for full path matching
+            metadata_index['full_paths'][meta_key] = meta_value
+
+        return metadata_index
+
+    def _find_matching_metadata_optimized(self, file_name: str, files_dict: Dict, metadata_index: Dict) -> Dict:
+        """Find matching metadata using optimized index lookups."""
+        if not metadata_index:
+            return {}
+
+        # Strategy 1: Exact stem match (O(1) lookup)
+        if file_name in metadata_index['exact_stem']:
+            return metadata_index['exact_stem'][file_name]
+
+        # Strategy 2: Exact filename match with extension (O(1) lookup)
+        sample_file = list(files_dict.values())[0] if files_dict else None
+        if sample_file:
+            full_filename = f'{file_name}{sample_file.suffix}'
+            if full_filename in metadata_index['exact_name']:
+                return metadata_index['exact_name'][full_filename]
+
+            # Try sample file name
+            sample_filename = sample_file.name
+            if sample_filename in metadata_index['exact_name']:
+                return metadata_index['exact_name'][sample_filename]
+
+        # Strategy 3: Stem lookup (already optimized above)
+        # This is covered by exact_stem lookup
+
+        # Strategy 4 & 5: Partial and full path matching (fallback to original logic for complex cases)
+        if sample_file:
+            file_path_str = str(sample_file)
+            file_path_posix = sample_file.as_posix()
+
+            # Check partial paths
+            for meta_key in metadata_index['partial_paths']:
+                if (
+                    meta_key in file_path_str
+                    or meta_key in file_path_posix
+                    or file_path_str in meta_key
+                    or file_path_posix in meta_key
+                ):
+                    return metadata_index['partial_paths'][meta_key]
+
+        return {}
+
+    def _find_matching_metadata(self, file_name: str, files_dict: Dict, metadata: Dict) -> Dict:
+        """Find matching metadata using comprehensive pattern matching.
+
+        Matching priority:
+        1. Exact stem match (highest priority)
+        2. Exact filename match (with extension)
+        3. Metadata key stem matches file stem
+        4. Partial path matching
+        5. Full path matching
+        """
+        if not metadata:
+            return {}
+
+        # Get sample file for extension and path information
+        sample_file = list(files_dict.values())[0] if files_dict else None
+
+        # Strategy 1: Exact stem match (highest priority)
+        if file_name in metadata:
+            return metadata[file_name]
+
+        # Strategy 2: Exact filename match (with extension)
+        if sample_file:
+            full_filename = f'{file_name}{sample_file.suffix}'
+            if full_filename in metadata:
+                return metadata[full_filename]
+
+            # Also try with sample file name
+            sample_filename = sample_file.name
+            if sample_filename in metadata:
+                return metadata[sample_filename]
+
+        # Strategy 3: Metadata key stem matches file stem
+        for meta_key in metadata.keys():
+            meta_stem = Path(meta_key).stem
+            if meta_stem == file_name:
+                return metadata[meta_key]
+
+        # Strategy 4: Partial path matching
+        if sample_file:
+            file_path_parts = sample_file.parts
+            for meta_key in metadata.keys():
+                meta_path = Path(meta_key)
+                # Check if any part of the metadata key matches our file path parts
+                for part in file_path_parts:
+                    if part in str(meta_path) or str(meta_path) in part:
+                        # Additional validation: ensure it's a reasonable match
+                        if meta_path.stem == file_name or meta_path.name == sample_file.name or part == meta_path.stem:
+                            return metadata[meta_key]
+
+        # Strategy 5: Full path matching
+        if sample_file:
+            full_path_str = str(sample_file)
+            full_path_posix = sample_file.as_posix()
+
+            for meta_key in metadata.keys():
+                # Direct path match
+                if meta_key == full_path_str or meta_key == full_path_posix:
+                    return metadata[meta_key]
+
+                # Relative path match (check if meta_key is contained in our path)
+                if meta_key in full_path_str or meta_key in full_path_posix:
+                    return metadata[meta_key]
+
+                # Reverse match (check if our path is contained in meta_key)
+                if full_path_str in meta_key or full_path_posix in meta_key:
+                    return metadata[meta_key]
+
+        # No match found
+        return {}
```
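
Because _build_metadata_index registers every key under both its Path stem and its full name, the common cases (strategies 1-3) resolve with single dict lookups; the partial- and full-path scans only run when neither stem nor filename matches. A small worked example with hypothetical keys:

```python
from pathlib import Path

# Hypothetical metadata keyed three different ways, as users might supply it.
metadata = {
    'sample_001': {'camera': 'front'},
    'sample_002.jpg': {'camera': 'rear'},
    'scenes/day/sample_003.jpg': {'camera': 'side'},
}

# The index maps every key's stem (and name) to its value, so all three
# resolve with one lookup, regardless of how the key was written.
exact_stem = {Path(k).stem: v for k, v in metadata.items()}
assert exact_stem['sample_001'] == {'camera': 'front'}
assert exact_stem['sample_002'] == {'camera': 'rear'}
assert exact_stem['sample_003'] == {'camera': 'side'}
```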