synapse-sdk 2025.9.1__py3-none-any.whl → 2025.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of synapse-sdk might be problematic.

Files changed (80)
  1. synapse_sdk/devtools/docs/docs/api/clients/annotation-mixin.md +378 -0
  2. synapse_sdk/devtools/docs/docs/api/clients/backend.md +368 -1
  3. synapse_sdk/devtools/docs/docs/api/clients/core-mixin.md +477 -0
  4. synapse_sdk/devtools/docs/docs/api/clients/data-collection-mixin.md +422 -0
  5. synapse_sdk/devtools/docs/docs/api/clients/hitl-mixin.md +554 -0
  6. synapse_sdk/devtools/docs/docs/api/clients/index.md +391 -0
  7. synapse_sdk/devtools/docs/docs/api/clients/integration-mixin.md +571 -0
  8. synapse_sdk/devtools/docs/docs/api/clients/ml-mixin.md +578 -0
  9. synapse_sdk/devtools/docs/docs/plugins/developing-upload-template.md +1463 -0
  10. synapse_sdk/devtools/docs/docs/plugins/export-plugins.md +161 -34
  11. synapse_sdk/devtools/docs/docs/plugins/upload-plugins.md +1497 -213
  12. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/annotation-mixin.md +289 -0
  13. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/backend.md +378 -11
  14. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/core-mixin.md +417 -0
  15. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/data-collection-mixin.md +356 -0
  16. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/hitl-mixin.md +192 -0
  17. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/index.md +391 -0
  18. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/integration-mixin.md +479 -0
  19. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/ml-mixin.md +284 -0
  20. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/developing-upload-template.md +1463 -0
  21. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/export-plugins.md +161 -34
  22. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/upload-plugins.md +1752 -572
  23. synapse_sdk/devtools/docs/sidebars.ts +7 -0
  24. synapse_sdk/plugins/README.md +1 -2
  25. synapse_sdk/plugins/categories/base.py +23 -0
  26. synapse_sdk/plugins/categories/export/actions/__init__.py +3 -0
  27. synapse_sdk/plugins/categories/export/actions/export/__init__.py +28 -0
  28. synapse_sdk/plugins/categories/export/actions/export/action.py +160 -0
  29. synapse_sdk/plugins/categories/export/actions/export/enums.py +113 -0
  30. synapse_sdk/plugins/categories/export/actions/export/exceptions.py +53 -0
  31. synapse_sdk/plugins/categories/export/actions/export/models.py +74 -0
  32. synapse_sdk/plugins/categories/export/actions/export/run.py +195 -0
  33. synapse_sdk/plugins/categories/export/actions/export/utils.py +187 -0
  34. synapse_sdk/plugins/categories/export/templates/plugin/__init__.py +1 -1
  35. synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +1 -2
  36. synapse_sdk/plugins/categories/upload/actions/upload/action.py +154 -531
  37. synapse_sdk/plugins/categories/upload/actions/upload/context.py +185 -0
  38. synapse_sdk/plugins/categories/upload/actions/upload/factory.py +143 -0
  39. synapse_sdk/plugins/categories/upload/actions/upload/models.py +66 -29
  40. synapse_sdk/plugins/categories/upload/actions/upload/orchestrator.py +182 -0
  41. synapse_sdk/plugins/categories/upload/actions/upload/registry.py +113 -0
  42. synapse_sdk/plugins/categories/upload/actions/upload/steps/__init__.py +1 -0
  43. synapse_sdk/plugins/categories/upload/actions/upload/steps/base.py +106 -0
  44. synapse_sdk/plugins/categories/upload/actions/upload/steps/cleanup.py +62 -0
  45. synapse_sdk/plugins/categories/upload/actions/upload/steps/collection.py +62 -0
  46. synapse_sdk/plugins/categories/upload/actions/upload/steps/generate.py +80 -0
  47. synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py +66 -0
  48. synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py +101 -0
  49. synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py +89 -0
  50. synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py +96 -0
  51. synapse_sdk/plugins/categories/upload/actions/upload/steps/validate.py +61 -0
  52. synapse_sdk/plugins/categories/upload/actions/upload/strategies/__init__.py +1 -0
  53. synapse_sdk/plugins/categories/upload/actions/upload/strategies/base.py +86 -0
  54. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/__init__.py +1 -0
  55. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/batch.py +39 -0
  56. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/single.py +34 -0
  57. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/__init__.py +1 -0
  58. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py +233 -0
  59. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py +238 -0
  60. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/__init__.py +1 -0
  61. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/excel.py +174 -0
  62. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/none.py +16 -0
  63. synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/__init__.py +1 -0
  64. synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/async_upload.py +109 -0
  65. synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/sync.py +43 -0
  66. synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/__init__.py +1 -0
  67. synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py +45 -0
  68. synapse_sdk/plugins/categories/upload/actions/upload/utils.py +194 -83
  69. synapse_sdk/plugins/categories/upload/templates/config.yaml +4 -0
  70. synapse_sdk/plugins/categories/upload/templates/plugin/__init__.py +269 -0
  71. synapse_sdk/plugins/categories/upload/templates/plugin/upload.py +71 -27
  72. synapse_sdk/plugins/models.py +5 -0
  73. {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/METADATA +2 -1
  74. {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/RECORD +78 -27
  75. synapse_sdk/plugins/categories/export/actions/export.py +0 -385
  76. synapse_sdk/plugins/categories/export/enums.py +0 -7
  77. {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/WHEEL +0 -0
  78. {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/entry_points.txt +0 -0
  79. {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/licenses/LICENSE +0 -0
  80. {synapse_sdk-2025.9.1.dist-info → synapse_sdk-2025.9.3.dist-info}/top_level.txt +0 -0
synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py
@@ -0,0 +1,89 @@
+ from ..context import StepResult, UploadContext
+ from ..enums import LogCode
+ from .base import BaseStep
+
+
+ class OrganizeFilesStep(BaseStep):
+     """Organize files according to specifications using file discovery strategy."""
+
+     @property
+     def name(self) -> str:
+         return 'organize_files'
+
+     @property
+     def progress_weight(self) -> float:
+         return 0.15
+
+     def execute(self, context: UploadContext) -> StepResult:
+         """Execute file organization step."""
+         file_discovery_strategy = context.strategies.get('file_discovery')
+         if not file_discovery_strategy:
+             return self.create_error_result('File discovery strategy not found')
+
+         if not context.file_specifications:
+             return self.create_error_result('File specifications not available')
+
+         try:
+             # Create type directories mapping
+             type_dirs = {}
+             for spec in context.file_specifications:
+                 spec_name = spec['name']
+                 spec_dir = context.pathlib_cwd / spec_name
+                 if spec_dir.exists() and spec_dir.is_dir():
+                     type_dirs[spec_name] = spec_dir
+
+             if type_dirs:
+                 context.run.log_message_with_code(LogCode.TYPE_DIRECTORIES_FOUND, list(type_dirs.keys()))
+             else:
+                 context.run.log_message_with_code(LogCode.NO_TYPE_DIRECTORIES)
+                 return self.create_success_result(data={'organized_files': []})
+
+             context.run.log_message_with_code(LogCode.TYPE_STRUCTURE_DETECTED)
+             context.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)
+
+             # Discover files in type directories
+             all_files = []
+             is_recursive = context.get_param('is_recursive', True)
+
+             for spec_name, dir_path in type_dirs.items():
+                 files_in_dir = file_discovery_strategy.discover(dir_path, is_recursive)
+                 all_files.extend(files_in_dir)
+
+             if not all_files:
+                 context.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
+                 return self.create_success_result(data={'organized_files': []})
+
+             # Organize files using strategy
+             organized_files = file_discovery_strategy.organize(
+                 all_files, context.file_specifications, context.metadata or {}, type_dirs
+             )
+
+             if organized_files:
+                 context.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(organized_files))
+                 context.add_organized_files(organized_files)
+
+             return self.create_success_result(
+                 data={'organized_files': organized_files},
+                 rollback_data={'files_count': len(organized_files), 'type_dirs': list(type_dirs.keys())},
+             )
+
+         except Exception as e:
+             return self.create_error_result(f'File organization failed: {str(e)}')
+
+     def can_skip(self, context: UploadContext) -> bool:
+         """File organization cannot be skipped."""
+         return False
+
+     def rollback(self, context: UploadContext) -> None:
+         """Rollback file organization."""
+         # Clear organized files
+         context.organized_files.clear()
+         context.run.log_message('Rolled back file organization')
+
+     def validate_prerequisites(self, context: UploadContext) -> None:
+         """Validate prerequisites for file organization."""
+         if not context.pathlib_cwd:
+             raise ValueError('Working directory path not set')
+
+         if not context.file_specifications:
+             raise ValueError('File specifications not available')
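
The step classes in this release share a common shape: validate_prerequisites, execute returning a StepResult, can_skip, rollback, and a progress_weight used for weighted progress reporting. A minimal sketch of how a driver could exercise these hooks, rolling back completed steps in reverse on failure — illustrative only; the shipped driver lives in orchestrator.py (+182 in this release, not excerpted here), and the StepResult shape is assumed from the create_success_result/create_error_result calls above:

from typing import Any, Dict, List, Optional


class StepResult:
    """Assumed result shape; mirrors the success/error helpers used by the steps."""

    def __init__(self, success: bool, data: Optional[Dict[str, Any]] = None, error: Optional[str] = None):
        self.success = success
        self.data = data or {}
        self.error = error


def run_steps(steps: List[Any], context: Any) -> bool:
    """Drive steps in order; on failure, roll back completed steps in reverse."""
    completed = []
    for step in steps:
        if step.can_skip(context):
            continue
        step.validate_prerequisites(context)  # raises on unmet prerequisites
        result = step.execute(context)
        if not result.success:
            for done in reversed(completed):
                done.rollback(context)
            return False
        completed.append(step)
    return True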
synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py
@@ -0,0 +1,96 @@
+ from synapse_sdk.plugins.exceptions import ActionError
+
+ from ..context import StepResult, UploadContext
+ from ..enums import LogCode, UploadStatus
+ from ..strategies.base import UploadConfig
+ from .base import BaseStep
+
+
+ class UploadFilesStep(BaseStep):
+     """Upload organized files using upload strategy."""
+
+     @property
+     def name(self) -> str:
+         return 'upload_files'
+
+     @property
+     def progress_weight(self) -> float:
+         return 0.30
+
+     def execute(self, context: UploadContext) -> StepResult:
+         """Execute file upload step."""
+         upload_strategy = context.strategies.get('upload')
+         if not upload_strategy:
+             return self.create_error_result('Upload strategy not found')
+
+         if not context.organized_files:
+             context.run.log_message_with_code(LogCode.NO_FILES_UPLOADED)
+             return self.create_error_result('No organized files to upload')
+
+         try:
+             # Setup progress tracking
+             organized_files_count = len(context.organized_files)
+             context.run.set_progress(0, organized_files_count, category='upload_data_files')
+             context.run.log_message_with_code(LogCode.UPLOADING_DATA_FILES)
+
+             # Initialize metrics
+             context.update_metrics('data_files', {'stand_by': organized_files_count, 'success': 0, 'failed': 0})
+
+             # Create upload configuration
+             upload_config = UploadConfig(
+                 use_async=context.get_param('use_async_upload', True),
+                 max_concurrent=context.get_param('max_concurrent_uploads', 10),
+                 chunked_threshold_mb=context.get_param('max_file_size_mb', 50),
+                 batch_size=context.get_param('upload_batch_size', 1),
+             )
+
+             # Execute upload using strategy
+             uploaded_files = upload_strategy.upload(context.organized_files, upload_config)
+
+             # Update context and metrics
+             context.add_uploaded_files(uploaded_files)
+
+             # Log upload results
+             for uploaded_file in uploaded_files:
+                 context.run.log_data_file(uploaded_file, UploadStatus.SUCCESS)
+
+             # Update final metrics
+             context.update_metrics(
+                 'data_files',
+                 {'stand_by': 0, 'success': len(uploaded_files), 'failed': organized_files_count - len(uploaded_files)},
+             )
+
+             # Complete progress
+             context.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
+
+             return self.create_success_result(
+                 data={'uploaded_files': uploaded_files},
+                 rollback_data={
+                     'uploaded_files_count': len(uploaded_files),
+                     'upload_method': 'async' if upload_config.use_async else 'sync',
+                 },
+             )
+
+         except Exception as e:
+             context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, str(e))
+             return self.create_error_result(f'File upload failed: {str(e)}')
+
+     def can_skip(self, context: UploadContext) -> bool:
+         """File upload cannot be skipped."""
+         return False
+
+     def rollback(self, context: UploadContext) -> None:
+         """Rollback file upload."""
+         # In a real implementation, this would delete uploaded files
+         # For now, just clear the uploaded files list and log
+         context.uploaded_files.clear()
+         context.run.log_message('Rolled back file uploads')
+
+     def validate_prerequisites(self, context: UploadContext) -> None:
+         """Validate prerequisites for file upload."""
+         if not context.organized_files:
+             raise ValueError('No organized files available for upload')
+
+         collection_id = context.get_param('data_collection')
+         if collection_id is None:
+             raise ActionError('Data collection parameter is required for upload')
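
Note how the step maps four optional action parameters onto UploadConfig (defined in strategies/base.py, later in this diff). A hypothetical params dict showing the knobs and their defaults:

# Hypothetical action parameters; the keys match the get_param calls above.
params = {
    'use_async_upload': False,    # default True  -> UploadConfig.use_async
    'max_concurrent_uploads': 4,  # default 10    -> UploadConfig.max_concurrent
    'max_file_size_mb': 100,      # default 50    -> UploadConfig.chunked_threshold_mb
    'upload_batch_size': 8,       # default 1     -> UploadConfig.batch_size
}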
synapse_sdk/plugins/categories/upload/actions/upload/steps/validate.py
@@ -0,0 +1,61 @@
+ from ..context import StepResult, UploadContext
+ from ..enums import LogCode
+ from .base import BaseStep
+
+
+ class ValidateFilesStep(BaseStep):
+     """Validate organized files against specifications."""
+
+     @property
+     def name(self) -> str:
+         return 'validate_files'
+
+     @property
+     def progress_weight(self) -> float:
+         return 0.10
+
+     def execute(self, context: UploadContext) -> StepResult:
+         """Execute file validation step."""
+         validation_strategy = context.strategies.get('validation')
+         if not validation_strategy:
+             return self.create_error_result('Validation strategy not found')
+
+         if not context.organized_files:
+             context.run.log_message_with_code(LogCode.NO_FILES_FOUND)
+             return self.create_error_result('No organized files to validate')
+
+         if not context.file_specifications:
+             return self.create_error_result('File specifications not available')
+
+         try:
+             # Validate organized files against specifications
+             validation_result = validation_strategy.validate_files(context.organized_files, context.file_specifications)
+
+             if not validation_result.valid:
+                 context.run.log_message_with_code(LogCode.VALIDATION_FAILED)
+                 error_msg = f'File validation failed: {", ".join(validation_result.errors)}'
+                 return self.create_error_result(error_msg)
+
+             return self.create_success_result(
+                 data={'validation_passed': True}, rollback_data={'validated_files_count': len(context.organized_files)}
+             )
+
+         except Exception as e:
+             return self.create_error_result(f'File validation failed: {str(e)}')
+
+     def can_skip(self, context: UploadContext) -> bool:
+         """File validation cannot be skipped."""
+         return False
+
+     def rollback(self, context: UploadContext) -> None:
+         """Rollback file validation."""
+         # Nothing specific to rollback for validation
+         context.run.log_message('Rolled back file validation')
+
+     def validate_prerequisites(self, context: UploadContext) -> None:
+         """Validate prerequisites for file validation."""
+         if not context.organized_files:
+             raise ValueError('No organized files available for validation')
+
+         if not context.file_specifications:
+             raise ValueError('File specifications not available for validation')
synapse_sdk/plugins/categories/upload/actions/upload/strategies/__init__.py
@@ -0,0 +1 @@
+ # Strategy pattern implementations for upload actions
synapse_sdk/plugins/categories/upload/actions/upload/strategies/base.py
@@ -0,0 +1,86 @@
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ from typing import Any, Dict, List
+
+
+ class ValidationResult:
+     """Result of validation operations."""
+
+     def __init__(self, valid: bool, errors: List[str] = None):
+         self.valid = valid
+         self.errors = errors or []
+
+     def __bool__(self):
+         return self.valid
+
+
+ class ValidationStrategy(ABC):
+     """Strategy interface for validation operations."""
+
+     @abstractmethod
+     def validate_params(self, params: Dict) -> ValidationResult:
+         """Validate action parameters."""
+         pass
+
+     @abstractmethod
+     def validate_files(self, files: List[Dict], specs: Dict) -> ValidationResult:
+         """Validate organized files against specifications."""
+         pass
+
+
+ class FileDiscoveryStrategy(ABC):
+     """Strategy interface for file discovery and organization."""
+
+     @abstractmethod
+     def discover(self, path: Path, recursive: bool) -> List[Path]:
+         """Discover files in the given path."""
+         pass
+
+     @abstractmethod
+     def organize(self, files: List[Path], specs: Dict, metadata: Dict, type_dirs: Dict = None) -> List[Dict]:
+         """Organize files according to specifications."""
+         pass
+
+
+ class MetadataStrategy(ABC):
+     """Strategy interface for metadata extraction and processing."""
+
+     @abstractmethod
+     def extract(self, source_path: Path) -> Dict[str, Dict[str, Any]]:
+         """Extract metadata from source (e.g., Excel file)."""
+         pass
+
+     @abstractmethod
+     def validate(self, metadata: Dict) -> ValidationResult:
+         """Validate extracted metadata."""
+         pass
+
+
+ class UploadConfig:
+     """Configuration for upload operations."""
+
+     def __init__(
+         self, use_async: bool = True, max_concurrent: int = 10, chunked_threshold_mb: int = 50, batch_size: int = 1
+     ):
+         self.use_async = use_async
+         self.max_concurrent = max_concurrent
+         self.chunked_threshold_mb = chunked_threshold_mb
+         self.batch_size = batch_size
+
+
+ class UploadStrategy(ABC):
+     """Strategy interface for file upload operations."""
+
+     @abstractmethod
+     def upload(self, files: List[Dict], config: UploadConfig) -> List[Dict]:
+         """Upload files to storage."""
+         pass
+
+
+ class DataUnitStrategy(ABC):
+     """Strategy interface for data unit generation."""
+
+     @abstractmethod
+     def generate(self, uploaded_files: List[Dict], batch_size: int) -> List[Dict]:
+         """Generate data units from uploaded files."""
+         pass
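
These interfaces are implemented by the concrete strategies added elsewhere in this release (strategies/validation/default.py, strategies/upload/async_upload.py, and so on). As a rough sketch of what a ValidationStrategy implementation could look like against this base — illustrative only, not the shipped default; it assumes synapse-sdk 2025.9.3 is installed and uses the organized-file shape {'files': ..., 'meta': ...} produced by flat.py below:

from typing import Dict, List

from synapse_sdk.plugins.categories.upload.actions.upload.strategies.base import (
    ValidationResult,
    ValidationStrategy,
)


class RequiredSpecValidation(ValidationStrategy):
    """Sketch: reject organized file sets that miss a required spec."""

    def validate_params(self, params: Dict) -> ValidationResult:
        errors = []
        if params.get('data_collection') is None:
            errors.append('data_collection is required')
        return ValidationResult(not errors, errors)

    # The steps pass a list of spec dicts despite the base's Dict hint.
    def validate_files(self, files: List[Dict], specs: List[Dict]) -> ValidationResult:
        required = [spec['name'] for spec in specs if spec.get('is_required', False)]
        errors = [
            f'{entry["meta"].get("origin_file_stem", "?")} is missing required spec {name}'
            for entry in files
            for name in required
            if name not in entry['files']
        ]
        return ValidationResult(not errors, errors)

Because ValidationResult defines __bool__, callers can test a result directly with `if not result:`, although ValidateFilesStep above checks result.valid explicitly.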
synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/__init__.py
@@ -0,0 +1 @@
+ # Data unit strategy implementations
synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/batch.py
@@ -0,0 +1,39 @@
+ from typing import Dict, List
+
+ from synapse_sdk.clients.utils import get_batched_list
+
+ from ...enums import LogCode, UploadStatus
+ from ..base import DataUnitStrategy
+
+
+ class BatchDataUnitStrategy(DataUnitStrategy):
+     """Batch data unit generation strategy."""
+
+     def __init__(self, context):
+         self.context = context
+
+     def generate(self, uploaded_files: List[Dict], batch_size: int) -> List[Dict]:
+         """Generate data units in batches."""
+         client = self.context.client
+         generated_data_units = []
+
+         # Use the same batching logic as the legacy implementation
+         batches = get_batched_list(uploaded_files, batch_size)
+
+         for batch in batches:
+             try:
+                 created_data_units = client.create_data_units(batch)
+                 generated_data_units.extend(created_data_units)
+
+                 # Log each created data unit
+                 for created_data_unit in created_data_units:
+                     self.context.run.log_data_unit(
+                         created_data_unit['id'], UploadStatus.SUCCESS, data_unit_meta=created_data_unit.get('meta')
+                     )
+             except Exception as e:
+                 self.context.run.log_message_with_code(LogCode.DATA_UNIT_BATCH_FAILED, str(e))
+                 # Log failed data units
+                 for _ in batch:
+                     self.context.run.log_data_unit(None, UploadStatus.FAILED, data_unit_meta=None)
+
+         return generated_data_units
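
get_batched_list is the existing helper from synapse_sdk.clients.utils; this diff does not change it. Assuming the usual consecutive-chunking semantics, a pure-Python equivalent for intuition:

from typing import List


def batched(items: List, size: int) -> List[List]:
    """Chunk items into consecutive batches of at most `size` elements."""
    return [items[i : i + size] for i in range(0, len(items), size)]


# With 250 uploaded files and batch_size=100, create_data_units would be
# called three times: two batches of 100 and one of 50.
assert batched(list(range(5)), 2) == [[0, 1], [2, 3], [4]]

The single-file variant below is effectively the batch strategy with a batch size of one; it ignores its batch_size argument and issues one create_data_units call per uploaded file.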
synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/single.py
@@ -0,0 +1,34 @@
+ from typing import Dict, List
+
+ from ...enums import LogCode, UploadStatus
+ from ..base import DataUnitStrategy
+
+
+ class SingleDataUnitStrategy(DataUnitStrategy):
+     """Single data unit generation strategy."""
+
+     def __init__(self, context):
+         self.context = context
+
+     def generate(self, uploaded_files: List[Dict], batch_size: int) -> List[Dict]:
+         """Generate data units individually."""
+         client = self.context.client
+         generated_data_units = []
+
+         for uploaded_file in uploaded_files:
+             try:
+                 # Create data unit for single file (batch of 1)
+                 created_data_units = client.create_data_units([uploaded_file])
+                 generated_data_units.extend(created_data_units)
+
+                 # Log each created data unit
+                 for created_data_unit in created_data_units:
+                     self.context.run.log_data_unit(
+                         created_data_unit['id'], UploadStatus.SUCCESS, data_unit_meta=created_data_unit.get('meta')
+                     )
+             except Exception as e:
+                 self.context.run.log_message_with_code(LogCode.DATA_UNIT_BATCH_FAILED, str(e))
+                 # Log failed data unit
+                 self.context.run.log_data_unit(None, UploadStatus.FAILED, data_unit_meta=None)
+
+         return generated_data_units
synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/__init__.py
@@ -0,0 +1 @@
+ # File discovery strategy implementations
synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py
@@ -0,0 +1,233 @@
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Dict, List
+
+ from ..base import FileDiscoveryStrategy
+
+
+ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
+     """Non-recursive file discovery strategy."""
+
+     def discover(self, path: Path, recursive: bool) -> List[Path]:
+         """Discover files non-recursively in the given path."""
+         return [file_path for file_path in path.glob('*') if file_path.is_file()]
+
+     def organize(self, files: List[Path], specs: Dict, metadata: Dict, type_dirs: Dict = None) -> List[Dict]:
+         """Organize files according to specifications with metadata."""
+         organized_files = []
+
+         # Use provided type_dirs or create fallback mapping
+         if type_dirs is None:
+             type_dirs = {}
+             for spec in specs:
+                 spec_name = spec['name']
+                 # Fallback: extract spec directory from file paths
+                 for file_path in files:
+                     # Check if this file's path contains the spec_name as a directory
+                     path_parts = file_path.parts
+                     if spec_name in path_parts:
+                         # Find the index of spec_name and reconstruct the path up to that directory
+                         spec_index = path_parts.index(spec_name)
+                         spec_dir = Path(*path_parts[: spec_index + 1])
+                         if spec_dir.exists() and spec_dir.is_dir():
+                             type_dirs[spec_name] = spec_dir
+                             break
+
+         if not type_dirs:
+             return organized_files
+
+         # Performance optimization 2: Build metadata index for faster lookups
+         metadata_index = self._build_metadata_index(metadata)
+
+         # Group files by dataset (flat discovery)
+         dataset_files = {}
+         required_specs = [spec['name'] for spec in specs if spec.get('is_required', False)]
+
+         for file_path in files:
+             # Determine which type directory this file belongs to
+             for spec_name, dir_path in type_dirs.items():
+                 if file_path.parent == dir_path:  # Only direct children
+                     file_name = file_path.stem
+
+                     if file_name not in dataset_files:
+                         dataset_files[file_name] = {}
+
+                     if spec_name not in dataset_files[file_name]:
+                         dataset_files[file_name][spec_name] = file_path
+                     else:
+                         # Keep the most recent file - only stat when needed
+                         existing_file = dataset_files[file_name][spec_name]
+                         try:
+                             if file_path.stat().st_mtime > existing_file.stat().st_mtime:
+                                 dataset_files[file_name][spec_name] = file_path
+                         except (OSError, IOError):
+                             # If stat fails, keep existing file
+                             pass
+
+         # Create organized files for datasets with all required files
+         for file_name, files_dict in sorted(dataset_files.items()):
+             if all(req in files_dict for req in required_specs):
+                 # Calculate most common file extension (optimized)
+                 file_extensions = {}
+                 for file_path in files_dict.values():
+                     ext = file_path.suffix.lower()
+                     if ext:
+                         file_extensions[ext] = file_extensions.get(ext, 0) + 1
+
+                 origin_file_extension = max(file_extensions, key=file_extensions.get) if file_extensions else ''
+
+                 meta_data = {
+                     'origin_file_stem': file_name,
+                     'origin_file_extension': origin_file_extension,
+                     'created_at': datetime.now().isoformat(),
+                 }
+
+                 # Add metadata if available - using optimized index lookup
+                 if metadata_index:
+                     matched_metadata = self._find_matching_metadata_optimized(file_name, files_dict, metadata_index)
+                     if matched_metadata:
+                         meta_data.update(matched_metadata)
+
+                 organized_files.append({'files': files_dict, 'meta': meta_data})
+
+         return organized_files
+
+     def _build_metadata_index(self, metadata: Dict) -> Dict:
+         """Build metadata index for faster lookups."""
+         if not metadata:
+             return {}
+
+         metadata_index = {'exact_stem': {}, 'exact_name': {}, 'stem_lookup': {}, 'partial_paths': {}, 'full_paths': {}}
+
+         for meta_key, meta_value in metadata.items():
+             meta_path = Path(meta_key)
+
+             # Index by stem
+             stem = meta_path.stem
+             if stem:
+                 metadata_index['exact_stem'][stem] = meta_value
+                 metadata_index['stem_lookup'][stem] = meta_value
+
+             # Index by full name
+             name = meta_path.name
+             if name:
+                 metadata_index['exact_name'][name] = meta_value
+
+             # Index for partial path matching
+             metadata_index['partial_paths'][meta_key] = meta_value
+
+             # Index for full path matching
+             metadata_index['full_paths'][meta_key] = meta_value
+
+         return metadata_index
+
+     def _find_matching_metadata_optimized(self, file_name: str, files_dict: Dict, metadata_index: Dict) -> Dict:
+         """Find matching metadata using optimized index lookups."""
+         if not metadata_index:
+             return {}
+
+         # Strategy 1: Exact stem match (O(1) lookup)
+         if file_name in metadata_index['exact_stem']:
+             return metadata_index['exact_stem'][file_name]
+
+         # Strategy 2: Exact filename match with extension (O(1) lookup)
+         sample_file = list(files_dict.values())[0] if files_dict else None
+         if sample_file:
+             full_filename = f'{file_name}{sample_file.suffix}'
+             if full_filename in metadata_index['exact_name']:
+                 return metadata_index['exact_name'][full_filename]
+
+             # Try sample file name
+             sample_filename = sample_file.name
+             if sample_filename in metadata_index['exact_name']:
+                 return metadata_index['exact_name'][sample_filename]
+
+         # Strategy 3: Stem lookup (already optimized above)
+         # This is covered by exact_stem lookup
+
+         # Strategy 4 & 5: Partial and full path matching (fallback to original logic for complex cases)
+         if sample_file:
+             file_path_str = str(sample_file)
+             file_path_posix = sample_file.as_posix()
+
+             # Check partial paths
+             for meta_key in metadata_index['partial_paths']:
+                 if (
+                     meta_key in file_path_str
+                     or meta_key in file_path_posix
+                     or file_path_str in meta_key
+                     or file_path_posix in meta_key
+                 ):
+                     return metadata_index['partial_paths'][meta_key]
+
+         return {}
+
+     def _find_matching_metadata(self, file_name: str, files_dict: Dict, metadata: Dict) -> Dict:
+         """Find matching metadata using comprehensive pattern matching.
+
+         Matching priority:
+         1. Exact stem match (highest priority)
+         2. Exact filename match (with extension)
+         3. Metadata key stem matches file stem
+         4. Partial path matching
+         5. Full path matching
+         """
+         if not metadata:
+             return {}
+
+         # Get sample file for extension and path information
+         sample_file = list(files_dict.values())[0] if files_dict else None
+
+         # Strategy 1: Exact stem match (highest priority)
+         if file_name in metadata:
+             return metadata[file_name]
+
+         # Strategy 2: Exact filename match (with extension)
+         if sample_file:
+             full_filename = f'{file_name}{sample_file.suffix}'
+             if full_filename in metadata:
+                 return metadata[full_filename]
+
+             # Also try with sample file name
+             sample_filename = sample_file.name
+             if sample_filename in metadata:
+                 return metadata[sample_filename]
+
+         # Strategy 3: Metadata key stem matches file stem
+         for meta_key in metadata.keys():
+             meta_stem = Path(meta_key).stem
+             if meta_stem == file_name:
+                 return metadata[meta_key]
+
+         # Strategy 4: Partial path matching
+         if sample_file:
+             file_path_parts = sample_file.parts
+             for meta_key in metadata.keys():
+                 meta_path = Path(meta_key)
+                 # Check if any part of the metadata key matches our file path parts
+                 for part in file_path_parts:
+                     if part in str(meta_path) or str(meta_path) in part:
+                         # Additional validation: ensure it's a reasonable match
+                         if meta_path.stem == file_name or meta_path.name == sample_file.name or part == meta_path.stem:
+                             return metadata[meta_key]
+
+         # Strategy 5: Full path matching
+         if sample_file:
+             full_path_str = str(sample_file)
+             full_path_posix = sample_file.as_posix()
+
+             for meta_key in metadata.keys():
+                 # Direct path match
+                 if meta_key == full_path_str or meta_key == full_path_posix:
+                     return metadata[meta_key]
+
+                 # Relative path match (check if meta_key is contained in our path)
+                 if meta_key in full_path_str or meta_key in full_path_posix:
+                     return metadata[meta_key]
+
+                 # Reverse match (check if our path is contained in meta_key)
+                 if full_path_str in meta_key or full_path_posix in meta_key:
+                     return metadata[meta_key]
+
+         # No match found
+         return {}
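
The matching strategies are tried strictly in order, so a stem-level metadata key shadows a path-level one for the same dataset. A small illustration of the index-based lookup, assuming synapse-sdk 2025.9.3 is installed (the private helpers are called directly here purely for demonstration):

from pathlib import Path

from synapse_sdk.plugins.categories.upload.actions.upload.strategies.file_discovery.flat import (
    FlatFileDiscoveryStrategy,
)

strategy = FlatFileDiscoveryStrategy()
index = strategy._build_metadata_index({
    'scan_001': {'site': 'A'},       # stem-style key
    'scan_002.dcm': {'site': 'B'},   # filename-style key; also indexed by stem 'scan_002'
})

# Strategy 1: the dataset stem resolves via the exact_stem index in O(1).
files_dict = {'image': Path('raw/image/scan_002.dcm')}
assert strategy._find_matching_metadata_optimized('scan_002', files_dict, index) == {'site': 'B'}
assert strategy._find_matching_metadata_optimized('scan_001', {}, index) == {'site': 'A'}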