synapse-sdk 2025.9.5-py3-none-any.whl → 2025.10.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (78)
  1. synapse_sdk/clients/base.py +129 -9
  2. synapse_sdk/devtools/docs/docs/api/clients/base.md +230 -8
  3. synapse_sdk/devtools/docs/docs/api/plugins/models.md +58 -3
  4. synapse_sdk/devtools/docs/docs/plugins/categories/neural-net-plugins/train-action-overview.md +663 -0
  5. synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/pre-annotation-plugin-overview.md +198 -0
  6. synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/to-task-action-development.md +1645 -0
  7. synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/to-task-overview.md +717 -0
  8. synapse_sdk/devtools/docs/docs/plugins/categories/pre-annotation-plugins/to-task-template-development.md +1380 -0
  9. synapse_sdk/devtools/docs/docs/plugins/categories/upload-plugins/upload-plugin-action.md +934 -0
  10. synapse_sdk/devtools/docs/docs/plugins/categories/upload-plugins/upload-plugin-overview.md +585 -0
  11. synapse_sdk/devtools/docs/docs/plugins/categories/upload-plugins/upload-plugin-template.md +715 -0
  12. synapse_sdk/devtools/docs/docs/plugins/export-plugins.md +39 -0
  13. synapse_sdk/devtools/docs/docs/plugins/plugins.md +12 -5
  14. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/base.md +230 -8
  15. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/plugins/models.md +114 -0
  16. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/neural-net-plugins/train-action-overview.md +621 -0
  17. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/pre-annotation-plugin-overview.md +198 -0
  18. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/to-task-action-development.md +1645 -0
  19. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/to-task-overview.md +717 -0
  20. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/pre-annotation-plugins/to-task-template-development.md +1380 -0
  21. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/upload-plugins/upload-plugin-action.md +934 -0
  22. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/upload-plugins/upload-plugin-overview.md +585 -0
  23. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/categories/upload-plugins/upload-plugin-template.md +715 -0
  24. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/export-plugins.md +39 -0
  25. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current.json +16 -4
  26. synapse_sdk/devtools/docs/sidebars.ts +45 -1
  27. synapse_sdk/plugins/README.md +487 -80
  28. synapse_sdk/plugins/categories/base.py +1 -0
  29. synapse_sdk/plugins/categories/export/actions/export/action.py +8 -3
  30. synapse_sdk/plugins/categories/export/actions/export/utils.py +108 -8
  31. synapse_sdk/plugins/categories/export/templates/config.yaml +18 -0
  32. synapse_sdk/plugins/categories/export/templates/plugin/export.py +97 -0
  33. synapse_sdk/plugins/categories/neural_net/actions/train.py +592 -22
  34. synapse_sdk/plugins/categories/neural_net/actions/tune.py +150 -3
  35. synapse_sdk/plugins/categories/pre_annotation/actions/__init__.py +4 -0
  36. synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/__init__.py +3 -0
  37. synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/action.py +10 -0
  38. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/__init__.py +28 -0
  39. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/action.py +145 -0
  40. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/enums.py +269 -0
  41. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/exceptions.py +14 -0
  42. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/factory.py +76 -0
  43. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/models.py +97 -0
  44. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/orchestrator.py +250 -0
  45. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/run.py +64 -0
  46. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/__init__.py +17 -0
  47. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/annotation.py +284 -0
  48. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/base.py +170 -0
  49. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/extraction.py +83 -0
  50. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/metrics.py +87 -0
  51. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/preprocessor.py +127 -0
  52. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/validation.py +143 -0
  53. synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +2 -1
  54. synapse_sdk/plugins/categories/upload/actions/upload/action.py +8 -1
  55. synapse_sdk/plugins/categories/upload/actions/upload/context.py +0 -1
  56. synapse_sdk/plugins/categories/upload/actions/upload/models.py +134 -94
  57. synapse_sdk/plugins/categories/upload/actions/upload/steps/cleanup.py +2 -2
  58. synapse_sdk/plugins/categories/upload/actions/upload/steps/generate.py +6 -2
  59. synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py +24 -9
  60. synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py +130 -18
  61. synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py +147 -37
  62. synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py +10 -5
  63. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py +31 -6
  64. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py +65 -37
  65. synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py +17 -2
  66. synapse_sdk/plugins/categories/upload/templates/README.md +394 -0
  67. synapse_sdk/plugins/models.py +62 -0
  68. synapse_sdk/utils/file/download.py +261 -0
  69. {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/METADATA +15 -2
  70. {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/RECORD +74 -43
  71. synapse_sdk/devtools/docs/docs/plugins/developing-upload-template.md +0 -1463
  72. synapse_sdk/devtools/docs/docs/plugins/upload-plugins.md +0 -1964
  73. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/developing-upload-template.md +0 -1463
  74. synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/upload-plugins.md +0 -2077
  75. {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/WHEEL +0 -0
  76. {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/entry_points.txt +0 -0
  77. {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/licenses/LICENSE +0 -0
  78. {synapse_sdk-2025.9.5.dist-info → synapse_sdk-2025.10.6.dist-info}/top_level.txt +0 -0

synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py
@@ -29,19 +29,34 @@ class InitializeStep(BaseStep):
         except Exception as e:
             return self.create_error_result(f'Failed to get storage {storage_id}: {str(e)}')
 
-        # Get and validate path
+        # Check if we're in multi-path mode
+        use_single_path = context.get_param('use_single_path', True)
+
+        # Get and validate path (only required in single-path mode)
         path = context.get_param('path')
-        if path is None:
-            return self.create_error_result('Path parameter is required')
+        pathlib_cwd = None
 
-        try:
-            pathlib_cwd = get_pathlib(storage, path)
-            context.set_pathlib_cwd(pathlib_cwd)
-        except Exception as e:
-            return self.create_error_result(f'Failed to get path {path}: {str(e)}')
+        if use_single_path:
+            # Single-path mode: global path is required
+            if path is None:
+                return self.create_error_result('Path parameter is required in single-path mode')
+
+            try:
+                pathlib_cwd = get_pathlib(storage, path)
+                context.set_pathlib_cwd(pathlib_cwd)
+            except Exception as e:
+                return self.create_error_result(f'Failed to get path {path}: {str(e)}')
+        else:
+            # Multi-path mode: global path is optional (each asset has its own path)
+            if path:
+                try:
+                    pathlib_cwd = get_pathlib(storage, path)
+                    context.set_pathlib_cwd(pathlib_cwd)
+                except Exception as e:
+                    return self.create_error_result(f'Failed to get path {path}: {str(e)}')
 
         # Return success with rollback data
-        rollback_data = {'storage_id': storage_id, 'path': path}
+        rollback_data = {'storage_id': storage_id, 'path': path, 'use_single_path': use_single_path}
 
         return self.create_success_result(
             data={'storage': storage, 'pathlib_cwd': pathlib_cwd}, rollback_data=rollback_data
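
For orientation, a minimal sketch of the run parameters each branch above expects, inferred from this hunk together with the organize-step hunks further down; the concrete values and the storage id are hypothetical, and the authoritative schema lives in the upload action's models.py.

    # Single-path mode (default): a global 'path' is required and resolved into pathlib_cwd.
    single_path_params = {
        'use_single_path': True,
        'storage_id': 1,               # hypothetical storage id
        'path': 'datasets/batch-01',   # hypothetical path inside that storage
    }

    # Multi-path mode: the global 'path' becomes optional; per-spec paths are supplied
    # via 'assets' and consumed later by OrganizeFilesStep (see below).
    multi_path_params = {
        'use_single_path': False,
        'storage_id': 1,
        'assets': {'image': {'path': 'raw/images', 'is_recursive': True}},
    }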

synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py
@@ -1,8 +1,11 @@
+import base64
+import tempfile
 from pathlib import Path
 
 from ..context import StepResult, UploadContext
 from ..enums import LogCode
 from ..exceptions import ExcelParsingError, ExcelSecurityError
+from ..models import ExcelMetadataFile
 from .base import BaseStep
 
 

@@ -25,28 +28,48 @@ class ProcessMetadataStep(BaseStep):
             return self.create_success_result(data={'metadata': {}})
 
         excel_metadata = {}
+        temp_file_to_cleanup = None
 
         try:
-            # Check if Excel metadata path is specified
-            excel_metadata_path = context.get_param('excel_metadata_path')
-            if excel_metadata_path:
-                # Convert string to Path object
-                if isinstance(excel_metadata_path, str):
-                    excel_metadata_path = Path(excel_metadata_path)
-
-                if excel_metadata_path.exists() and excel_metadata_path.is_file():
-                    excel_path = excel_metadata_path
-                else:
-                    excel_path = context.pathlib_cwd / excel_metadata_path
-                    if not excel_path.exists():
+            # Check if Excel metadata is specified - try both parameters
+            # TODO: Plan to deprecate excel_metadata_path in a few versions (backward compatibility)
+            excel_metadata_path_config = context.get_param('excel_metadata_path')
+            excel_metadata_config = context.get_param('excel_metadata')
+
+            if excel_metadata_path_config:
+                # Traditional path-based approach (will be deprecated in future)
+                excel_path, is_temp = self._resolve_excel_path_from_string(excel_metadata_path_config, context)
+
+                if not excel_path or not excel_path.exists():
+                    context.run.log_message_with_code(LogCode.EXCEL_FILE_NOT_FOUND_PATH)
+                    return self.create_success_result(data={'metadata': {}})
+
+                excel_metadata = metadata_strategy.extract(excel_path)
+
+            elif excel_metadata_config:
+                # Base64 encoded approach
+                excel_path, is_temp = self._resolve_excel_path_from_base64(excel_metadata_config, context)
+
+                if not excel_path or not excel_path.exists():
                     context.run.log_message_with_code(LogCode.EXCEL_FILE_NOT_FOUND_PATH)
                     return self.create_success_result(data={'metadata': {}})
+
+                # Track temp file for cleanup
+                if is_temp:
+                    temp_file_to_cleanup = excel_path
+
                 excel_metadata = metadata_strategy.extract(excel_path)
             else:
                 # Look for default metadata files (meta.xlsx, meta.xls)
-                excel_path = self._find_excel_metadata_file(context.pathlib_cwd)
-                if excel_path:
-                    excel_metadata = metadata_strategy.extract(excel_path)
+                # Only possible in single-path mode where pathlib_cwd is set
+                if context.pathlib_cwd:
+                    excel_path = self._find_excel_metadata_file(context.pathlib_cwd)
+                    if excel_path:
+                        excel_metadata = metadata_strategy.extract(excel_path)
+                else:
+                    context.run.log_message(
+                        'No Excel metadata specified and multi-path mode - skipping metadata processing'
+                    )
 
             # Validate extracted metadata
             if excel_metadata:

@@ -65,9 +88,9 @@ class ProcessMetadataStep(BaseStep):
             return self.create_error_result(f'Excel security violation: {str(e)}')
 
         except ExcelParsingError as e:
-            # If excel_metadata_path was specified, this is an error
+            # If excel_metadata_path or excel_metadata was specified, this is an error
             # If we were just looking for default files, it's not an error
-            if context.get_param('excel_metadata_path'):
+            if context.get_param('excel_metadata_path') or context.get_param('excel_metadata'):
                 context.run.log_message_with_code(LogCode.EXCEL_PARSING_ERROR, str(e))
                 return self.create_error_result(f'Excel parsing error: {str(e)}')
             else:

@@ -77,6 +100,15 @@ class ProcessMetadataStep(BaseStep):
         except Exception as e:
             return self.create_error_result(f'Unexpected error processing metadata: {str(e)}')
 
+        finally:
+            # Clean up temporary file if it was created from base64
+            if temp_file_to_cleanup and temp_file_to_cleanup.exists():
+                try:
+                    temp_file_to_cleanup.unlink()
+                    context.run.log_message(f'Cleaned up temporary Excel file: {temp_file_to_cleanup}')
+                except Exception as e:
+                    context.run.log_message(f'Failed to clean up temporary file {temp_file_to_cleanup}: {str(e)}')
+
     def can_skip(self, context: UploadContext) -> bool:
         """Metadata step can be skipped if no metadata strategy is configured."""
         return 'metadata' not in context.strategies

@@ -86,8 +118,88 @@ class ProcessMetadataStep(BaseStep):
         # Clear any loaded metadata
         context.metadata.clear()
 
+    def _resolve_excel_path_from_string(self, excel_path_str: str, context: UploadContext) -> tuple[Path | None, bool]:
+        """Resolve Excel metadata path from a string path.
+
+        Note: This method supports the excel_metadata_path parameter which will be deprecated
+        in a future version. Consider using _resolve_excel_path_from_base64 instead.
+
+        Args:
+            excel_path_str: File path string to the Excel metadata file
+            context: Upload context for resolving relative paths
+
+        Returns:
+            Tuple of (resolved_path, is_temporary_file)
+            - resolved_path: Path object pointing to the Excel file, or None if resolution failed
+            - is_temporary_file: Always False for path-based approach
+
+        Examples:
+            >>> path, is_temp = self._resolve_excel_path_from_string("/data/meta.xlsx", context)
+        """
+        # TODO: Plan to deprecate this method in a few versions (backward compatibility)
+        # Try absolute path first
+        path = Path(excel_path_str)
+        if path.exists() and path.is_file():
+            return path, False
+
+        # Try relative to cwd (only if pathlib_cwd is set)
+        if context.pathlib_cwd:
+            path = context.pathlib_cwd / excel_path_str
+            return (path, False) if path.exists() else (None, False)
+
+        # In multi-path mode without pathlib_cwd, can only use absolute paths
+        return (None, False)
+
+    def _resolve_excel_path_from_base64(
+        self, excel_config: dict | ExcelMetadataFile, context: UploadContext
+    ) -> tuple[Path | None, bool]:
+        """Resolve Excel metadata path from base64 encoded data.
+
+        Args:
+            excel_config: Either a dict or an ExcelMetadataFile object with base64 data
+            context: Upload context for logging
+
+        Returns:
+            Tuple of (resolved_path, is_temporary_file)
+            - resolved_path: Path object pointing to the temporary Excel file, or None if decoding failed
+            - is_temporary_file: Always True for base64 approach (requires cleanup)
+
+        Examples:
+            >>> config = ExcelMetadataFile(data="UEsDB...", filename="meta.xlsx")
+            >>> path, is_temp = self._resolve_excel_path_from_base64(config, context)
+        """
+        if isinstance(excel_config, dict):
+            excel_config = ExcelMetadataFile(**excel_config)
+
+        try:
+            # Decode base64 data
+            decoded_data = base64.b64decode(excel_config.data, validate=True)
+
+            # Create temp file
+            temp_dir = Path(tempfile.gettempdir())
+            filename = excel_config.filename
+            temp_file = temp_dir / filename
+            temp_file.write_bytes(decoded_data)
+
+            context.run.log_message(f'Decoded base64 Excel metadata to temporary file: {temp_file}')
+            return temp_file, True
+
+        except Exception as e:
+            context.run.log_message(f'Failed to decode base64 Excel metadata: {str(e)}')
+            return None, False
+
     def _find_excel_metadata_file(self, pathlib_cwd: Path) -> Path:
-        """Find default Excel metadata file."""
+        """Find default Excel metadata file.
+
+        Args:
+            pathlib_cwd: Working directory path (must not be None)
+
+        Returns:
+            Path to Excel metadata file, or None if not found
+        """
+        if not pathlib_cwd:
+            return None
+
         # Check .xlsx first as it's more common
         excel_path = pathlib_cwd / 'meta.xlsx'
         if excel_path.exists():
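
To illustrate the new base64 path, here is a small client-side sketch (an assumed helper, not part of the SDK) that packages a local meta.xlsx into the 'excel_metadata' parameter consumed by _resolve_excel_path_from_base64; the field names 'data' and 'filename' follow the ExcelMetadataFile example in the docstring above.

    import base64
    from pathlib import Path

    def build_excel_metadata_param(xlsx_path: str) -> dict:
        """Encode a local Excel file for the 'excel_metadata' parameter (hypothetical helper)."""
        raw = Path(xlsx_path).read_bytes()
        return {
            'data': base64.b64encode(raw).decode('ascii'),  # decoded by the step with validate=True
            'filename': Path(xlsx_path).name,               # becomes the temp file name during extraction
        }

    params = {'excel_metadata': build_excel_metadata_param('meta.xlsx')}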

synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py
@@ -24,51 +24,152 @@ class OrganizeFilesStep(BaseStep):
             return self.create_error_result('File specifications not available')
 
         try:
-            # Create type directories mapping
-            type_dirs = {}
-            for spec in context.file_specifications:
-                spec_name = spec['name']
-                spec_dir = context.pathlib_cwd / spec_name
-                if spec_dir.exists() and spec_dir.is_dir():
-                    type_dirs[spec_name] = spec_dir
-
-            if type_dirs:
-                context.run.log_message_with_code(LogCode.TYPE_DIRECTORIES_FOUND, list(type_dirs.keys()))
+            # Check which mode we're in
+            use_single_path = context.get_param('use_single_path', True)
+
+            if use_single_path:
+                # Single path mode: all assets use same base path
+                return self._execute_single_path_mode(context, file_discovery_strategy)
             else:
-                context.run.log_message_with_code(LogCode.NO_TYPE_DIRECTORIES)
-                return self.create_success_result(data={'organized_files': []})
+                # Multi-path mode: each asset has its own path
+                return self._execute_multi_path_mode(context, file_discovery_strategy)
 
-            context.run.log_message_with_code(LogCode.TYPE_STRUCTURE_DETECTED)
-            context.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)
+        except Exception as e:
+            return self.create_error_result(f'File organization failed: {str(e)}')
 
-            # Discover files in type directories
-            all_files = []
-            is_recursive = context.get_param('is_recursive', True)
+    def _execute_single_path_mode(self, context: UploadContext, file_discovery_strategy) -> StepResult:
+        """Execute file organization in single path mode (traditional)."""
+        # Create type directories mapping
+        type_dirs = {}
+        for spec in context.file_specifications:
+            spec_name = spec['name']
+            spec_dir = context.pathlib_cwd / spec_name
+            if spec_dir.exists() and spec_dir.is_dir():
+                type_dirs[spec_name] = spec_dir
+
+        if type_dirs:
+            context.run.log_message_with_code(LogCode.TYPE_DIRECTORIES_FOUND, list(type_dirs.keys()))
+        else:
+            context.run.log_message_with_code(LogCode.NO_TYPE_DIRECTORIES)
+            return self.create_success_result(data={'organized_files': []})
+
+        context.run.log_message_with_code(LogCode.TYPE_STRUCTURE_DETECTED)
+        context.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)
+
+        # Discover files in type directories
+        all_files = []
+        is_recursive = context.get_param('is_recursive', True)
+
+        for spec_name, dir_path in type_dirs.items():
+            files_in_dir = file_discovery_strategy.discover(dir_path, is_recursive)
+            all_files.extend(files_in_dir)
+
+        if not all_files:
+            context.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
+            return self.create_success_result(data={'organized_files': []})
+
+        # Organize files using strategy
+        organized_files = file_discovery_strategy.organize(
+            all_files, context.file_specifications, context.metadata or {}, type_dirs
+        )
+
+        if organized_files:
+            context.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(organized_files))
+            context.add_organized_files(organized_files)
+
+        return self.create_success_result(
+            data={'organized_files': organized_files},
+            rollback_data={'files_count': len(organized_files), 'type_dirs': list(type_dirs.keys())},
+        )
+
+    def _execute_multi_path_mode(self, context: UploadContext, file_discovery_strategy) -> StepResult:
+        """Execute file organization in multi-path mode (each asset has own path)."""
+        from synapse_sdk.utils.storage import get_pathlib
+
+        assets = context.get_param('assets', {})
+        if not assets:
+            return self.create_error_result('Multi-path mode requires assets configuration')
+
+        # Validate that all required specs have asset paths
+        required_specs = [spec['name'] for spec in context.file_specifications if spec.get('is_required', False)]
+        missing_required = [spec for spec in required_specs if spec not in assets]
+
+        if missing_required:
+            return self.create_error_result(
+                f'Multi-path mode requires asset paths for required specs: {", ".join(missing_required)}'
+            )
 
-            for spec_name, dir_path in type_dirs.items():
-                files_in_dir = file_discovery_strategy.discover(dir_path, is_recursive)
-                all_files.extend(files_in_dir)
+        context.run.log_message(f'Using multi-path mode with {len(assets)} asset configurations')
+        context.run.log_message_with_code(LogCode.FILE_ORGANIZATION_STARTED)
+
+        # Collect all files and specs first
+        all_files = []
+        type_dirs = {}
+        specs_with_files = []
+
+        for spec in context.file_specifications:
+            spec_name = spec['name']
+            is_required = spec.get('is_required', False)
+
+            # Skip if no asset configuration for this spec (only allowed for optional specs)
+            if spec_name not in assets:
+                if is_required:
+                    # This should not happen due to validation above, but double-check
+                    return self.create_error_result(f'Required spec {spec_name} missing asset path')
+                context.run.log_message(f'Skipping optional spec {spec_name}: no asset path configured')
+                continue
+
+            asset_config = assets[spec_name]
+
+            # Get the asset path from storage
+            try:
+                asset_path = get_pathlib(context.storage, asset_config.get('path', ''))
+                type_dirs[spec_name] = asset_path
+            except Exception as e:
+                context.run.log_message(f'Error accessing path for {spec_name}: {str(e)}', 'WARNING')
+                continue
+
+            if not asset_path.exists():
+                context.run.log_message(f'Path does not exist for {spec_name}: {asset_config.path}', 'WARNING')
+                continue
+
+            # Discover files for this asset
+            is_recursive = asset_config.get('is_recursive', True)
+            context.run.log_message(
+                f'Discovering files for {spec_name} at {asset_config.get("path", "")} (recursive={is_recursive})'
+            )
 
-            if not all_files:
-                context.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
-                return self.create_success_result(data={'organized_files': []})
+            files = file_discovery_strategy.discover(asset_path, is_recursive)
 
-            # Organize files using strategy
-            organized_files = file_discovery_strategy.organize(
-                all_files, context.file_specifications, context.metadata or {}, type_dirs
-            )
+            if not files:
+                context.run.log_message(f'No files found for {spec_name}', 'WARNING')
+                continue
 
-            if organized_files:
-                context.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(organized_files))
-                context.add_organized_files(organized_files)
+            all_files.extend(files)
+            specs_with_files.append(spec)
+            context.run.log_message(f'Found {len(files)} files for {spec_name}')
 
-            return self.create_success_result(
-                data={'organized_files': organized_files},
-                rollback_data={'files_count': len(organized_files), 'type_dirs': list(type_dirs.keys())},
+        # Organize all files together to group by dataset_key
+        all_organized_files = []
+        if all_files and specs_with_files:
+            context.run.log_message(f'Organizing {len(all_files)} files across {len(specs_with_files)} specs')
+            context.run.log_message(f'Type directories: {list(type_dirs.keys())}')
+
+            all_organized_files = file_discovery_strategy.organize(
+                all_files, specs_with_files, context.metadata or {}, type_dirs
             )
 
-        except Exception as e:
-            return self.create_error_result(f'File organization failed: {str(e)}')
+        if all_organized_files:
+            context.run.log_message_with_code(LogCode.FILES_DISCOVERED, len(all_organized_files))
+            context.run.log_message(f'Created {len(all_organized_files)} data units from {len(all_files)} files')
+            context.add_organized_files(all_organized_files)
+        else:
+            context.run.log_message_with_code(LogCode.NO_FILES_FOUND_WARNING)
+
+        return self.create_success_result(
+            data={'organized_files': all_organized_files},
+            rollback_data={'files_count': len(all_organized_files), 'type_dirs': list(type_dirs.keys())},
+        )
 
     def can_skip(self, context: UploadContext) -> bool:
        """File organization cannot be skipped."""

@@ -82,8 +183,17 @@ class OrganizeFilesStep(BaseStep):
 
     def validate_prerequisites(self, context: UploadContext) -> None:
         """Validate prerequisites for file organization."""
-        if not context.pathlib_cwd:
-            raise ValueError('Working directory path not set')
+        use_single_path = context.get_param('use_single_path', True)
+
+        # In single-path mode, pathlib_cwd is required
+        if use_single_path and not context.pathlib_cwd:
+            raise ValueError('Working directory path not set in single-path mode')
+
+        # In multi-path mode, pathlib_cwd is optional (each asset has its own path)
+        if not use_single_path:
+            assets = context.get_param('assets', {})
+            if not assets:
+                raise ValueError('Multi-path mode requires assets configuration')
 
         if not context.file_specifications:
             raise ValueError('File specifications not available')
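
A hedged sketch of what a multi-path 'assets' configuration might look like, based on _execute_multi_path_mode and validate_prerequisites above: every spec marked is_required in the file specifications must have an entry, optional specs may be omitted, and each entry carries its own storage path and recursion flag. The spec names and paths below are hypothetical.

    params = {
        'use_single_path': False,
        'assets': {
            # Required specs must all appear here, otherwise the step returns an error.
            'image': {'path': 'raw/images', 'is_recursive': True},
            'label': {'path': 'raw/labels', 'is_recursive': False},
            # Optional specs with no entry are skipped with a log message.
        },
    }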

synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py
@@ -34,7 +34,9 @@ class UploadFilesStep(BaseStep):
         context.run.log_message_with_code(LogCode.UPLOADING_DATA_FILES)
 
         # Initialize metrics
-        context.update_metrics('data_files', {'stand_by': organized_files_count, 'success': 0, 'failed': 0})
+        initial_metrics = {'stand_by': organized_files_count, 'success': 0, 'failed': 0}
+        context.update_metrics('data_files', initial_metrics)
+        context.run.set_metrics(initial_metrics, category='data_files')
 
         # Create upload configuration
         upload_config = UploadConfig(

@@ -55,10 +57,13 @@ class UploadFilesStep(BaseStep):
             context.run.log_data_file(uploaded_file, UploadStatus.SUCCESS)
 
         # Update final metrics
-        context.update_metrics(
-            'data_files',
-            {'stand_by': 0, 'success': len(uploaded_files), 'failed': organized_files_count - len(uploaded_files)},
-        )
+        final_metrics = {
+            'stand_by': 0,
+            'success': len(uploaded_files),
+            'failed': organized_files_count - len(uploaded_files),
+        }
+        context.update_metrics('data_files', final_metrics)
+        context.run.set_metrics(final_metrics, category='data_files')
 
         # Complete progress
         context.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
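
For reference, the metric payloads that are now mirrored to run.set_metrics in the hunks above follow this shape; the counts here are hypothetical.

    organized_files_count, uploaded = 10, 8
    initial_metrics = {'stand_by': organized_files_count, 'success': 0, 'failed': 0}
    final_metrics = {'stand_by': 0, 'success': uploaded, 'failed': organized_files_count - uploaded}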

synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py
@@ -10,7 +10,11 @@ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
 
     def discover(self, path: Path, recursive: bool) -> List[Path]:
         """Discover files non-recursively in the given path."""
-        return [file_path for file_path in path.glob('*') if file_path.is_file()]
+        # Exclude system files
+        excluded_files = {'.DS_Store', 'Thumbs.db', 'desktop.ini'}
+        return [
+            file_path for file_path in path.glob('*') if file_path.is_file() and file_path.name not in excluded_files
+        ]
 
     def organize(self, files: List[Path], specs: Dict, metadata: Dict, type_dirs: Dict = None) -> List[Dict]:
         """Organize files according to specifications with metadata."""

@@ -39,9 +43,14 @@ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
         # Performance optimization 2: Build metadata index for faster lookups
         metadata_index = self._build_metadata_index(metadata)
 
-        # Group files by dataset (flat discovery)
+        # Group files by dataset_key (stem-based matching) - flat discovery (no subdirectories)
+        # Strategy:
+        # 1. Group all files (required + optional) by their file stem
+        # 2. Only create data units for groups that have ALL required files
+        # 3. Optional files are automatically included if they match the stem
         dataset_files = {}
         required_specs = [spec['name'] for spec in specs if spec.get('is_required', False)]
+        optional_specs = [spec['name'] for spec in specs if not spec.get('is_required', False)]
 
         for file_path in files:
             # Determine which type directory this file belongs to

@@ -64,20 +73,36 @@ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
                 # If stat fails, keep existing file
                 pass
 
-        # Create organized files for datasets with all required files
+        # Create organized files ONLY for datasets with ALL required files
+        # Optional files are included automatically if they match the stem
         for file_name, files_dict in sorted(dataset_files.items()):
-            if all(req in files_dict for req in required_specs):
-                # Calculate most common file extension (optimized)
+            # Check if all required files are present
+            has_all_required = all(req in files_dict for req in required_specs)
+
+            if has_all_required:
+                # Extract original file stem from actual file paths (more reliable)
+                # Collect stems from all files in the group
+                file_stems = {}
                 file_extensions = {}
+
                 for file_path in files_dict.values():
+                    stem = file_path.stem
                     ext = file_path.suffix.lower()
+
+                    # Count stems (to handle multiple files with slightly different names)
+                    if stem:
+                        file_stems[stem] = file_stems.get(stem, 0) + 1
+
+                    # Count extensions
                     if ext:
                         file_extensions[ext] = file_extensions.get(ext, 0) + 1
 
+                # Use the most common stem (usually they're all the same)
+                original_stem = max(file_stems, key=file_stems.get) if file_stems else file_name
                 origin_file_extension = max(file_extensions, key=file_extensions.get) if file_extensions else ''
 
                 meta_data = {
-                    'origin_file_stem': file_name,
+                    'origin_file_stem': original_stem,
                     'origin_file_extension': origin_file_extension,
                     'created_at': datetime.now().isoformat(),
                 }
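
A standalone illustration (not SDK code) of the stem-based grouping rule this strategy now documents: files from each spec directory are keyed by Path.stem, and a data unit is created only when every required spec has a file with that stem, with optional files riding along when their stem matches. The spec names and file names below are hypothetical.

    from pathlib import Path

    files_by_spec = {
        'image': [Path('image/0001.jpg'), Path('image/0002.jpg')],  # required
        'label': [Path('label/0001.json')],                         # required
        'meta': [Path('meta/0001.txt')],                            # optional
    }
    required = {'image', 'label'}

    # Group every file by its stem, then keep only groups that cover all required specs.
    groups = {}
    for spec, paths in files_by_spec.items():
        for p in paths:
            groups.setdefault(p.stem, {})[spec] = p

    data_units = [g for g in groups.values() if required.issubset(g)]
    # '0001' yields a data unit (image + label + meta); '0002' is dropped because no label matches.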