synapse-sdk 1.0.0a86__py3-none-any.whl → 1.0.0a88__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synapse-sdk might be problematic.
- synapse_sdk/plugins/categories/data_validation/templates/plugin/validation.py +7 -3
- synapse_sdk/plugins/categories/upload/actions/upload.py +550 -96
- synapse_sdk/plugins/categories/upload/templates/config.yaml +0 -2
- synapse_sdk/plugins/templates/plugin-config-schema.json +0 -10
- synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/requirements.txt +1 -1
- {synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/METADATA +1 -1
- {synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/RECORD +11 -11
- {synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/WHEEL +0 -0
- {synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/entry_points.txt +0 -0
- {synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/licenses/LICENSE +0 -0
- {synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/top_level.txt +0 -0
synapse_sdk/plugins/categories/data_validation/templates/plugin/validation.py

@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional
 from synapse_sdk.plugins.categories.data_validation.actions.validation import ValidationDataStatus, ValidationResult
 
 
-def validate(data: Optional[Dict[str, Any]] = None, **kwargs: Any) ->
+def validate(data: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Dict[str, Any]:
     """Validate data with assignment data.
 
     * Custom validation logic can be added here.
@@ -15,7 +15,7 @@ def validate(data: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Validation
         **kwargs: Additional arguments.
 
     Returns:
-
+        Dict[str, Any]: The validation result as a dictionary with ValidationResult structure.
     """
     errors: List[str] = []
 
@@ -26,4 +26,8 @@ def validate(data: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Validation
     # Determine status based on errors
     status = ValidationDataStatus.FAILED if errors else ValidationDataStatus.SUCCESS
 
-
+    # DO NOT MODIFY BELOW THIS LINE - Validation result should be returned as a dumped ValidationResult.
+    validation_result = ValidationResult(status=status, errors=errors)
+    result_dict = validation_result.model_dump()
+    result_dict['status'] = status.value
+    return result_dict
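For illustration only (not part of the diff): assuming ValidationResult carries at least status and errors, the template now returns a plain dict instead of a ValidationResult instance, with the status flattened to the enum's value:

    result = validate({'label': 'car'})
    # e.g. {'status': 'success', 'errors': []}
    # The exact status string and any extra keys depend on ValidationDataStatus and the ValidationResult model.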
synapse_sdk/plugins/categories/upload/actions/upload.py

@@ -1,9 +1,13 @@
 import json
+import os
 from datetime import datetime
 from enum import Enum
+from io import BytesIO
 from pathlib import Path
-from typing import Annotated, Dict, List
+from typing import Annotated, Any, Dict, List, Optional
 
+from openpyxl import load_workbook
+from openpyxl.utils.exceptions import InvalidFileException
 from pydantic import AfterValidator, BaseModel, field_validator
 from pydantic_core import PydanticCustomError
 
@@ -39,6 +43,7 @@ class UploadRun(Run):
         data_unit_id: int | None
         status: UploadStatus
         created: str
+        data_unit_meta: dict | None
 
     class TaskLog(BaseModel):
         """Task log model."""
@@ -59,26 +64,29 @@ class UploadRun(Run):
 
         Args:
             data_file_info (dict): The json info of the data file.
-
-            status (DataUnitStatus): The status of the data unit.
+            status (UploadStatus): The status of the data file.
         """
         now = datetime.now().isoformat()
+        data_file_info_str = json.dumps(data_file_info, ensure_ascii=False)
         self.log(
             'upload_data_file',
-            self.DataFileLog(data_file_info=
+            self.DataFileLog(data_file_info=data_file_info_str, status=status.value, created=now).model_dump(),
         )
 
-    def log_data_unit(self, data_unit_id: int, status: UploadStatus):
+    def log_data_unit(self, data_unit_id: int, status: UploadStatus, data_unit_meta: dict | None = None):
         """Upload data_unit log.
 
         Args:
             data_unit_id (int): The ID of the data unit.
-            status (
+            status (UploadStatus): The status of the data unit.
+            data_unit_meta (dict | None): The metadata of the data unit.
         """
         now = datetime.now().isoformat()
         self.log(
             'upload_data_unit',
-            self.DataUnitLog(
+            self.DataUnitLog(
+                data_unit_id=data_unit_id, status=status.value, created=now, data_unit_meta=data_unit_meta
+            ).model_dump(),
         )
 
     def log_task(self, task_id: int, status: UploadStatus):
@@ -101,6 +109,78 @@ class UploadRun(Run):
         self.set_metrics(value=record.model_dump(), category=category)
 
 
+class ExcelSecurityError(Exception):
+    """Custom exception for Excel security validation errors."""
+
+    pass
+
+
+class ExcelParsingError(Exception):
+    """Custom exception for Excel parsing errors."""
+
+    pass
+
+
+class ExcelSecurityConfig:
+    """Configuration class for Excel security settings."""
+
+    def __init__(self):
+        # File size limits
+        self.MAX_FILE_SIZE_MB = int(os.getenv('EXCEL_MAX_FILE_SIZE_MB', '10'))
+        self.MAX_FILE_SIZE_BYTES = self.MAX_FILE_SIZE_MB * 1024 * 1024
+
+        # Memory limits
+        self.MAX_MEMORY_USAGE_MB = int(os.getenv('EXCEL_MAX_MEMORY_MB', '30'))
+        self.MAX_MEMORY_USAGE_BYTES = self.MAX_MEMORY_USAGE_MB * 1024 * 1024
+
+        # Content limits
+        self.MAX_ROWS = int(os.getenv('EXCEL_MAX_ROWS', '10000'))
+        self.MAX_COLUMNS = int(os.getenv('EXCEL_MAX_COLUMNS', '50'))
+
+        # String length limits
+        self.MAX_FILENAME_LENGTH = int(os.getenv('EXCEL_MAX_FILENAME_LENGTH', '255'))
+        self.MAX_COLUMN_NAME_LENGTH = int(os.getenv('EXCEL_MAX_COLUMN_NAME_LENGTH', '100'))
+        self.MAX_METADATA_VALUE_LENGTH = int(os.getenv('EXCEL_MAX_METADATA_VALUE_LENGTH', '1000'))
+
+
+class ExcelMetadataUtils:
+    """Utility class for Excel metadata processing with shared validation logic."""
+
+    def __init__(self, config: ExcelSecurityConfig):
+        self.config = config
+
+    def validate_and_truncate_string(self, value: str, max_length: int) -> str:
+        """Validate and truncate string to specified maximum length.
+
+        Args:
+            value: String value to validate and truncate
+            max_length: Maximum allowed length
+
+        Returns:
+            str: Validated and potentially truncated string
+        """
+        if not isinstance(value, str):
+            value = str(value)
+
+        value = value.strip()
+
+        if len(value) > max_length:
+            return value[:max_length]
+
+        return value
+
+    def is_valid_filename_length(self, filename: str) -> bool:
+        """Check if filename length is within acceptable limits.
+
+        Args:
+            filename: Filename to check
+
+        Returns:
+            bool: True if filename length is acceptable
+        """
+        return len(filename.strip()) <= self.config.MAX_FILENAME_LENGTH
+
+
 class UploadParams(BaseModel):
     """Upload action parameters.
 
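As an illustration (not part of the diff): the limits above are read from environment variables with the defaults shown, so a deployment can tighten them without code changes. A minimal sketch:

    import os

    os.environ['EXCEL_MAX_FILE_SIZE_MB'] = '5'   # tighten the default 10 MB cap
    os.environ['EXCEL_MAX_ROWS'] = '2000'

    config = ExcelSecurityConfig()
    utils = ExcelMetadataUtils(config)
    utils.is_valid_filename_length('image_001.jpg')                                            # True (<= 255 chars)
    utils.validate_and_truncate_string('  padded value  ', config.MAX_METADATA_VALUE_LENGTH)   # 'padded value'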
@@ -112,6 +192,8 @@ class UploadParams(BaseModel):
         storage (int): The storage of the action.
         collection (int): The collection of the action.
         project (int | None): The project of the action.
+        use_excel_metadata (bool): Whether to use excel file for additional metadata.
+        excel_metadata_path (str | None): Path to excel file containing metadata.
     """
 
     name: Annotated[str, AfterValidator(non_blank)]
@@ -120,8 +202,8 @@ class UploadParams(BaseModel):
     storage: int
     collection: int
     project: int | None
-
-
+    use_excel_metadata: bool = False
+    excel_metadata_path: str | None = None
 
     @field_validator('storage', mode='before')
     @classmethod
@@ -166,6 +248,74 @@ class UploadParams(BaseModel):
             raise PydanticCustomError('client_error', _('Error occurred while checking project exists.'))
         return value
 
+    @field_validator('excel_metadata_path', mode='before')
+    @classmethod
+    def check_excel_metadata_path(cls, value: str, info) -> str:
+        """Validate excel metadata file exists and is secure if use_excel_metadata is True.
+
+        This validator performs comprehensive security checks including:
+        - File existence and format validation
+        - File size limits (max 10MB)
+        - Basic security checks for file content
+
+        Args:
+            value: The excel file path to validate
+            info: Validation context information
+
+        Returns:
+            str: The validated file path
+
+        Raises:
+            PydanticCustomError: If validation fails
+        """
+        if not value:
+            return value
+
+        # Check if use_excel_metadata is True
+        data = info.data
+        if data.get('use_excel_metadata', False):
+            excel_path = Path(value)
+
+            # Check file existence
+            if not excel_path.exists():
+                raise PydanticCustomError('file_not_found', _('Excel metadata file not found.'))
+
+            # Check file extension
+            if excel_path.suffix.lower() not in ['.xlsx', '.xls']:
+                raise PydanticCustomError('invalid_file_type', _('Excel metadata file must be .xlsx or .xls format.'))
+
+            # Security check: file size limit
+            file_size = excel_path.stat().st_size
+            excel_config = ExcelSecurityConfig()
+            if file_size > excel_config.MAX_FILE_SIZE_BYTES:
+                raise PydanticCustomError(
+                    'file_too_large',
+                    _('Excel metadata file is too large. Maximum size is {}MB.').format(excel_config.MAX_FILE_SIZE_MB),
+                )
+
+            # Basic security check: ensure file is readable and not corrupted
+            try:
+                with open(excel_path, 'rb') as f:
+                    # Read first few bytes to check if it's a valid Excel file
+                    header = f.read(8)
+                    if not header:
+                        raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be empty.'))
+
+                    # Check for valid Excel file signatures
+                    if excel_path.suffix.lower() == '.xlsx':
+                        # XLSX files start with PK (ZIP signature)
+                        if not header.startswith(b'PK'):
+                            raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be corrupted.'))
+                    elif excel_path.suffix.lower() == '.xls':
+                        # XLS files have specific OLE signatures
+                        if not (header.startswith(b'\xd0\xcf\x11\xe0') or header.startswith(b'\x09\x08')):
+                            raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be corrupted.'))
+
+            except (OSError, IOError):
+                raise PydanticCustomError('file_access_error', _('Cannot access Excel metadata file.'))
+
+        return value
+
 
 @register_action
 class UploadAction(Action):
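As context (an illustrative sketch, not from the diff): the corruption check above relies on well-known magic bytes. An .xlsx workbook is a ZIP container that begins with PK, while a legacy .xls file carries an OLE2 signature. A standalone version of that check could look like:

    from pathlib import Path

    def looks_like_excel(path: Path) -> bool:
        """Rough magic-byte check mirroring the validator's logic (illustration only)."""
        header = path.read_bytes()[:8]
        suffix = path.suffix.lower()
        if suffix == '.xlsx':
            return header.startswith(b'PK')  # ZIP container signature
        if suffix == '.xls':
            return header.startswith(b'\xd0\xcf\x11\xe0') or header.startswith(b'\x09\x08')  # OLE2 signatures
        return False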
@@ -194,13 +344,13 @@ class UploadAction(Action):
     run_class = UploadRun
     progress_categories = {
         'analyze_collection': {
-            'proportion':
+            'proportion': 2,
         },
         'upload_data_files': {
-            'proportion':
+            'proportion': 38,
         },
         'generate_data_units': {
-            'proportion':
+            'proportion': 60,
         },
     }
     metrics_categories = {
@@ -216,30 +366,283 @@ class UploadAction(Action):
         },
     }
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.excel_config = ExcelSecurityConfig()
+        self.excel_utils = ExcelMetadataUtils(self.excel_config)
+
     def get_uploader(self, path, file_specification, organized_files):
         """Get uploader from entrypoint."""
         return self.entrypoint(self.run, path, file_specification, organized_files)
 
-    def
+    def _validate_excel_security(self, excel_path: Path) -> None:
+        """Validate Excel file security constraints.
+
+        Performs comprehensive security validation including:
+        - File size limits
+        - Memory usage constraints
+        - Basic malicious content detection
+
+        Args:
+            excel_path: Path to the Excel file to validate
+
+        Raises:
+            ExcelSecurityError: If security validation fails
+        """
+        # File size check (already done in validator, but double-check)
+        file_size = excel_path.stat().st_size
+        if file_size > self.excel_config.MAX_FILE_SIZE_BYTES:
+            raise ExcelSecurityError(
+                f'Excel file too large: {file_size} bytes (max: {self.excel_config.MAX_FILE_SIZE_BYTES})'
+            )
+
+        # Memory usage estimation (rough estimate: file_size * 3 for processing)
+        estimated_memory = file_size * 3
+        if estimated_memory > self.excel_config.MAX_MEMORY_USAGE_BYTES:
+            raise ExcelSecurityError(
+                f'Excel file may consume too much memory: ~{estimated_memory} bytes '
+                f'(max: {self.excel_config.MAX_MEMORY_USAGE_BYTES})'
+            )
+
+    def _prepare_excel_file(self, excel_path: Path) -> BytesIO:
+        """Prepare Excel file for reading with security validation.
+
+        Args:
+            excel_path: Path to the Excel file
+
+        Returns:
+            BytesIO: Excel file stream ready for reading
+
+        Raises:
+            ExcelSecurityError: If security validation fails
+        """
+        self._validate_excel_security(excel_path)
+        excel_bytes = excel_path.read_bytes()
+        return BytesIO(excel_bytes)
+
+    def _process_excel_headers(self, headers: tuple) -> tuple:
+        """Process and validate Excel headers.
+
+        Args:
+            headers: Raw header tuple from Excel
+
+        Returns:
+            tuple: Validated headers
+
+        Raises:
+            ExcelParsingError: If headers are invalid
+        """
+        if len(headers) < 2:
+            raise ExcelParsingError('Excel file must have at least 2 columns (file name and metadata)')
+        self._validate_excel_content(headers, 0)  # Validate column count
+        return headers
+
+    def _process_excel_data_row(self, row: tuple, headers: tuple) -> Optional[Dict[str, Any]]:
+        """Process a single Excel data row.
+
+        Args:
+            row: Raw row data from Excel
+            headers: Excel headers
+
+        Returns:
+            Optional[Dict[str, Any]]: Processed row data or None if row should be skipped
+        """
+        # Skip empty rows
+        if not row[0] or str(row[0]).strip() == '':
+            return None
+
+        file_name = str(row[0]).strip()
+        if not self.excel_utils.is_valid_filename_length(file_name):
+            self.run.log_message(
+                f'Skipping file with overly long name: {file_name[:50]}...', context=Context.WARNING.value
+            )
+            return None
+
+        # Create metadata dictionary from remaining columns
+        file_metadata: Dict[str, Any] = {}
+        for i, value in enumerate(row[1:], start=1):
+            if value is not None and i < len(headers):
+                header_value = headers[i]
+                column_name = str(header_value).strip() if header_value is not None else f'column_{i}'
+
+                # Validate and truncate column name and value
+                column_name = self.excel_utils.validate_and_truncate_string(
+                    column_name, self.excel_config.MAX_COLUMN_NAME_LENGTH
+                )
+                str_value = self.excel_utils.validate_and_truncate_string(
+                    str(value), self.excel_config.MAX_METADATA_VALUE_LENGTH
+                )
+                file_metadata[column_name] = str_value
+
+        return {file_name: file_metadata} if file_metadata else None
+
+    def _process_excel_worksheet(self, worksheet) -> Dict[str, Dict[str, Any]]:
+        """Process Excel worksheet and extract metadata.
+
+        Args:
+            worksheet: openpyxl worksheet object
+
+        Returns:
+            Dict[str, Dict[str, Any]]: Extracted metadata dictionary
+
+        Raises:
+            ExcelParsingError: If worksheet processing fails
+        """
+        if worksheet is None:
+            raise ExcelParsingError('Excel file has no active worksheet')
+
+        metadata_dict: Dict[str, Dict[str, Any]] = {}
+        headers: Optional[tuple] = None
+        data_row_count = 0
+        validation_interval = getattr(self.excel_config, 'VALIDATION_CHECK_INTERVAL', 1000)
+
+        # Process rows one by one for memory efficiency
+        for row_idx, row in enumerate(worksheet.iter_rows(values_only=True)):
+            if not row or all(cell is None or str(cell).strip() == '' for cell in row):
+                continue  # Skip completely empty rows
+
+            if row_idx == 0:  # Header row
+                headers = self._process_excel_headers(row)
+                continue
+
+            # Data rows
+            if headers is None:
+                raise ExcelParsingError('Excel file missing header row')
+
+            data_row_count += 1
+
+            # Validate row count periodically
+            if data_row_count % validation_interval == 0:
+                self._validate_excel_content(headers, data_row_count)
+
+            # Process individual row
+            row_result = self._process_excel_data_row(row, headers)
+            if row_result:
+                metadata_dict.update(row_result)
+
+        # Final validation
+        self._validate_excel_content(headers or (), data_row_count)
+
+        return metadata_dict
+
+    def _validate_excel_content(self, headers: tuple, row_count: int) -> None:
+        """Validate Excel content constraints.
+
+        Args:
+            headers: Tuple of header values from the first row
+            row_count: Total number of data rows processed
+
+        Raises:
+            ExcelParsingError: If content validation fails
+        """
+        # Limit number of columns to prevent memory exhaustion
+        if len(headers) > self.excel_config.MAX_COLUMNS:
+            raise ExcelParsingError(f'Too many columns: {len(headers)} (max: {self.excel_config.MAX_COLUMNS})')
+
+        # Limit number of rows to prevent excessive processing
+        if row_count > self.excel_config.MAX_ROWS:
+            raise ExcelParsingError(f'Too many rows: {row_count} (max: {self.excel_config.MAX_ROWS})')
+
+    def _read_excel_metadata(self, pathlib_cwd: Path) -> Dict[str, Dict[str, Any]]:
+        """Read metadata from excel file with comprehensive security validation.
+
+        This method orchestrates the Excel metadata reading process by delegating
+        to specialized methods for each step.
+
+        Args:
+            pathlib_cwd (Path): The pathlib object representing the current working directory.
+
+        Returns:
+            Dict[str, Dict[str, Any]]: Dictionary mapping file names to their metadata key-value pairs.
+                Empty dict if no metadata is configured or if reading fails.
+
+        Raises:
+            ExcelSecurityError: If security validation fails
+            ExcelParsingError: If Excel content is invalid or exceeds limits
+        """
+        if not self.params.get('use_excel_metadata', False) or not self.params.get('excel_metadata_path'):
+            return {}
+
+        excel_path = pathlib_cwd / self.params['excel_metadata_path']
+        if not excel_path.exists():
+            self.run.log_message(f'Excel metadata file not found: {excel_path}', context=Context.WARNING.value)
+            return {}
+
+        try:
+            # Prepare Excel file with security validation
+            excel_stream = self._prepare_excel_file(excel_path)
+
+            # Load and process workbook
+            workbook = load_workbook(excel_stream, read_only=True, data_only=True)
+            try:
+                return self._process_excel_worksheet(workbook.active)
+            finally:
+                workbook.close()
+
+        except ExcelSecurityError as e:
+            self.run.log_message(f'Excel security validation failed: {str(e)}', context=Context.ERROR.value)
+            raise
+        except ExcelParsingError as e:
+            self.run.log_message(f'Excel parsing failed: {str(e)}', context=Context.ERROR.value)
+            raise
+        except InvalidFileException as e:
+            self.run.log_message(f'Invalid Excel file format: {str(e)}', context=Context.ERROR.value)
+            raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
+        except MemoryError:
+            self.run.log_message('Excel file too large to process (memory limit exceeded)', context=Context.ERROR.value)
+            raise ExcelSecurityError('Excel file exceeds memory limits')
+        except (OSError, IOError) as e:
+            self.run.log_message(f'File access error reading excel metadata: {str(e)}', context=Context.ERROR.value)
+            raise ExcelParsingError(f'File access error: {str(e)}')
+        except Exception as e:
+            self.run.log_message(f'Unexpected error reading excel metadata: {str(e)}', context=Context.ERROR.value)
+            raise ExcelParsingError(f'Unexpected error: {str(e)}')
+
|
+
def start(self) -> Dict[str, Any]:
|
|
224
602
|
"""Start upload process.
|
|
225
603
|
|
|
226
604
|
Returns:
|
|
227
605
|
Dict: The result of the upload process.
|
|
228
606
|
"""
|
|
607
|
+
# Setup result dict early for error handling
|
|
608
|
+
result: Dict[str, Any] = {}
|
|
609
|
+
|
|
229
610
|
# Setup path object with path and storage.
|
|
230
611
|
storage = self.client.get_storage(self.params['storage'])
|
|
231
612
|
pathlib_cwd = get_pathlib(storage, self.params['path'])
|
|
232
613
|
|
|
614
|
+
# Read excel metadata if configured
|
|
615
|
+
excel_metadata: Dict[str, Dict[str, Any]] = {}
|
|
616
|
+
try:
|
|
617
|
+
excel_metadata = self._read_excel_metadata(pathlib_cwd)
|
|
618
|
+
if excel_metadata:
|
|
619
|
+
self.run.log_message(f'Excel metadata loaded for {len(excel_metadata)} files')
|
|
620
|
+
elif self.params.get('use_excel_metadata', False):
|
|
621
|
+
self.run.log_message('Excel metadata enabled but no entries found')
|
|
622
|
+
# Don't log anything if Excel metadata is not being used
|
|
623
|
+
except ExcelSecurityError as e:
|
|
624
|
+
# Security violations should stop the process entirely
|
|
625
|
+
self.run.log_message(f'Excel security validation failed: {str(e)}', context=Context.ERROR.value)
|
|
626
|
+
self.run.log_message('Upload aborted due to Excel security concerns.', context=Context.ERROR.value)
|
|
627
|
+
return result
|
|
628
|
+
except ExcelParsingError as e:
|
|
629
|
+
# Parsing errors can be non-critical if Excel metadata is optional
|
|
630
|
+
if self.params.get('use_excel_metadata', False):
|
|
631
|
+
self.run.log_message(f'Excel parsing failed: {str(e)}', context=Context.ERROR.value)
|
|
632
|
+
self.run.log_message('Upload aborted due to Excel parsing failure.', context=Context.ERROR.value)
|
|
633
|
+
return result
|
|
634
|
+
else:
|
|
635
|
+
# If Excel metadata is not explicitly enabled, treat as warning and continue
|
|
636
|
+
self.run.log_message(f'Excel parsing failed (continuing): {str(e)}', context=Context.WARNING.value)
|
|
637
|
+
excel_metadata = {}
|
|
638
|
+
|
|
233
639
|
# Analyze Collection file specifications to determine the data structure for upload.
|
|
234
640
|
file_specification_template = self._analyze_collection()
|
|
235
|
-
organized_files = self._organize_files(pathlib_cwd, file_specification_template)
|
|
641
|
+
organized_files = self._organize_files(pathlib_cwd, file_specification_template, excel_metadata)
|
|
236
642
|
|
|
237
643
|
# Initialize uploader.
|
|
238
644
|
uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files)
|
|
239
645
|
|
|
240
|
-
# Setup result dict.
|
|
241
|
-
result = {}
|
|
242
|
-
|
|
243
646
|
# Get organized files from the uploader (plugin developer's custom implementation)
|
|
244
647
|
# or use the default organization method if uploader doesn't provide valid files
|
|
245
648
|
organized_files = uploader.handle_upload_files()
|
|
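For illustration only (the sheet layout is inferred from the parsing code above, and the header and file names here are hypothetical): the metadata workbook is read row by row, the first row is treated as headers, column A holds the file name that is later matched against each dataset, and the remaining columns become metadata keys. A workbook in that shape could be produced with openpyxl:

    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    ws.append(['file_name', 'scene', 'weather'])                 # header row; the column A header itself is not used
    ws.append(['S_DCH_230725_0156_FC_037', 'highway', 'rain'])   # one data row per file
    ws.append(['S_DCH_230725_0156_LF_037', 'urban', 'clear'])
    wb.save('metadata.xlsx')

    # _read_excel_metadata would then yield roughly:
    # {'S_DCH_230725_0156_FC_037': {'scene': 'highway', 'weather': 'rain'},
    #  'S_DCH_230725_0156_LF_037': {'scene': 'urban', 'weather': 'clear'}}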
@@ -247,37 +650,35 @@ class UploadAction(Action):
         # Validate the organized files
         if not self._validate_organized_files(organized_files, file_specification_template):
             self.run.log_message('Validation failed.', context=Context.ERROR.value)
-            self.run.
+            self.run.log_message('Upload is aborted due to validation errors.', context=Context.ERROR.value)
             return result
 
         # Upload files to synapse-backend.
-
-        if not organized_files_count:
+        if not organized_files:
             self.run.log_message('Files not found on the path.', context=Context.WARNING.value)
-            self.run.
+            self.run.log_message('Upload is aborted due to missing files.', context=Context.ERROR.value)
             return result
-        uploaded_files = self._upload_files(organized_files
+        uploaded_files = self._upload_files(organized_files)
         result['uploaded_files_count'] = len(uploaded_files)
 
         # Generate data units for the uploaded data.
-
-        if not upload_result_count:
+        if not uploaded_files:
             self.run.log_message('No files were uploaded.', context=Context.WARNING.value)
-            self.run.
+            self.run.log_message('Upload is aborted due to no uploaded files.', context=Context.ERROR.value)
             return result
-        generated_data_units = self._generate_data_units(uploaded_files
+        generated_data_units = self._generate_data_units(uploaded_files)
         result['generated_data_units_count'] = len(generated_data_units)
 
         # Setup task with uploaded synapse-backend data units.
-        if not
+        if not generated_data_units:
             self.run.log_message('No data units were generated.', context=Context.WARNING.value)
-            self.run.
+            self.run.log_message('Upload is aborted due to no generated data units.', context=Context.ERROR.value)
             return result
 
-        self.run.
+        self.run.log_message('Import completed.')
         return result
 
-    def _analyze_collection(self) -> Dict:
+    def _analyze_collection(self) -> Dict[str, Any]:
         """Analyze Synapse Collection Specifications.
 
         Returns:
@@ -285,50 +686,60 @@ class UploadAction(Action):
         """
 
         # Initialize progress
-        self.run.set_progress(0,
+        self.run.set_progress(0, 2, category='analyze_collection')
 
-        client = self.run.client
         collection_id = self.params['data_collection']
-
+        self.run.set_progress(1, 2, category='analyze_collection')
 
-
-        self.run.set_progress(
+        collection = self.run.client.get_data_collection(collection_id)
+        self.run.set_progress(2, 2, category='analyze_collection')
+        self.run.log_message('Collection analysis completed.')
 
         return collection['file_specifications']
 
-    def _upload_files(self, organized_files,
+    def _upload_files(self, organized_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Upload files to synapse-backend.
 
         Returns:
             Dict: The result of the upload.
         """
         # Initialize progress
+        organized_files_count = len(organized_files)
         self.run.set_progress(0, organized_files_count, category='upload_data_files')
         self.run.log_message('Uploading data files...')
-        data_file_metrics_record = self.run.MetricsRecord(stand_by=organized_files_count, success=0, failed=0)
 
         client = self.run.client
         collection_id = self.params['data_collection']
         upload_result = []
-        organized_files_count = len(organized_files)
         current_progress = 0
+        success_count = 0
+        failed_count = 0
+
+        # Initialize metrics
+        self._update_metrics(organized_files_count, success_count, failed_count, 'data_file')
+
         for organized_file in organized_files:
-
-
-
-
-
-
-
+            try:
+                uploaded_data_file = client.upload_data_file(organized_file, collection_id)
+                self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
+                success_count += 1
+                upload_result.append(uploaded_data_file)
+            except Exception as e:
+                self.run.log_data_file(organized_file, UploadStatus.FAILED)
+                self.run.log_message(f'Failed to upload file: {str(e)}')
+                failed_count += 1
+
             current_progress += 1
+            self._update_metrics(organized_files_count, success_count, failed_count, 'data_file')
+            self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')
 
         # Finish progress
         self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
-        self.run.log_message('Upload data files completed.')
+        self.run.log_message(f'Upload data files completed. Success: {success_count}, Failed: {failed_count}')
 
         return upload_result
 
-    def _generate_data_units(self, uploaded_files: List,
+    def _generate_data_units(self, uploaded_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Generate data units for the uploaded data.
 
         TODO: make batch size configurable.
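As an aside (illustration only): the per-item metrics maintain a simple invariant, stand_by + success + failed equals the total, which is how the _update_metrics helper introduced later in this diff derives its stand_by figure:

    total, success, failed = 10, 4, 1
    stand_by = total - success - failed   # 5, as passed to MetricsRecord(stand_by=..., success=..., failed=...)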
@@ -337,76 +748,93 @@ class UploadAction(Action):
             Dict: The result of the generate data units process.
         """
         # Initialize progress
+        upload_result_count = len(uploaded_files)
         self.run.set_progress(0, upload_result_count, category='generate_data_units')
-
+        self.run.log_message('Generating data units...')
 
         client = self.run.client
-
         generated_data_units = []
         current_progress = 0
+        success_count = 0
+        failed_count = 0
+
         batches = get_batched_list(uploaded_files, 100)
         batches_count = len(batches)
+
+        # Initialize metrics
+        self._update_metrics(upload_result_count, success_count, failed_count, 'data_unit')
+
         for batch in batches:
-
-
-
-
-
-
+            try:
+                created_data_units = client.create_data_units(batch)
+                success_count += len(created_data_units)
+                generated_data_units.append(created_data_units)
+                for created_data_unit in created_data_units:
+                    self.run.log_data_unit(
+                        created_data_unit['id'], UploadStatus.SUCCESS, data_unit_meta=created_data_unit.get('meta')
+                    )
+            except Exception as e:
+                failed_count += len(batch)
+                self.run.log_message(f'Failed to create data units batch: {str(e)}')
+                for _ in batch:
+                    self.run.log_data_unit(None, UploadStatus.FAILED, data_unit_meta=None)
+
             current_progress += 1
-
-
+            self._update_metrics(upload_result_count, success_count, failed_count, 'data_unit')
+            self.run.set_progress(current_progress, batches_count, category='generate_data_units')
 
         # Finish progress
         self.run.set_progress(upload_result_count, upload_result_count, category='generate_data_units')
+        self.run.log_message(f'Data units generation completed. Success: {success_count}, Failed: {failed_count}')
 
         return sum(generated_data_units, [])
 
-    def _validate_organized_files(
+    def _validate_organized_files(
+        self, organized_files: List[Dict[str, Any]], file_specification_template: Dict[str, Any]
+    ) -> bool:
         """Validate organized files from Uploader."""
         validator = FileSpecificationValidator(file_specification_template, organized_files)
         return validator.validate()
 
-    def _organize_files(
+    def _organize_files(
+        self,
+        directory: Path,
+        file_specification: List[Dict[str, Any]],
+        excel_metadata: Optional[Dict[str, Dict[str, Any]]] = None,
+    ) -> List[Dict[str, Any]]:
         """Organize files according to the file specification.
+
         This method handles type-based directory structure where files are organized in
         directories named after file types (e.g., 'image_1/' directory contains image files
         like '1.jpg', '2.jpg'). For each dataset ID found in the primary directory, it attempts
         to find corresponding files in all type directories.
 
-        TODO
-        (e.g. pcd:S_DCH_230725_0156_LR_037.pcd, image_1:S_DCH_230725_0156_FC_037,
+        TODO: Add Logic to handle file specific name patterns and extensions.
+        (e.g. pcd:S_DCH_230725_0156_LR_037.pcd, image_1:S_DCH_230725_0156_FC_037,
+        image_2:S_DCH_230725_0156_LF_037.jpg)
+
         Args:
             directory (Path): Root directory containing files to organize.
-            file_specification (List): File specification list.
+            file_specification (List[Dict[str, Any]]): File specification list with metadata.
+            excel_metadata (Optional[Dict[str, Dict[str, Any]]]): Dictionary mapping file names
+                to their metadata key-value pairs from excel file.
+
         Returns:
-            List: List of dictionaries containing organized files.
+            List[Dict[str, Any]]: List of dictionaries containing organized files with metadata.
         """
-        organized_files = []
+        organized_files: List[Dict[str, Any]] = []
 
         # Check for type-based directory structure (e.g., image_1/, pcd_1/)
-        type_dirs = {}
-        type_extensions = {}  # Store common extensions for each type directory
+        type_dirs: Dict[str, Path] = {}
 
         for spec in file_specification:
            spec_name = spec['name']
-
            spec_dir = directory / spec_name
            if spec_dir.exists() and spec_dir.is_dir():
                type_dirs[spec_name] = spec_dir
 
-
-
-                for file_path in spec_dir.glob('*'):
-                    if file_path.is_file():
-                        ext = file_path.suffix.lower()
-                        extensions[ext] = extensions.get(ext, 0) + 1
-
-                # Find the most common extension
-                if extensions:
-                    common_ext = max(extensions.items(), key=lambda x: x[1])[0]
-                    type_extensions[spec_name] = common_ext
-                    self.run.log_message(f'Found type directory: {spec_name} (extension: {common_ext})')
+        if type_dirs:
+            self.run.log_message(f'Found type directories: {list(type_dirs.keys())}')
 
         # If type-based directories don't exist, exit early
         if not type_dirs:
@@ -415,10 +843,11 @@ class UploadAction(Action):
 
         self.run.log_message('Detected type-based directory structure')
 
-        #
-        dataset_files = {}
+        # Collect and process files in a single pass
+        dataset_files = {}
+        required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]
 
-        #
+        # Process all files from all type directories
        for spec_name, dir_path in type_dirs.items():
            for file_path in dir_path.glob('*'):
                if file_path.is_file():
@@ -428,45 +857,70 @@ class UploadAction(Action):
                     if file_name not in dataset_files:
                         dataset_files[file_name] = {}
 
-                    # Map this file to its specification
+                    # Map this file to its specification (handle duplicates)
                     if spec_name not in dataset_files[file_name]:
                         dataset_files[file_name][spec_name] = file_path
                     else:
-                        # If multiple files with same file_name for same spec, use most recent
                         existing_file = dataset_files[file_name][spec_name]
                         if file_path.stat().st_mtime > existing_file.stat().st_mtime:
                             dataset_files[file_name][spec_name] = file_path
-                            self.run.log_message(
-                                f"Found newer file for name of {file_name}, spec '{spec_name}': "
-                                f'{file_path.name} (replacing {existing_file.name})'
-                            )
 
         if not dataset_files:
             self.run.log_message('No files found.', context=Context.WARNING.value)
             return organized_files
 
-        self.run.log_message(f'
+        self.run.log_message(f'Discovered {len(dataset_files)} files')
 
-        #
+        # Organize datasets - check requirements and create metadata
         for file_name, files_dict in sorted(dataset_files.items()):
-            # Check if all required files are present
-            required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]
             if all(req in files_dict for req in required_specs):
+                # Get most common file extension
+                file_extensions = {}
+                for file_path in files_dict.values():
+                    ext = file_path.suffix.lower()
+                    if ext:
+                        file_extensions[ext] = file_extensions.get(ext, 0) + 1
+
+                origin_file_extension = max(file_extensions.items(), key=lambda x: x[1])[0] if file_extensions else ''
+
                 # Create metadata for this dataset
-                meta_data = {
+                meta_data: Dict[str, Any] = {
                     'origin_file_stem': file_name,
+                    'origin_file_extension': origin_file_extension,
                     'created_at': datetime.now().isoformat(),
                 }
 
+                # Add excel metadata if available
+                if excel_metadata and file_name in excel_metadata:
+                    meta_data.update(excel_metadata[file_name])
+
                 # Add the organized dataset
                 organized_files.append({'files': files_dict, 'meta': meta_data})
             else:
-                # Missing required files warning
                 missing = [req for req in required_specs if req not in files_dict]
                 self.run.log_message(
-                    f'Dataset ID {file_name}
+                    f'Dataset ID {file_name} missing required files: {", ".join(missing)}',
                     context=Context.WARNING.value,
                 )
 
-        self.run.log_message(f'Total datasets organized: {len(organized_files)}')
         return organized_files
+
+    def _update_metrics(self, total_count: int, success_count: int, failed_count: int, category: str):
+        """Update metrics for upload progress.
+
+        Args:
+            total_count (int): Total number of items to process.
+            success_count (int): Number of successfully processed items.
+            failed_count (int): Number of failed items.
+            category (str): The category of the metrics.
+        """
+        if not self.run:
+            raise ValueError('Run instance not properly initialized')
+
+        # Type assertion to help the linter
+        assert isinstance(self.run, UploadRun)
+
+        metrics = self.run.MetricsRecord(
+            stand_by=total_count - success_count - failed_count, failed=failed_count, success=success_count
+        )
+        self.run.log_metrics(metrics, category)
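As an illustration (directory and spec names here are hypothetical): _organize_files expects one sub-directory per file-specification name, with files matched across those directories by identical stem; per-type name patterns are still a TODO in the diff. A qualifying upload root and the resulting entry might look like:

    # upload_root/
    #   image_1/
    #     S_DCH_230725_0156_037.jpg
    #   image_2/
    #     S_DCH_230725_0156_037.jpg
    #   pcd/
    #     S_DCH_230725_0156_037.pcd
    #
    # Each stem with all required specs present becomes one organized entry, roughly:
    # {'files': {'image_1': Path('upload_root/image_1/S_DCH_230725_0156_037.jpg'),
    #            'image_2': Path('upload_root/image_2/S_DCH_230725_0156_037.jpg'),
    #            'pcd': Path('upload_root/pcd/S_DCH_230725_0156_037.pcd')},
    #  'meta': {'origin_file_stem': 'S_DCH_230725_0156_037',
    #           'origin_file_extension': '.jpg',        # most common extension among the matched files
    #           'created_at': '2025-01-01T00:00:00',
    #           # ...plus any Excel metadata recorded for that stem
    #          }}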
synapse_sdk/plugins/categories/upload/templates/config.yaml

@@ -2,8 +2,6 @@ actions:
   upload:
     entrypoint: plugin.upload.Uploader
     options:
-      allow_generate_tasks: false # Allow the plugin to generate tasks for the uploaded data
-      allow_generate_ground_truths: false # Allow the plugin to generate ground truths for the uploaded data
       supported_data_type: image # A primary data type of synapse backend collection. (e.g. 'image', 'text', 'video', 'pcd', 'audio')
       ui_schema: |
         Dumped FormKit Schema for upload plugin custom options
synapse_sdk/plugins/templates/plugin-config-schema.json

@@ -241,16 +241,6 @@
         "type": "boolean",
         "description": "Whether to enable visualization for training",
         "default": false
-      },
-      "allow_generate_tasks": {
-        "type": "boolean",
-        "description": "Allow the plugin to generate tasks for uploaded data",
-        "default": false
-      },
-      "allow_generate_ground_truths": {
-        "type": "boolean",
-        "description": "Allow the plugin to generate ground truths for uploaded data",
-        "default": false
       }
     }
   },
synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/requirements.txt

@@ -1 +1 @@
-synapse-sdk[all]
+synapse-sdk[all]

(Both sides show the same text; the change is likely whitespace-only, such as a trailing newline.)
{synapse_sdk-1.0.0a86.dist-info → synapse_sdk-1.0.0a88.dist-info}/RECORD

@@ -120,7 +120,7 @@ synapse_sdk/plugins/categories/data_validation/actions/__init__.py,sha256=47DEQp
 synapse_sdk/plugins/categories/data_validation/actions/validation.py,sha256=E6irjH3aylnS_nIlXVDdqwpkvMxlTwzmedYeXj5sgSk,2262
 synapse_sdk/plugins/categories/data_validation/templates/config.yaml,sha256=Hijb-b3hy0msZsTV_bbr3Hvlk8ok-Rk0a05mlLGTCAg,66
 synapse_sdk/plugins/categories/data_validation/templates/plugin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-synapse_sdk/plugins/categories/data_validation/templates/plugin/validation.py,sha256=
+synapse_sdk/plugins/categories/data_validation/templates/plugin/validation.py,sha256=O-5eFC-njZzt7tAZlIRAG2UT_zhMEYDrio1e2g4UjVY,1263
 synapse_sdk/plugins/categories/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 synapse_sdk/plugins/categories/export/enums.py,sha256=gtyngvQ1DKkos9iKGcbecwTVQQ6sDwbrBPSGPNb5Am0,127
 synapse_sdk/plugins/categories/export/actions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -165,12 +165,12 @@ synapse_sdk/plugins/categories/smart_tool/templates/plugin/__init__.py,sha256=47
 synapse_sdk/plugins/categories/smart_tool/templates/plugin/auto_label.py,sha256=eevNg0nOcYFR4z_L_R-sCvVOYoLWSAH1jwDkAf3YCjY,320
 synapse_sdk/plugins/categories/upload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 synapse_sdk/plugins/categories/upload/actions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-synapse_sdk/plugins/categories/upload/actions/upload.py,sha256=
-synapse_sdk/plugins/categories/upload/templates/config.yaml,sha256=
+synapse_sdk/plugins/categories/upload/actions/upload.py,sha256=Nhlq1aIvnhybxKhm7KlJoTGP3NO-JsE2Ya7Dgi7HVOg,37427
+synapse_sdk/plugins/categories/upload/templates/config.yaml,sha256=6_dRa0_J2aS8NSUfO4MKbPxZcdPS2FpJzzp51edYAZc,281
 synapse_sdk/plugins/categories/upload/templates/plugin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 synapse_sdk/plugins/categories/upload/templates/plugin/upload.py,sha256=IZU4sdSMSLKPCtlNqF7DP2howTdYR6hr74HCUZsGdPk,1559
 synapse_sdk/plugins/templates/cookiecutter.json,sha256=NxOWk9A_v1pO0Ny4IYT9Cj5iiJ16--cIQrGC67QdR0I,396
-synapse_sdk/plugins/templates/plugin-config-schema.json,sha256=
+synapse_sdk/plugins/templates/plugin-config-schema.json,sha256=_ff1dnHdWPcWjD1Wy8Wn0aT6TnMD5J5Sdms3RTSWCUs,11805
 synapse_sdk/plugins/templates/schema.json,sha256=E5bZjskN0mqQtk2eLAB1nfXufIoGvyrtkMMP-wGKSdc,14029
 synapse_sdk/plugins/templates/hooks/post_gen_project.py,sha256=jqlYkY1O2TxIR-Vh3gnwILYy8k-D39Xx66d2KNQVMCs,147
 synapse_sdk/plugins/templates/hooks/pre_prompt.py,sha256=aOAMM623s0sKFGjTZaotAOYFvsNMxeii4tPyhOAFKVE,539
@@ -179,7 +179,7 @@ synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/.pre-c
 synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/README.md,sha256=ETBZv_2Ocgzn4Fe3o5Y842mZiz00ABuAalrXpNVnWU0,56
 synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/config.yaml,sha256=tbX80vu-c_FyRJgPQXp8WoxJ2XhTYF8c1qv7SdHwddY,230
 synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/pyproject.toml,sha256=Usgd80tHZAD1Ug5MAjPfETUZxtKKgZW-xovFEAEbQDo,317
-synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/requirements.txt,sha256=
+synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/requirements.txt,sha256=TtWSjUr7lzkNdnFrLIEaC8ZMHAL3pEu602pcccLen0o,16
 synapse_sdk/plugins/templates/synapse-{{cookiecutter.plugin_code}}-plugin/plugin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 synapse_sdk/plugins/utils/__init__.py,sha256=h868T8mxZDgo746W5u0YUB_biHhPov0sr4usFjefYJw,988
 synapse_sdk/plugins/utils/actions.py,sha256=QjzISxUMyrcFtOphaYP8tdAkAAzOUZlfnMbVwR_zt2I,3736
@@ -221,9 +221,9 @@ synapse_sdk/utils/storage/providers/gcp.py,sha256=i2BQCu1Kej1If9SuNr2_lEyTcr5M_n
 synapse_sdk/utils/storage/providers/http.py,sha256=2DhIulND47JOnS5ZY7MZUex7Su3peAPksGo1Wwg07L4,5828
 synapse_sdk/utils/storage/providers/s3.py,sha256=ZmqekAvIgcQBdRU-QVJYv1Rlp6VHfXwtbtjTSphua94,2573
 synapse_sdk/utils/storage/providers/sftp.py,sha256=_8s9hf0JXIO21gvm-JVS00FbLsbtvly4c-ETLRax68A,1426
-synapse_sdk-1.0.
-synapse_sdk-1.0.
-synapse_sdk-1.0.
-synapse_sdk-1.0.
-synapse_sdk-1.0.
-synapse_sdk-1.0.
+synapse_sdk-1.0.0a88.dist-info/licenses/LICENSE,sha256=bKzmC5YAg4V1Fhl8OO_tqY8j62hgdncAkN7VrdjmrGk,1101
+synapse_sdk-1.0.0a88.dist-info/METADATA,sha256=z0WgvP2Fz3kFGvw7LheZC1_jFFIAIUFcQdMT18UL5EY,3805
+synapse_sdk-1.0.0a88.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+synapse_sdk-1.0.0a88.dist-info/entry_points.txt,sha256=VNptJoGoNJI8yLXfBmhgUefMsmGI0m3-0YoMvrOgbxo,48
+synapse_sdk-1.0.0a88.dist-info/top_level.txt,sha256=ytgJMRK1slVOKUpgcw3LEyHHP7S34J6n_gJzdkcSsw8,12
+synapse_sdk-1.0.0a88.dist-info/RECORD,,
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|