synapse-sdk 1.0.0b5__py3-none-any.whl → 2025.12.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synapse_sdk/__init__.py +24 -0
- synapse_sdk/cli/code_server.py +305 -33
- synapse_sdk/clients/agent/__init__.py +2 -1
- synapse_sdk/clients/agent/container.py +143 -0
- synapse_sdk/clients/agent/ray.py +296 -38
- synapse_sdk/clients/backend/annotation.py +1 -1
- synapse_sdk/clients/backend/core.py +31 -4
- synapse_sdk/clients/backend/data_collection.py +82 -7
- synapse_sdk/clients/backend/hitl.py +1 -1
- synapse_sdk/clients/backend/ml.py +1 -1
- synapse_sdk/clients/base.py +211 -61
- synapse_sdk/loggers.py +46 -0
- synapse_sdk/plugins/README.md +1340 -0
- synapse_sdk/plugins/categories/base.py +59 -9
- synapse_sdk/plugins/categories/export/actions/__init__.py +3 -0
- synapse_sdk/plugins/categories/export/actions/export/__init__.py +28 -0
- synapse_sdk/plugins/categories/export/actions/export/action.py +165 -0
- synapse_sdk/plugins/categories/export/actions/export/enums.py +113 -0
- synapse_sdk/plugins/categories/export/actions/export/exceptions.py +53 -0
- synapse_sdk/plugins/categories/export/actions/export/models.py +74 -0
- synapse_sdk/plugins/categories/export/actions/export/run.py +195 -0
- synapse_sdk/plugins/categories/export/actions/export/utils.py +187 -0
- synapse_sdk/plugins/categories/export/templates/config.yaml +19 -1
- synapse_sdk/plugins/categories/export/templates/plugin/__init__.py +390 -0
- synapse_sdk/plugins/categories/export/templates/plugin/export.py +153 -177
- synapse_sdk/plugins/categories/neural_net/actions/train.py +1130 -32
- synapse_sdk/plugins/categories/neural_net/actions/tune.py +157 -4
- synapse_sdk/plugins/categories/neural_net/templates/config.yaml +7 -4
- synapse_sdk/plugins/categories/pre_annotation/actions/__init__.py +4 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/__init__.py +3 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/action.py +10 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/__init__.py +28 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/action.py +148 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/enums.py +269 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/exceptions.py +14 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/factory.py +76 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/models.py +100 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/orchestrator.py +248 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/run.py +64 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/__init__.py +17 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/annotation.py +265 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/base.py +170 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/extraction.py +83 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/metrics.py +92 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/preprocessor.py +243 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/validation.py +143 -0
- synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +19 -0
- synapse_sdk/plugins/categories/upload/actions/upload/action.py +236 -0
- synapse_sdk/plugins/categories/upload/actions/upload/context.py +185 -0
- synapse_sdk/plugins/categories/upload/actions/upload/enums.py +493 -0
- synapse_sdk/plugins/categories/upload/actions/upload/exceptions.py +36 -0
- synapse_sdk/plugins/categories/upload/actions/upload/factory.py +138 -0
- synapse_sdk/plugins/categories/upload/actions/upload/models.py +214 -0
- synapse_sdk/plugins/categories/upload/actions/upload/orchestrator.py +183 -0
- synapse_sdk/plugins/categories/upload/actions/upload/registry.py +113 -0
- synapse_sdk/plugins/categories/upload/actions/upload/run.py +179 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/base.py +107 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/cleanup.py +62 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/collection.py +63 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/generate.py +91 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py +82 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py +235 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py +201 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py +104 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/validate.py +71 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/base.py +82 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/batch.py +39 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/single.py +29 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py +300 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py +287 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/excel.py +174 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/none.py +16 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/sync.py +84 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py +60 -0
- synapse_sdk/plugins/categories/upload/actions/upload/utils.py +250 -0
- synapse_sdk/plugins/categories/upload/templates/README.md +470 -0
- synapse_sdk/plugins/categories/upload/templates/config.yaml +28 -2
- synapse_sdk/plugins/categories/upload/templates/plugin/__init__.py +310 -0
- synapse_sdk/plugins/categories/upload/templates/plugin/upload.py +82 -20
- synapse_sdk/plugins/models.py +111 -9
- synapse_sdk/plugins/templates/plugin-config-schema.json +7 -0
- synapse_sdk/plugins/templates/schema.json +7 -0
- synapse_sdk/plugins/utils/__init__.py +3 -0
- synapse_sdk/plugins/utils/ray_gcs.py +66 -0
- synapse_sdk/shared/__init__.py +25 -0
- synapse_sdk/utils/converters/dm/__init__.py +42 -41
- synapse_sdk/utils/converters/dm/base.py +137 -0
- synapse_sdk/utils/converters/dm/from_v1.py +208 -562
- synapse_sdk/utils/converters/dm/to_v1.py +258 -304
- synapse_sdk/utils/converters/dm/tools/__init__.py +214 -0
- synapse_sdk/utils/converters/dm/tools/answer.py +95 -0
- synapse_sdk/utils/converters/dm/tools/bounding_box.py +132 -0
- synapse_sdk/utils/converters/dm/tools/bounding_box_3d.py +121 -0
- synapse_sdk/utils/converters/dm/tools/classification.py +75 -0
- synapse_sdk/utils/converters/dm/tools/keypoint.py +117 -0
- synapse_sdk/utils/converters/dm/tools/named_entity.py +111 -0
- synapse_sdk/utils/converters/dm/tools/polygon.py +122 -0
- synapse_sdk/utils/converters/dm/tools/polyline.py +124 -0
- synapse_sdk/utils/converters/dm/tools/prompt.py +94 -0
- synapse_sdk/utils/converters/dm/tools/relation.py +86 -0
- synapse_sdk/utils/converters/dm/tools/segmentation.py +141 -0
- synapse_sdk/utils/converters/dm/tools/segmentation_3d.py +83 -0
- synapse_sdk/utils/converters/dm/types.py +168 -0
- synapse_sdk/utils/converters/dm/utils.py +162 -0
- synapse_sdk/utils/converters/dm_legacy/__init__.py +56 -0
- synapse_sdk/utils/converters/dm_legacy/from_v1.py +627 -0
- synapse_sdk/utils/converters/dm_legacy/to_v1.py +367 -0
- synapse_sdk/utils/file/__init__.py +58 -0
- synapse_sdk/utils/file/archive.py +32 -0
- synapse_sdk/utils/file/checksum.py +56 -0
- synapse_sdk/utils/file/chunking.py +31 -0
- synapse_sdk/utils/file/download.py +385 -0
- synapse_sdk/utils/file/encoding.py +40 -0
- synapse_sdk/utils/file/io.py +22 -0
- synapse_sdk/utils/file/upload.py +165 -0
- synapse_sdk/utils/file/video/__init__.py +29 -0
- synapse_sdk/utils/file/video/transcode.py +307 -0
- synapse_sdk/utils/{file.py → file.py.backup} +77 -0
- synapse_sdk/utils/network.py +272 -0
- synapse_sdk/utils/storage/__init__.py +6 -2
- synapse_sdk/utils/storage/providers/file_system.py +6 -0
- {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/METADATA +19 -2
- {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/RECORD +134 -74
- synapse_sdk/devtools/docs/.gitignore +0 -20
- synapse_sdk/devtools/docs/README.md +0 -41
- synapse_sdk/devtools/docs/blog/2019-05-28-first-blog-post.md +0 -12
- synapse_sdk/devtools/docs/blog/2019-05-29-long-blog-post.md +0 -44
- synapse_sdk/devtools/docs/blog/2021-08-01-mdx-blog-post.mdx +0 -24
- synapse_sdk/devtools/docs/blog/2021-08-26-welcome/docusaurus-plushie-banner.jpeg +0 -0
- synapse_sdk/devtools/docs/blog/2021-08-26-welcome/index.md +0 -29
- synapse_sdk/devtools/docs/blog/authors.yml +0 -25
- synapse_sdk/devtools/docs/blog/tags.yml +0 -19
- synapse_sdk/devtools/docs/docusaurus.config.ts +0 -138
- synapse_sdk/devtools/docs/package-lock.json +0 -17455
- synapse_sdk/devtools/docs/package.json +0 -47
- synapse_sdk/devtools/docs/sidebars.ts +0 -44
- synapse_sdk/devtools/docs/src/components/HomepageFeatures/index.tsx +0 -71
- synapse_sdk/devtools/docs/src/components/HomepageFeatures/styles.module.css +0 -11
- synapse_sdk/devtools/docs/src/css/custom.css +0 -30
- synapse_sdk/devtools/docs/src/pages/index.module.css +0 -23
- synapse_sdk/devtools/docs/src/pages/index.tsx +0 -21
- synapse_sdk/devtools/docs/src/pages/markdown-page.md +0 -7
- synapse_sdk/devtools/docs/static/.nojekyll +0 -0
- synapse_sdk/devtools/docs/static/img/docusaurus-social-card.jpg +0 -0
- synapse_sdk/devtools/docs/static/img/docusaurus.png +0 -0
- synapse_sdk/devtools/docs/static/img/favicon.ico +0 -0
- synapse_sdk/devtools/docs/static/img/logo.png +0 -0
- synapse_sdk/devtools/docs/static/img/undraw_docusaurus_mountain.svg +0 -171
- synapse_sdk/devtools/docs/static/img/undraw_docusaurus_react.svg +0 -170
- synapse_sdk/devtools/docs/static/img/undraw_docusaurus_tree.svg +0 -40
- synapse_sdk/devtools/docs/tsconfig.json +0 -8
- synapse_sdk/plugins/categories/export/actions/export.py +0 -346
- synapse_sdk/plugins/categories/export/enums.py +0 -7
- synapse_sdk/plugins/categories/neural_net/actions/gradio.py +0 -151
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task.py +0 -943
- synapse_sdk/plugins/categories/upload/actions/upload.py +0 -954
- {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/WHEEL +0 -0
- {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/entry_points.txt +0 -0
- {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/licenses/LICENSE +0 -0
- {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
|
|
5
|
+
from openpyxl import load_workbook
|
|
6
|
+
from openpyxl.utils.exceptions import InvalidFileException
|
|
7
|
+
|
|
8
|
+
from ...exceptions import ExcelParsingError, ExcelSecurityError
|
|
9
|
+
from ...utils import ExcelMetadataUtils, ExcelSecurityConfig
|
|
10
|
+
from ..base import MetadataStrategy, ValidationResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ExcelMetadataStrategy(MetadataStrategy):
    """Excel metadata extraction strategy.

    Reads an .xlsx file whose first row is a header row and whose first
    column holds file names; every remaining column becomes a string
    metadata field keyed by its header.  All reads go through
    ``ExcelSecurityConfig`` / ``ExcelMetadataUtils`` limits so oversized
    or malformed workbooks are rejected instead of exhausting resources.
    """

    def __init__(self):
        # Limits (file size, rows, columns, string lengths) come from the
        # default ExcelSecurityConfig; utils enforce truncation/length rules.
        self.excel_config = ExcelSecurityConfig()
        self.excel_utils = ExcelMetadataUtils(self.excel_config)

    def extract(self, source_path: Path) -> Dict[str, Dict[str, Any]]:
        """Extract metadata from Excel file.

        Args:
            source_path: Path to the .xlsx workbook on disk.

        Returns:
            Mapping of file name (first column value) to a dict of
            column-name -> string value for that row.

        Raises:
            ExcelSecurityError: File exceeds size/memory limits.
            ExcelParsingError: Workbook is invalid, unreadable, or violates
                content constraints (header/row/column rules).
        """
        try:
            # Whole file is read into memory only after the security check
            # in _prepare_excel_file passes.
            excel_stream = self._prepare_excel_file(source_path)
            # read_only streams rows; data_only yields cached formula values.
            workbook = load_workbook(excel_stream, read_only=True, data_only=True)
            try:
                return self._process_excel_worksheet(workbook.active)
            finally:
                # read_only workbooks keep the underlying stream open; always close.
                workbook.close()

        # Our own exceptions pass through untranslated (order matters: they
        # must be re-raised before the broad handlers below).
        except ExcelSecurityError:
            raise
        except ExcelParsingError:
            raise
        except InvalidFileException as e:
            raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
        except MemoryError:
            raise ExcelSecurityError('Excel file exceeds memory limits')
        except (OSError, IOError) as e:
            raise ExcelParsingError(f'File access error: {str(e)}')
        except Exception as e:
            # Handle ZIP file errors (Excel files are ZIP archives)
            # zipfile.BadZipFile messages contain "zip file"; matched by text
            # to avoid importing zipfile here.
            if 'zip file' in str(e).lower():
                raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
            raise ExcelParsingError(f'Unexpected error: {str(e)}')

    def validate(self, metadata: Dict) -> ValidationResult:
        """Validate extracted metadata.

        Checks the overall shape (dict of dicts) and per-entry filename
        length; collects all problems rather than failing on the first.

        Args:
            metadata: Result of :meth:`extract` (or compatible mapping).

        Returns:
            ValidationResult with ``valid`` flag and accumulated errors.
        """
        errors = []

        if not isinstance(metadata, dict):
            errors.append('Metadata must be a dictionary')
            return ValidationResult(valid=False, errors=errors)

        # Validate each file's metadata
        for file_name, file_metadata in metadata.items():
            if not isinstance(file_metadata, dict):
                errors.append(f"Metadata for file '{file_name}' must be a dictionary")
                continue

            # Check filename length
            if not self.excel_utils.is_valid_filename_length(file_name):
                errors.append(f"Filename '{file_name}' exceeds maximum length")

        return ValidationResult(valid=len(errors) == 0, errors=errors)

    def _validate_excel_security(self, excel_path: Path) -> None:
        """Validate Excel file security constraints.

        Raises:
            ExcelSecurityError: File size or estimated memory usage is over
                the configured limits.
        """
        file_size = excel_path.stat().st_size
        if file_size > self.excel_config.MAX_FILE_SIZE_BYTES:
            raise ExcelSecurityError(
                f'Excel file too large: {file_size} bytes (max: {self.excel_config.MAX_FILE_SIZE_BYTES})'
            )

        # Heuristic: decompressed workbook typically needs a multiple of the
        # on-disk size; 3x is the assumed expansion factor here.
        estimated_memory = file_size * 3
        if estimated_memory > self.excel_config.MAX_MEMORY_USAGE_BYTES:
            raise ExcelSecurityError(
                f'Excel file may consume too much memory: ~{estimated_memory} bytes '
                f'(max: {self.excel_config.MAX_MEMORY_USAGE_BYTES})'
            )

    def _prepare_excel_file(self, excel_path: Path) -> BytesIO:
        """Prepare Excel file for processing.

        Runs the security check first, then loads the file fully into an
        in-memory buffer so openpyxl never touches the original file handle.
        """
        self._validate_excel_security(excel_path)
        excel_bytes = excel_path.read_bytes()
        return BytesIO(excel_bytes)

    def _process_excel_headers(self, headers: tuple) -> tuple:
        """Process Excel headers.

        Requires at least two columns and a first-column header of
        "filename"/"file_name" (case-insensitive).  Returns the headers
        unchanged after the column-count check.

        Raises:
            ExcelParsingError: Header row violates any of the above rules.
        """
        if len(headers) < 2:
            raise ExcelParsingError('Excel file must have at least 2 columns (file name and metadata)')

        # Validate first column header (filename column)
        first_header = str(headers[0]).strip().lower() if headers[0] else ''
        valid_filename_headers = ['filename', 'file_name']

        if first_header not in valid_filename_headers:
            raise ExcelParsingError(
                f'First column header must be "filename" or "file_name", got: "{headers[0]}". '
                f'Valid options: {", ".join(valid_filename_headers)}'
            )

        # row_count=0: only the column-count limit is relevant at this point.
        self._validate_excel_content(headers, 0)
        return headers

    def _process_excel_data_row(self, row: tuple, headers: tuple) -> Optional[Dict[str, Any]]:
        """Process a single Excel data row.

        Returns:
            ``{file_name: {column: value, ...}}`` for a usable row, or
            ``None`` when the filename cell is empty/too long or the row
            yields no metadata values at all.
        """
        if not row[0] or str(row[0]).strip() == '':
            return None

        file_name = str(row[0]).strip()
        if not self.excel_utils.is_valid_filename_length(file_name):
            # Over-long filenames are silently skipped here; validate()
            # reports them only if they make it into the result.
            return None

        file_metadata: Dict[str, Any] = {}
        for i, value in enumerate(row[1:], start=1):
            if i < len(headers):  # Include empty strings, exclude only None values
                header_value = headers[i]
                # Fall back to a positional name when the header cell is blank.
                column_name = str(header_value).strip() if header_value is not None else f'column_{i}'

                column_name = self.excel_utils.validate_and_truncate_string(
                    column_name, self.excel_config.MAX_COLUMN_NAME_LENGTH
                )

                # Convert None to empty string, otherwise convert to string
                value_str = '' if value is None else str(value)
                str_value = self.excel_utils.validate_and_truncate_string(
                    value_str, self.excel_config.MAX_METADATA_VALUE_LENGTH
                )
                file_metadata[column_name] = str_value

        return {file_name: file_metadata} if file_metadata else None

    def _process_excel_worksheet(self, worksheet) -> Dict[str, Dict[str, Any]]:
        """Process Excel worksheet.

        Iterates rows lazily (values_only), treats the first non-skipped
        row at index 0 as headers, and periodically re-checks content
        limits every ``VALIDATION_CHECK_INTERVAL`` data rows.

        Raises:
            ExcelParsingError: No active worksheet, missing header row, or
                content limits exceeded.
        """
        if worksheet is None:
            raise ExcelParsingError('Excel file has no active worksheet')

        metadata_dict: Dict[str, Dict[str, Any]] = {}
        headers: Optional[tuple] = None
        data_row_count = 0
        validation_interval = getattr(self.excel_config, 'VALIDATION_CHECK_INTERVAL', 1000)

        for row_idx, row in enumerate(worksheet.iter_rows(values_only=True)):
            # Quick check: if row is empty or first cell is empty, skip
            if not row or not row[0] or str(row[0]).strip() == '':
                continue

            if row_idx == 0:
                headers = self._process_excel_headers(row)
                continue

            # Reached only if row 0 itself was skipped as empty above.
            if headers is None:
                raise ExcelParsingError('Excel file missing header row')

            data_row_count += 1

            # Periodic limit check keeps huge sheets from running to
            # completion before the row cap is enforced.
            if data_row_count % validation_interval == 0:
                self._validate_excel_content(headers, data_row_count)

            row_result = self._process_excel_data_row(row, headers)
            if row_result:
                # Later rows with the same filename overwrite earlier ones.
                metadata_dict.update(row_result)

        # Final check covers counts that never hit the periodic interval;
        # headers may still be None for an entirely empty sheet.
        self._validate_excel_content(headers or (), data_row_count)

        return metadata_dict

    def _validate_excel_content(self, headers: tuple, row_count: int) -> None:
        """Validate Excel content constraints.

        Raises:
            ExcelParsingError: Column count or row count exceeds the
                configured maximums.
        """
        if len(headers) > self.excel_config.MAX_COLUMNS:
            raise ExcelParsingError(f'Too many columns: {len(headers)} (max: {self.excel_config.MAX_COLUMNS})')

        if row_count > self.excel_config.MAX_ROWS:
            raise ExcelParsingError(f'Too many rows: {row_count} (max: {self.excel_config.MAX_ROWS})')
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Any, Dict
|
|
3
|
+
|
|
4
|
+
from ..base import MetadataStrategy, ValidationResult
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class NoneMetadataStrategy(MetadataStrategy):
    """Null-object metadata strategy: supplies no metadata at all.

    Used when an upload has no metadata source configured; extraction
    yields an empty mapping and validation always passes.
    """

    def extract(self, source_path: Path) -> Dict[str, Dict[str, Any]]:
        """Return an empty metadata mapping regardless of *source_path*."""
        no_metadata: Dict[str, Dict[str, Any]] = {}
        return no_metadata

    def validate(self, metadata: Dict) -> ValidationResult:
        """Report success unconditionally — there is nothing to validate."""
        return ValidationResult(valid=True)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Upload strategy implementations
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Any, Dict, List
|
|
3
|
+
|
|
4
|
+
from ...enums import LogCode, UploadStatus
|
|
5
|
+
from ..base import UploadConfig, UploadStrategy
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SyncUploadStrategy(UploadStrategy):
    """Synchronous upload strategy."""

    def __init__(self, context):
        # Upload context supplies the backend client, run logger, and params.
        self.context = context

    def upload(self, files: List[Dict], config: UploadConfig) -> List[Dict]:
        """Upload files synchronously.

        Integrates plugin's uploader for file processing before upload.
        Per-file failures are logged and skipped so one bad file does not
        abort the whole batch.
        """
        # Let the plugin uploader filter/transform files first, if one exists.
        prepared = self._process_files_with_uploader(files)

        client = self.context.client
        collection_id = self.context.get_param('data_collection')
        results: List[Dict] = []

        for item in prepared:
            try:
                chunked = self._requires_chunked_upload(item, config)
                uploaded = client.upload_data_file(item, collection_id, chunked)
            except Exception as e:
                # Record the failure and move on to the next file.
                self.context.run.log_data_file(item, UploadStatus.FAILED)
                self.context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, str(e))
            else:
                self.context.run.log_data_file(item, UploadStatus.SUCCESS)
                results.append(uploaded)

        return results

    def _requires_chunked_upload(self, organized_file: Dict[str, Any], config: UploadConfig) -> bool:
        """Return True when any file in the entry exceeds the chunked-upload threshold."""
        threshold_mb = config.chunked_threshold_mb
        return any(
            isinstance(candidate, Path) and self._get_file_size_mb(candidate) > threshold_mb
            for candidate in organized_file.get('files', {}).values()
        )

    def _get_file_size_mb(self, file_path: Path) -> float:
        """Return the on-disk size of *file_path* in megabytes."""
        size_bytes = file_path.stat().st_size
        return size_bytes / (1024 * 1024)

    def _process_files_with_uploader(self, files: List[Dict]) -> List[Dict]:
        """Process files through plugin uploader if available.

        Args:
            files: List of organized files to process

        Returns:
            Processed files (filtered, transformed) ready for upload
        """
        action = self.context._action
        # Without a plugin entrypoint there is nothing to run; pass through.
        if not getattr(action, 'entrypoint', None):
            return files

        try:
            single_path_mode = self.context.get_param('use_single_path', True)
            working_dir = self.context.get('pathlib_cwd')

            # Get uploader instance from plugin
            uploader = action.get_uploader(
                path=working_dir if single_path_mode else None,
                file_specification=self.context.file_specifications,
                organized_files=files,
                params=self.context.params,
            )
            # Uploader applies extension validation, transformation, etc.
            return uploader.handle_upload_files()

        except Exception as e:
            self.context.run.log_message_with_code(LogCode.FILE_UPLOAD_FAILED, f'Uploader processing failed: {str(e)}')
            # Best-effort: fall back to the unprocessed file list.
            return files
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Validation strategy implementations
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from typing import Dict, List
|
|
2
|
+
|
|
3
|
+
from synapse_sdk.clients.validators.collections import FileSpecificationValidator
|
|
4
|
+
|
|
5
|
+
from ..base import ValidationResult, ValidationStrategy
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DefaultValidationStrategy(ValidationStrategy):
    """Default validation strategy for upload operations."""

    def validate_params(self, params: Dict) -> ValidationResult:
        """Validate action parameters.

        Collects every problem (missing keys, mode-specific requirements,
        wrong types) instead of stopping at the first one.
        """
        problems: List[str] = []

        # Parameters required regardless of mode.
        for required in ('storage', 'data_collection', 'name'):
            if required not in params:
                problems.append(f'Missing required parameter: {required}')

        # Mode decides whether 'path' (single-path) or 'assets' (multi-path)
        # must be present.
        if params.get('use_single_path', True):
            if 'path' not in params:
                problems.append("Missing required parameter 'path' in single-path mode")
        elif 'assets' not in params:
            problems.append("Missing required parameter 'assets' in multi-path mode")

        # Type constraints expressed as a rule table.
        type_rules = (
            ('storage', int, "Parameter 'storage' must be an integer"),
            ('data_collection', int, "Parameter 'data_collection' must be an integer"),
            ('is_recursive', bool, "Parameter 'is_recursive' must be a boolean"),
            ('use_single_path', bool, "Parameter 'use_single_path' must be a boolean"),
        )
        for key, expected, message in type_rules:
            if key in params and not isinstance(params[key], expected):
                problems.append(message)

        return ValidationResult(valid=len(problems) == 0, errors=problems)

    def validate_files(self, files: List[Dict], specs: Dict) -> ValidationResult:
        """Validate organized files against specifications.

        Wraps FileSpecificationValidator; any exception it raises is folded
        into a failed ValidationResult rather than propagated.
        """
        try:
            passed = FileSpecificationValidator(specs, files).validate()
            if passed:
                return ValidationResult(valid=True)
            return ValidationResult(valid=False, errors=['File specification validation failed'])
        except Exception as e:
            return ValidationResult(valid=False, errors=[f'Validation error: {str(e)}'])
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Any, Dict, Optional
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field, model_validator
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PathAwareJSONEncoder(json.JSONEncoder):
    """Custom JSON encoder that handles Path objects and datetime objects.

    Extends the default JSON encoder to properly serialize Path objects
    and datetime objects that are commonly used in upload operations.

    Supported object types:
    - Path objects (converts to string using __fspath__ or as_posix)
    - Datetime objects (converts using isoformat)
    - All other standard JSON-serializable types

    Example:
        >>> data = {"path": Path("/tmp/file.txt"), "timestamp": datetime.now()}
        >>> json.dumps(data, cls=PathAwareJSONEncoder)
        '{"path": "/tmp/file.txt", "timestamp": "2023-01-01T12:00:00"}'
    """

    def default(self, obj):
        # Duck-typed dispatch, checked in priority order: os.PathLike first,
        # then pathlib-style as_posix, then datetime-style isoformat.
        for attr in ('__fspath__', 'as_posix', 'isoformat'):
            converter = getattr(obj, attr, None)
            if converter is not None:
                return converter()
        # Anything else falls through to the base class, which raises TypeError.
        return super().default(obj)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ExcelSecurityConfig(BaseModel):
|
|
35
|
+
"""Security configuration for Excel file processing using Pydantic.
|
|
36
|
+
|
|
37
|
+
Defines essential security limits for Excel file processing to prevent
|
|
38
|
+
resource exhaustion attacks and ensure safe handling of potentially malicious files.
|
|
39
|
+
|
|
40
|
+
Attributes:
|
|
41
|
+
max_file_size_mb (int): Maximum file size in megabytes (default: 10)
|
|
42
|
+
max_rows (int): Maximum number of rows allowed (default: 100000)
|
|
43
|
+
max_columns (int): Maximum number of columns allowed (default: 50)
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
max_file_size_mb: int = Field(
|
|
47
|
+
default=10,
|
|
48
|
+
ge=1,
|
|
49
|
+
le=1000,
|
|
50
|
+
description='Maximum file size in megabytes',
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
max_rows: int = Field(
|
|
54
|
+
default=100000,
|
|
55
|
+
ge=1,
|
|
56
|
+
le=100000,
|
|
57
|
+
description='Maximum number of rows allowed',
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
max_columns: int = Field(
|
|
61
|
+
default=50,
|
|
62
|
+
ge=1,
|
|
63
|
+
le=16384, # Excel's column limit
|
|
64
|
+
description='Maximum number of columns allowed',
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
max_memory_usage_mb: int = Field(
|
|
68
|
+
default=30,
|
|
69
|
+
ge=1,
|
|
70
|
+
le=1000,
|
|
71
|
+
description='Maximum memory usage in megabytes',
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
max_filename_length: int = Field(
|
|
75
|
+
default=255,
|
|
76
|
+
ge=1,
|
|
77
|
+
le=1000,
|
|
78
|
+
description='Maximum filename length',
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
max_column_name_length: int = Field(
|
|
82
|
+
default=100,
|
|
83
|
+
ge=1,
|
|
84
|
+
le=500,
|
|
85
|
+
description='Maximum column name length',
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
max_metadata_value_length: int = Field(
|
|
89
|
+
default=1000,
|
|
90
|
+
ge=1,
|
|
91
|
+
le=10000,
|
|
92
|
+
description='Maximum metadata value length',
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
validation_check_interval: int = Field(
|
|
96
|
+
default=1000,
|
|
97
|
+
ge=100,
|
|
98
|
+
le=10000,
|
|
99
|
+
description='Validation check interval for processing',
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
model_config = {'validate_assignment': True, 'extra': 'forbid'}
|
|
103
|
+
|
|
104
|
+
@model_validator(mode='after')
|
|
105
|
+
def validate_resource_limits(self) -> 'ExcelSecurityConfig':
|
|
106
|
+
"""Validate that resource limits are reasonable."""
|
|
107
|
+
# Check for unreasonable combinations
|
|
108
|
+
estimated_cells = self.max_rows * self.max_columns
|
|
109
|
+
if estimated_cells > 50000000: # 50 million cells
|
|
110
|
+
raise ValueError(
|
|
111
|
+
f'Combination of max_rows ({self.max_rows}) and max_columns ({self.max_columns}) '
|
|
112
|
+
f'would allow too many cells ({estimated_cells:,})'
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
return self
|
|
116
|
+
|
|
117
|
+
@property
|
|
118
|
+
def max_file_size_bytes(self) -> int:
|
|
119
|
+
"""Get maximum file size in bytes."""
|
|
120
|
+
return self.max_file_size_mb * 1024 * 1024
|
|
121
|
+
|
|
122
|
+
@property
|
|
123
|
+
def max_memory_usage_bytes(self) -> int:
|
|
124
|
+
"""Get maximum memory usage in bytes."""
|
|
125
|
+
return self.max_memory_usage_mb * 1024 * 1024
|
|
126
|
+
|
|
127
|
+
# Backward compatibility properties (uppercase versions)
# NOTE(review): each uppercase property is a read-only one-to-one alias of the
# matching lowercase field/property above, kept so older callers using
# constant-style names keep working; prefer the lowercase attributes in new code.
@property
def MAX_FILE_SIZE_MB(self) -> int:
    """Backward compatibility property."""
    return self.max_file_size_mb

@property
def MAX_FILE_SIZE_BYTES(self) -> int:
    """Backward compatibility property."""
    return self.max_file_size_bytes

@property
def MAX_MEMORY_USAGE_MB(self) -> int:
    """Backward compatibility property."""
    return self.max_memory_usage_mb

@property
def MAX_MEMORY_USAGE_BYTES(self) -> int:
    """Backward compatibility property."""
    return self.max_memory_usage_bytes

@property
def MAX_ROWS(self) -> int:
    """Backward compatibility property."""
    return self.max_rows

@property
def MAX_COLUMNS(self) -> int:
    """Backward compatibility property."""
    return self.max_columns

@property
def MAX_FILENAME_LENGTH(self) -> int:
    """Backward compatibility property."""
    return self.max_filename_length

@property
def MAX_COLUMN_NAME_LENGTH(self) -> int:
    """Backward compatibility property."""
    return self.max_column_name_length

@property
def MAX_METADATA_VALUE_LENGTH(self) -> int:
    """Backward compatibility property."""
    return self.max_metadata_value_length

@property
def VALIDATION_CHECK_INTERVAL(self) -> int:
    """Backward compatibility property."""
    return self.validation_check_interval
|
|
177
|
+
|
|
178
|
+
@classmethod
def from_action_config(cls, action_config: Optional[Dict[str, Any]]) -> 'ExcelSecurityConfig':
    """Create ExcelSecurityConfig from plugin action configuration (config.yaml).

    Only keys actually present in the ``excel_config`` section are forwarded,
    so the model's own Field defaults remain the single source of truth for any
    omitted setting. (Previously each default was duplicated here as a
    ``.get(key, default)`` fallback, which could drift from the Field
    definitions.) Unknown keys in ``excel_config`` are ignored, as before.

    Args:
        action_config: Action configuration dictionary from config.yaml

    Returns:
        New ExcelSecurityConfig instance with config.yaml values

    Example config.yaml:
        actions:
          upload:
            excel_config:
              max_file_size_mb: 25
              max_rows: 50000
              max_columns: 100
    """
    if not action_config or 'excel_config' not in action_config:
        return cls()

    excel_config = action_config['excel_config']

    # Whitelist of settings this factory accepts from config.yaml.
    known_keys = (
        'max_file_size_mb',
        'max_rows',
        'max_columns',
        'max_memory_usage_mb',
        'max_filename_length',
        'max_column_name_length',
        'max_metadata_value_length',
        'validation_check_interval',
    )
    overrides = {key: excel_config[key] for key in known_keys if key in excel_config}
    return cls(**overrides)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
class ExcelMetadataUtils:
    """Helpers for validating and normalising Excel metadata against configured limits."""

    def __init__(self, config: 'ExcelSecurityConfig'):
        """Store the security configuration that supplies all length limits."""
        self.config = config

    def is_valid_filename_length(self, filename: str) -> bool:
        """Return True when the filename does not exceed the configured limit."""
        return len(filename) <= self.config.max_filename_length

    def validate_and_truncate_string(self, value: str, max_length: int) -> str:
        """Coerce *value* to str, strip surrounding whitespace, and cap its length."""
        text = value if isinstance(value, str) else str(value)
        # Slicing is a no-op when the stripped text is already short enough.
        return text.strip()[:max_length]

    def is_valid_column_name(self, column_name: str) -> bool:
        """Return True for a non-empty string column name within the length limit."""
        if not isinstance(column_name, str) or not column_name:
            return False
        return len(column_name.strip()) <= self.config.max_column_name_length

    def is_valid_metadata_value(self, value: str) -> bool:
        """Return True when the (stringified) metadata value fits the limit; None is always valid."""
        if value is None:
            return True
        text = value if isinstance(value, str) else str(value)
        return len(text) <= self.config.max_metadata_value_length
|