synapse-sdk 1.0.0b5__py3-none-any.whl → 2025.12.3__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.
- synapse_sdk/__init__.py +24 -0
- synapse_sdk/cli/code_server.py +305 -33
- synapse_sdk/clients/agent/__init__.py +2 -1
- synapse_sdk/clients/agent/container.py +143 -0
- synapse_sdk/clients/agent/ray.py +296 -38
- synapse_sdk/clients/backend/annotation.py +1 -1
- synapse_sdk/clients/backend/core.py +31 -4
- synapse_sdk/clients/backend/data_collection.py +82 -7
- synapse_sdk/clients/backend/hitl.py +1 -1
- synapse_sdk/clients/backend/ml.py +1 -1
- synapse_sdk/clients/base.py +211 -61
- synapse_sdk/loggers.py +46 -0
- synapse_sdk/plugins/README.md +1340 -0
- synapse_sdk/plugins/categories/base.py +59 -9
- synapse_sdk/plugins/categories/export/actions/__init__.py +3 -0
- synapse_sdk/plugins/categories/export/actions/export/__init__.py +28 -0
- synapse_sdk/plugins/categories/export/actions/export/action.py +165 -0
- synapse_sdk/plugins/categories/export/actions/export/enums.py +113 -0
- synapse_sdk/plugins/categories/export/actions/export/exceptions.py +53 -0
- synapse_sdk/plugins/categories/export/actions/export/models.py +74 -0
- synapse_sdk/plugins/categories/export/actions/export/run.py +195 -0
- synapse_sdk/plugins/categories/export/actions/export/utils.py +187 -0
- synapse_sdk/plugins/categories/export/templates/config.yaml +19 -1
- synapse_sdk/plugins/categories/export/templates/plugin/__init__.py +390 -0
- synapse_sdk/plugins/categories/export/templates/plugin/export.py +153 -177
- synapse_sdk/plugins/categories/neural_net/actions/train.py +1130 -32
- synapse_sdk/plugins/categories/neural_net/actions/tune.py +157 -4
- synapse_sdk/plugins/categories/neural_net/templates/config.yaml +7 -4
- synapse_sdk/plugins/categories/pre_annotation/actions/__init__.py +4 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/__init__.py +3 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/action.py +10 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/__init__.py +28 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/action.py +148 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/enums.py +269 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/exceptions.py +14 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/factory.py +76 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/models.py +100 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/orchestrator.py +248 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/run.py +64 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/__init__.py +17 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/annotation.py +265 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/base.py +170 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/extraction.py +83 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/metrics.py +92 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/preprocessor.py +243 -0
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/validation.py +143 -0
- synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +19 -0
- synapse_sdk/plugins/categories/upload/actions/upload/action.py +236 -0
- synapse_sdk/plugins/categories/upload/actions/upload/context.py +185 -0
- synapse_sdk/plugins/categories/upload/actions/upload/enums.py +493 -0
- synapse_sdk/plugins/categories/upload/actions/upload/exceptions.py +36 -0
- synapse_sdk/plugins/categories/upload/actions/upload/factory.py +138 -0
- synapse_sdk/plugins/categories/upload/actions/upload/models.py +214 -0
- synapse_sdk/plugins/categories/upload/actions/upload/orchestrator.py +183 -0
- synapse_sdk/plugins/categories/upload/actions/upload/registry.py +113 -0
- synapse_sdk/plugins/categories/upload/actions/upload/run.py +179 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/base.py +107 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/cleanup.py +62 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/collection.py +63 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/generate.py +91 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py +82 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py +235 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py +201 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py +104 -0
- synapse_sdk/plugins/categories/upload/actions/upload/steps/validate.py +71 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/base.py +82 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/batch.py +39 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/single.py +29 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py +300 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py +287 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/excel.py +174 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/none.py +16 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/sync.py +84 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/__init__.py +1 -0
- synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py +60 -0
- synapse_sdk/plugins/categories/upload/actions/upload/utils.py +250 -0
- synapse_sdk/plugins/categories/upload/templates/README.md +470 -0
- synapse_sdk/plugins/categories/upload/templates/config.yaml +28 -2
- synapse_sdk/plugins/categories/upload/templates/plugin/__init__.py +310 -0
- synapse_sdk/plugins/categories/upload/templates/plugin/upload.py +82 -20
- synapse_sdk/plugins/models.py +111 -9
- synapse_sdk/plugins/templates/plugin-config-schema.json +7 -0
- synapse_sdk/plugins/templates/schema.json +7 -0
- synapse_sdk/plugins/utils/__init__.py +3 -0
- synapse_sdk/plugins/utils/ray_gcs.py +66 -0
- synapse_sdk/shared/__init__.py +25 -0
- synapse_sdk/utils/converters/dm/__init__.py +42 -41
- synapse_sdk/utils/converters/dm/base.py +137 -0
- synapse_sdk/utils/converters/dm/from_v1.py +208 -562
- synapse_sdk/utils/converters/dm/to_v1.py +258 -304
- synapse_sdk/utils/converters/dm/tools/__init__.py +214 -0
- synapse_sdk/utils/converters/dm/tools/answer.py +95 -0
- synapse_sdk/utils/converters/dm/tools/bounding_box.py +132 -0
- synapse_sdk/utils/converters/dm/tools/bounding_box_3d.py +121 -0
- synapse_sdk/utils/converters/dm/tools/classification.py +75 -0
- synapse_sdk/utils/converters/dm/tools/keypoint.py +117 -0
- synapse_sdk/utils/converters/dm/tools/named_entity.py +111 -0
- synapse_sdk/utils/converters/dm/tools/polygon.py +122 -0
- synapse_sdk/utils/converters/dm/tools/polyline.py +124 -0
- synapse_sdk/utils/converters/dm/tools/prompt.py +94 -0
- synapse_sdk/utils/converters/dm/tools/relation.py +86 -0
- synapse_sdk/utils/converters/dm/tools/segmentation.py +141 -0
- synapse_sdk/utils/converters/dm/tools/segmentation_3d.py +83 -0
- synapse_sdk/utils/converters/dm/types.py +168 -0
- synapse_sdk/utils/converters/dm/utils.py +162 -0
- synapse_sdk/utils/converters/dm_legacy/__init__.py +56 -0
- synapse_sdk/utils/converters/dm_legacy/from_v1.py +627 -0
- synapse_sdk/utils/converters/dm_legacy/to_v1.py +367 -0
- synapse_sdk/utils/file/__init__.py +58 -0
- synapse_sdk/utils/file/archive.py +32 -0
- synapse_sdk/utils/file/checksum.py +56 -0
- synapse_sdk/utils/file/chunking.py +31 -0
- synapse_sdk/utils/file/download.py +385 -0
- synapse_sdk/utils/file/encoding.py +40 -0
- synapse_sdk/utils/file/io.py +22 -0
- synapse_sdk/utils/file/upload.py +165 -0
- synapse_sdk/utils/file/video/__init__.py +29 -0
- synapse_sdk/utils/file/video/transcode.py +307 -0
- synapse_sdk/utils/{file.py → file.py.backup} +77 -0
- synapse_sdk/utils/network.py +272 -0
- synapse_sdk/utils/storage/__init__.py +6 -2
- synapse_sdk/utils/storage/providers/file_system.py +6 -0
- {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/METADATA +19 -2
- {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/RECORD +134 -74
- synapse_sdk/devtools/docs/.gitignore +0 -20
- synapse_sdk/devtools/docs/README.md +0 -41
- synapse_sdk/devtools/docs/blog/2019-05-28-first-blog-post.md +0 -12
- synapse_sdk/devtools/docs/blog/2019-05-29-long-blog-post.md +0 -44
- synapse_sdk/devtools/docs/blog/2021-08-01-mdx-blog-post.mdx +0 -24
- synapse_sdk/devtools/docs/blog/2021-08-26-welcome/docusaurus-plushie-banner.jpeg +0 -0
- synapse_sdk/devtools/docs/blog/2021-08-26-welcome/index.md +0 -29
- synapse_sdk/devtools/docs/blog/authors.yml +0 -25
- synapse_sdk/devtools/docs/blog/tags.yml +0 -19
- synapse_sdk/devtools/docs/docusaurus.config.ts +0 -138
- synapse_sdk/devtools/docs/package-lock.json +0 -17455
- synapse_sdk/devtools/docs/package.json +0 -47
- synapse_sdk/devtools/docs/sidebars.ts +0 -44
- synapse_sdk/devtools/docs/src/components/HomepageFeatures/index.tsx +0 -71
- synapse_sdk/devtools/docs/src/components/HomepageFeatures/styles.module.css +0 -11
- synapse_sdk/devtools/docs/src/css/custom.css +0 -30
- synapse_sdk/devtools/docs/src/pages/index.module.css +0 -23
- synapse_sdk/devtools/docs/src/pages/index.tsx +0 -21
- synapse_sdk/devtools/docs/src/pages/markdown-page.md +0 -7
- synapse_sdk/devtools/docs/static/.nojekyll +0 -0
- synapse_sdk/devtools/docs/static/img/docusaurus-social-card.jpg +0 -0
- synapse_sdk/devtools/docs/static/img/docusaurus.png +0 -0
- synapse_sdk/devtools/docs/static/img/favicon.ico +0 -0
- synapse_sdk/devtools/docs/static/img/logo.png +0 -0
- synapse_sdk/devtools/docs/static/img/undraw_docusaurus_mountain.svg +0 -171
- synapse_sdk/devtools/docs/static/img/undraw_docusaurus_react.svg +0 -170
- synapse_sdk/devtools/docs/static/img/undraw_docusaurus_tree.svg +0 -40
- synapse_sdk/devtools/docs/tsconfig.json +0 -8
- synapse_sdk/plugins/categories/export/actions/export.py +0 -346
- synapse_sdk/plugins/categories/export/enums.py +0 -7
- synapse_sdk/plugins/categories/neural_net/actions/gradio.py +0 -151
- synapse_sdk/plugins/categories/pre_annotation/actions/to_task.py +0 -943
- synapse_sdk/plugins/categories/upload/actions/upload.py +0 -954
- {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/WHEEL +0 -0
- {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/entry_points.txt +0 -0
- {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/licenses/LICENSE +0 -0
- {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/top_level.txt +0 -0
synapse_sdk/plugins/categories/upload/actions/upload.py
@@ -1,954 +0,0 @@
import json
import os
from datetime import datetime
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import Annotated, Any, Dict, List, Optional

from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException
from pydantic import AfterValidator, BaseModel, field_validator
from pydantic_core import PydanticCustomError

from synapse_sdk.clients.exceptions import ClientError
from synapse_sdk.clients.utils import get_batched_list
from synapse_sdk.clients.validators.collections import FileSpecificationValidator
from synapse_sdk.i18n import gettext as _
from synapse_sdk.plugins.categories.base import Action
from synapse_sdk.plugins.categories.decorators import register_action
from synapse_sdk.plugins.enums import PluginCategory, RunMethod
from synapse_sdk.plugins.exceptions import ActionError
from synapse_sdk.plugins.models import Run
from synapse_sdk.shared.enums import Context
from synapse_sdk.utils.pydantic.validators import non_blank
from synapse_sdk.utils.storage import get_pathlib


class PathAwareJSONEncoder(json.JSONEncoder):
    """Custom JSON encoder that handles Path-like objects."""

    def default(self, obj):
        if hasattr(obj, '__fspath__') or hasattr(obj, 'as_posix'):
            # Handle Path-like objects (including UPath, SFTPPath, pathlib.Path, etc.)
            return str(obj)
        elif hasattr(obj, 'isoformat'):
            # Handle datetime objects
            return obj.isoformat()
        # Let the base class handle other types
        return super().default(obj)


class UploadStatus(str, Enum):
    SUCCESS = 'success'
    FAILED = 'failed'


class UploadRun(Run):
    class DataFileLog(BaseModel):
        """Data file log model."""

        data_file_info: str | None
        status: UploadStatus
        created: str

    class DataUnitLog(BaseModel):
        """Data unit log model."""

        data_unit_id: int | None
        status: UploadStatus
        created: str
        data_unit_meta: dict | None

    class TaskLog(BaseModel):
        """Task log model."""

        task_id: int | None
        status: UploadStatus
        created: str

    class MetricsRecord(BaseModel):
        """Metrics record model."""

        stand_by: int
        failed: int
        success: int

    def log_data_file(self, data_file_info: dict, status: UploadStatus):
        """Upload data_file log.

        Args:
            data_file_info (dict): The json info of the data file.
            status (UploadStatus): The status of the data file.
        """
        now = datetime.now().isoformat()
        # Use custom JSON encoder to handle Path-like objects
        data_file_info_str = json.dumps(data_file_info, ensure_ascii=False, cls=PathAwareJSONEncoder)
        self.log(
            'upload_data_file',
            self.DataFileLog(data_file_info=data_file_info_str, status=status.value, created=now).model_dump(),
        )

    def log_data_unit(self, data_unit_id: int, status: UploadStatus, data_unit_meta: dict | None = None):
        """Upload data_unit log.

        Args:
            data_unit_id (int): The ID of the data unit.
            status (UploadStatus): The status of the data unit.
            data_unit_meta (dict | None): The metadata of the data unit.
        """
        now = datetime.now().isoformat()
        self.log(
            'upload_data_unit',
            self.DataUnitLog(
                data_unit_id=data_unit_id, status=status.value, created=now, data_unit_meta=data_unit_meta
            ).model_dump(),
        )

    def log_task(self, task_id: int, status: UploadStatus):
        """Upload task log.

        Args:
            task_id (int): The ID of the task.
            status (UploadStatus): The status of the task.
        """
        now = datetime.now().isoformat()
        self.log('upload_task', self.TaskLog(task_id=task_id, status=status.value, created=now).model_dump())

    def log_metrics(self, record: MetricsRecord, category: str):
        """Log upload metrics.

        Args:
            record (MetricsRecord): The metrics record to log.
            category (str): The category of the metrics.
        """
        record = self.MetricsRecord.model_validate(record)
        self.set_metrics(value=record.model_dump(), category=category)


class ExcelSecurityError(Exception):
    """Custom exception for Excel security validation errors."""

    pass


class ExcelParsingError(Exception):
    """Custom exception for Excel parsing errors."""

    pass


class ExcelSecurityConfig:
    """Configuration class for Excel security settings."""

    def __init__(self):
        # File size limits
        self.MAX_FILE_SIZE_MB = int(os.getenv('EXCEL_MAX_FILE_SIZE_MB', '10'))
        self.MAX_FILE_SIZE_BYTES = self.MAX_FILE_SIZE_MB * 1024 * 1024

        # Memory limits
        self.MAX_MEMORY_USAGE_MB = int(os.getenv('EXCEL_MAX_MEMORY_MB', '30'))
        self.MAX_MEMORY_USAGE_BYTES = self.MAX_MEMORY_USAGE_MB * 1024 * 1024

        # Content limits
        self.MAX_ROWS = int(os.getenv('EXCEL_MAX_ROWS', '10000'))
        self.MAX_COLUMNS = int(os.getenv('EXCEL_MAX_COLUMNS', '50'))

        # String length limits
        self.MAX_FILENAME_LENGTH = int(os.getenv('EXCEL_MAX_FILENAME_LENGTH', '255'))
        self.MAX_COLUMN_NAME_LENGTH = int(os.getenv('EXCEL_MAX_COLUMN_NAME_LENGTH', '100'))
        self.MAX_METADATA_VALUE_LENGTH = int(os.getenv('EXCEL_MAX_METADATA_VALUE_LENGTH', '1000'))


class ExcelMetadataUtils:
    """Utility class for Excel metadata processing with shared validation logic."""

    def __init__(self, config: ExcelSecurityConfig):
        self.config = config

    def validate_and_truncate_string(self, value: str, max_length: int) -> str:
        """Validate and truncate string to specified maximum length.

        Args:
            value: String value to validate and truncate
            max_length: Maximum allowed length

        Returns:
            str: Validated and potentially truncated string
        """
        if not isinstance(value, str):
            value = str(value)

        value = value.strip()

        if len(value) > max_length:
            return value[:max_length]

        return value

    def is_valid_filename_length(self, filename: str) -> bool:
        """Check if filename length is within acceptable limits.

        Args:
            filename: Filename to check

        Returns:
            bool: True if filename length is acceptable
        """
        return len(filename.strip()) <= self.config.MAX_FILENAME_LENGTH


class UploadParams(BaseModel):
    """Upload action parameters.

    Args:
        name (str): The name of the action.
        description (str | None): The description of the action.
        checkpoint (int | None): The checkpoint of the action.
        path (str): The path of the action.
        storage (int): The storage of the action.
        collection (int): The collection of the action.
        project (int | None): The project of the action.
        excel_metadata_path (str | None): Path to excel file containing metadata.
            Defaults to 'meta.xlsx' or 'meta.xls' in the path directory.
    """

    name: Annotated[str, AfterValidator(non_blank)]
    description: str | None
    path: str
    storage: int
    collection: int
    project: int | None
    excel_metadata_path: str | None = None

    @field_validator('storage', mode='before')
    @classmethod
    def check_storage_exists(cls, value: str, info) -> str:
        """Validate synapse-backend storage exists.

        TODO: Need to define validation method naming convention.
        TODO: Need to make validation method reusable.
        """
        action = info.context['action']
        client = action.client
        try:
            client.get_storage(value)
        except ClientError:
            raise PydanticCustomError('client_error', _('Error occurred while checking storage exists.'))
        return value

    @field_validator('collection', mode='before')
    @classmethod
    def check_collection_exists(cls, value: str, info) -> str:
        """Validate synapse-backend collection exists."""
        action = info.context['action']
        client = action.client
        try:
            client.get_data_collection(value)
        except ClientError:
            raise PydanticCustomError('client_error', _('Error occurred while checking collection exists.'))
        return value

    @field_validator('project', mode='before')
    @classmethod
    def check_project_exists(cls, value: str, info) -> str:
        """Validate synapse-backend project exists."""
        if not value:
            return value

        action = info.context['action']
        client = action.client
        try:
            client.get_project(value)
        except ClientError:
            raise PydanticCustomError('client_error', _('Error occurred while checking project exists.'))
        return value

    @field_validator('excel_metadata_path', mode='before')
    @classmethod
    def check_excel_metadata_path(cls, value: str, info) -> str:
        """Validate excel metadata file exists and is secure if provided.

        This validator performs comprehensive security checks including:
        - File existence and format validation
        - File size limits (max 10MB)
        - Basic security checks for file content

        Args:
            value: The excel file path to validate
            info: Validation context information

        Returns:
            str: The validated file path

        Raises:
            PydanticCustomError: If validation fails
        """
        if not value:
            return value

        excel_path = Path(value)

        # Check file existence
        if not excel_path.exists():
            raise PydanticCustomError('file_not_found', _('Excel metadata file not found.'))

        # Check file extension
        if excel_path.suffix.lower() not in ['.xlsx', '.xls']:
            raise PydanticCustomError('invalid_file_type', _('Excel metadata file must be .xlsx or .xls format.'))

        # Security check: file size limit
        file_size = excel_path.stat().st_size
        excel_config = ExcelSecurityConfig()
        if file_size > excel_config.MAX_FILE_SIZE_BYTES:
            raise PydanticCustomError(
                'file_too_large',
                _('Excel metadata file is too large. Maximum size is {}MB.').format(excel_config.MAX_FILE_SIZE_MB),
            )

        # Basic security check: ensure file is readable and not corrupted
        try:
            with open(excel_path, 'rb') as f:
                # Read first few bytes to check if it's a valid Excel file
                header = f.read(8)
                if not header:
                    raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be empty.'))

                # Check for valid Excel file signatures
                if excel_path.suffix.lower() == '.xlsx':
                    # XLSX files start with PK (ZIP signature)
                    if not header.startswith(b'PK'):
                        raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be corrupted.'))
                elif excel_path.suffix.lower() == '.xls':
                    # XLS files have specific OLE signatures
                    if not (header.startswith(b'\xd0\xcf\x11\xe0') or header.startswith(b'\x09\x08')):
                        raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be corrupted.'))

        except (OSError, IOError):
            raise PydanticCustomError('file_access_error', _('Cannot access Excel metadata file.'))

        return value


@register_action
class UploadAction(Action):
    """Upload action class.

    Attrs:
        name (str): The name of the action.
        category (PluginCategory): The category of the action.
        method (RunMethod): The method to run of the action.

    Progress Categories:
        analyze_collection: The progress category for the analyze collection process.
        data_file_upload: The progress category for the upload process.
        generate_data_units: The progress category for the generate data units process.
        generate_tasks: The progress category for the generate tasks process.
        generate_ground_truths: The progress category for the generate ground truths process.

    Metrics Categories:
        data_file: The metrics category for the data file.
        data_unit: The metrics category for the data unit.
    """

    name = 'upload'
    category = PluginCategory.UPLOAD
    method = RunMethod.JOB
    run_class = UploadRun
    progress_categories = {
        'analyze_collection': {
            'proportion': 2,
        },
        'upload_data_files': {
            'proportion': 38,
        },
        'generate_data_units': {
            'proportion': 60,
        },
    }
    metrics_categories = {
        'data_files': {
            'stand_by': 0,
            'failed': 0,
            'success': 0,
        },
        'data_units': {
            'stand_by': 0,
            'failed': 0,
            'success': 0,
        },
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.excel_config = ExcelSecurityConfig()
        self.excel_utils = ExcelMetadataUtils(self.excel_config)

    def get_uploader(self, path, file_specification, organized_files):
        """Get uploader from entrypoint."""
        return self.entrypoint(self.run, path, file_specification, organized_files)

    def _validate_excel_security(self, excel_path: Path) -> None:
        """Validate Excel file security constraints.

        Performs comprehensive security validation including:
        - File size limits
        - Memory usage constraints
        - Basic malicious content detection

        Args:
            excel_path: Path to the Excel file to validate

        Raises:
            ExcelSecurityError: If security validation fails
        """
        # File size check (already done in validator, but double-check)
        file_size = excel_path.stat().st_size
        if file_size > self.excel_config.MAX_FILE_SIZE_BYTES:
            raise ExcelSecurityError(
                f'Excel file too large: {file_size} bytes (max: {self.excel_config.MAX_FILE_SIZE_BYTES})'
            )

        # Memory usage estimation (rough estimate: file_size * 3 for processing)
        estimated_memory = file_size * 3
        if estimated_memory > self.excel_config.MAX_MEMORY_USAGE_BYTES:
            raise ExcelSecurityError(
                f'Excel file may consume too much memory: ~{estimated_memory} bytes '
                f'(max: {self.excel_config.MAX_MEMORY_USAGE_BYTES})'
            )

    def _prepare_excel_file(self, excel_path: Path) -> BytesIO:
        """Prepare Excel file for reading with security validation.

        Args:
            excel_path: Path to the Excel file

        Returns:
            BytesIO: Excel file stream ready for reading

        Raises:
            ExcelSecurityError: If security validation fails
        """
        self._validate_excel_security(excel_path)
        excel_bytes = excel_path.read_bytes()
        return BytesIO(excel_bytes)

    def _process_excel_headers(self, headers: tuple) -> tuple:
        """Process and validate Excel headers.

        Args:
            headers: Raw header tuple from Excel

        Returns:
            tuple: Validated headers

        Raises:
            ExcelParsingError: If headers are invalid
        """
        if len(headers) < 2:
            raise ExcelParsingError('Excel file must have at least 2 columns (file name and metadata)')
        self._validate_excel_content(headers, 0)  # Validate column count
        return headers

    def _process_excel_data_row(self, row: tuple, headers: tuple) -> Optional[Dict[str, Any]]:
        """Process a single Excel data row.

        Args:
            row: Raw row data from Excel
            headers: Excel headers

        Returns:
            Optional[Dict[str, Any]]: Processed row data or None if row should be skipped
        """
        # Skip empty rows
        if not row[0] or str(row[0]).strip() == '':
            return None

        file_name = str(row[0]).strip()
        if not self.excel_utils.is_valid_filename_length(file_name):
            self.run.log_message(
                f'Skipping file with overly long name: {file_name[:50]}...', context=Context.WARNING.value
            )
            return None

        # Create metadata dictionary from remaining columns
        file_metadata: Dict[str, Any] = {}
        for i, value in enumerate(row[1:], start=1):
            if value is not None and i < len(headers):
                header_value = headers[i]
                column_name = str(header_value).strip() if header_value is not None else f'column_{i}'

                # Validate and truncate column name and value
                column_name = self.excel_utils.validate_and_truncate_string(
                    column_name, self.excel_config.MAX_COLUMN_NAME_LENGTH
                )
                str_value = self.excel_utils.validate_and_truncate_string(
                    str(value), self.excel_config.MAX_METADATA_VALUE_LENGTH
                )
                file_metadata[column_name] = str_value

        return {file_name: file_metadata} if file_metadata else None

    def _process_excel_worksheet(self, worksheet) -> Dict[str, Dict[str, Any]]:
        """Process Excel worksheet and extract metadata.

        Args:
            worksheet: openpyxl worksheet object

        Returns:
            Dict[str, Dict[str, Any]]: Extracted metadata dictionary

        Raises:
            ExcelParsingError: If worksheet processing fails
        """
        if worksheet is None:
            raise ExcelParsingError('Excel file has no active worksheet')

        metadata_dict: Dict[str, Dict[str, Any]] = {}
        headers: Optional[tuple] = None
        data_row_count = 0
        validation_interval = getattr(self.excel_config, 'VALIDATION_CHECK_INTERVAL', 1000)

        # Process rows one by one for memory efficiency
        for row_idx, row in enumerate(worksheet.iter_rows(values_only=True)):
            if not row or all(cell is None or str(cell).strip() == '' for cell in row):
                continue  # Skip completely empty rows

            if row_idx == 0:  # Header row
                headers = self._process_excel_headers(row)
                continue

            # Data rows
            if headers is None:
                raise ExcelParsingError('Excel file missing header row')

            data_row_count += 1

            # Validate row count periodically
            if data_row_count % validation_interval == 0:
                self._validate_excel_content(headers, data_row_count)

            # Process individual row
            row_result = self._process_excel_data_row(row, headers)
            if row_result:
                metadata_dict.update(row_result)

        # Final validation
        self._validate_excel_content(headers or (), data_row_count)

        return metadata_dict

    def _validate_excel_content(self, headers: tuple, row_count: int) -> None:
        """Validate Excel content constraints.

        Args:
            headers: Tuple of header values from the first row
            row_count: Total number of data rows processed

        Raises:
            ExcelParsingError: If content validation fails
        """
        # Limit number of columns to prevent memory exhaustion
        if len(headers) > self.excel_config.MAX_COLUMNS:
            raise ExcelParsingError(f'Too many columns: {len(headers)} (max: {self.excel_config.MAX_COLUMNS})')

        # Limit number of rows to prevent excessive processing
        if row_count > self.excel_config.MAX_ROWS:
            raise ExcelParsingError(f'Too many rows: {row_count} (max: {self.excel_config.MAX_ROWS})')

    def _find_excel_metadata_file(self, pathlib_cwd: Path) -> Optional[Path]:
        """Find Excel metadata file in the directory.

        Checks for meta.xlsx and meta.xls in the given directory.

        Args:
            pathlib_cwd (Path): The pathlib object representing the current working directory.

        Returns:
            Optional[Path]: Path to the Excel metadata file if found, None otherwise.
        """
        # Check for xlsx first, then xls
        for extension in ['.xlsx', '.xls']:
            excel_path = pathlib_cwd / f'meta{extension}'
            if excel_path.exists() and excel_path.is_file():
                return excel_path
        return None

    def _read_excel_metadata(self, pathlib_cwd: Path) -> Dict[str, Dict[str, Any]]:
        """Read metadata from excel file with comprehensive security validation.

        This method orchestrates the Excel metadata reading process by delegating
        to specialized methods for each step.

        Args:
            pathlib_cwd (Path): The pathlib object representing the current working directory.

        Returns:
            Dict[str, Dict[str, Any]]: Dictionary mapping file names to their metadata key-value pairs.
            Empty dict if no metadata is configured or if reading fails.

        Raises:
            ExcelSecurityError: If security validation fails
            ExcelParsingError: If Excel content is invalid or exceeds limits
        """
        excel_path = None

        # Check if user provided a specific excel_metadata_path
        if self.params.get('excel_metadata_path'):
            excel_path = pathlib_cwd / self.params['excel_metadata_path']
            if not excel_path.exists():
                self.run.log_message(f'Excel metadata file not found: {excel_path}', context=Context.WARNING.value)
                return {}
        else:
            # Look for default meta.xlsx or meta.xls
            excel_path = self._find_excel_metadata_file(pathlib_cwd)
            if not excel_path:
                # No Excel metadata file found, return empty dict (not an error)
                return {}

        try:
            # Prepare Excel file with security validation
            excel_stream = self._prepare_excel_file(excel_path)

            # Load and process workbook
            workbook = load_workbook(excel_stream, read_only=True, data_only=True)
            try:
                return self._process_excel_worksheet(workbook.active)
            finally:
                workbook.close()

        except ExcelSecurityError as e:
            self.run.log_message(f'Excel security validation failed: {str(e)}', context=Context.ERROR.value)
            raise
        except ExcelParsingError as e:
            self.run.log_message(f'Excel parsing failed: {str(e)}', context=Context.ERROR.value)
            raise
        except InvalidFileException as e:
            self.run.log_message(f'Invalid Excel file format: {str(e)}', context=Context.ERROR.value)
            raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
        except MemoryError:
            self.run.log_message('Excel file too large to process (memory limit exceeded)', context=Context.ERROR.value)
            raise ExcelSecurityError('Excel file exceeds memory limits')
        except (OSError, IOError) as e:
            self.run.log_message(f'File access error reading excel metadata: {str(e)}', context=Context.ERROR.value)
            raise ExcelParsingError(f'File access error: {str(e)}')
        except Exception as e:
            self.run.log_message(f'Unexpected error reading excel metadata: {str(e)}', context=Context.ERROR.value)
            raise ExcelParsingError(f'Unexpected error: {str(e)}')

    def start(self) -> Dict[str, Any]:
        """Start upload process.

        Returns:
            Dict: The result of the upload process.
        """
        # Setup result dict early for error handling
        result: Dict[str, Any] = {}

        # Setup path object with path and storage.
        storage = self.client.get_storage(self.params['storage'])
        pathlib_cwd = get_pathlib(storage, self.params['path'])

        # Read excel metadata if configured or default file exists
        excel_metadata: Dict[str, Dict[str, Any]] = {}
        try:
            excel_metadata = self._read_excel_metadata(pathlib_cwd)
            if excel_metadata:
                self.run.log_message(f'Excel metadata loaded for {len(excel_metadata)} files')
        except ExcelSecurityError as e:
            # Security violations should stop the process entirely
            self.run.log_message(f'Excel security validation failed: {str(e)}', context=Context.ERROR.value)
            self.run.log_message('Upload aborted due to Excel security concerns.', context=Context.ERROR.value)
            return result
        except ExcelParsingError as e:
            # Parsing errors can be non-critical if user didn't explicitly provide Excel file
            if self.params.get('excel_metadata_path'):
                # User explicitly provided Excel file, treat as error
                self.run.log_message(f'Excel parsing failed: {str(e)}', context=Context.ERROR.value)
                self.run.log_message('Upload aborted due to Excel parsing failure.', context=Context.ERROR.value)
                return result
            else:
                # Default Excel file found but failed to parse, treat as warning and continue
                self.run.log_message(f'Excel parsing failed (continuing): {str(e)}', context=Context.WARNING.value)
                excel_metadata = {}

        # Analyze Collection file specifications to determine the data structure for upload.
        file_specification_template = self._analyze_collection()
        organized_files = self._organize_files(pathlib_cwd, file_specification_template, excel_metadata)

        # Initialize uploader.
        uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files)

        # Get organized files from the uploader (plugin developer's custom implementation)
        # or use the default organization method if uploader doesn't provide valid files
        organized_files = uploader.handle_upload_files()

        # Validate the organized files
        if not self._validate_organized_files(organized_files, file_specification_template):
            self.run.log_message('Validation failed.', context=Context.ERROR.value)
            raise ActionError('Upload is aborted due to validation errors.')

        # Upload files to synapse-backend.
        if not organized_files:
            self.run.log_message('Files not found on the path.', context=Context.WARNING.value)
            raise ActionError('Upload is aborted due to missing files.')
        uploaded_files = self._upload_files(organized_files)
        result['uploaded_files_count'] = len(uploaded_files)

        # Generate data units for the uploaded data.
        if not uploaded_files:
            self.run.log_message('No files were uploaded.', context=Context.WARNING.value)
            raise ActionError('Upload is aborted due to no uploaded files.')
        generated_data_units = self._generate_data_units(uploaded_files)
        result['generated_data_units_count'] = len(generated_data_units)

        # Setup task with uploaded synapse-backend data units.
        if not generated_data_units:
            self.run.log_message('No data units were generated.', context=Context.WARNING.value)
            raise ActionError('Upload is aborted due to no generated data units.')

        self.run.log_message('Import completed.')
        return result

    def _analyze_collection(self) -> Dict[str, Any]:
        """Analyze Synapse Collection Specifications.

        Returns:
            Dict: The file specifications of the collection.
        """

        # Initialize progress
        self.run.set_progress(0, 2, category='analyze_collection')

        collection_id = self.params['data_collection']
        self.run.set_progress(1, 2, category='analyze_collection')

        collection = self.run.client.get_data_collection(collection_id)
        self.run.set_progress(2, 2, category='analyze_collection')

        return collection['file_specifications']

    def _upload_files(self, organized_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Upload files to synapse-backend.

        Returns:
            Dict: The result of the upload.
        """
        # Initialize progress
        organized_files_count = len(organized_files)
        self.run.set_progress(0, organized_files_count, category='upload_data_files')
        self.run.log_message('Uploading data files...')

        client = self.run.client
        collection_id = self.params['data_collection']
        upload_result = []
        current_progress = 0
        success_count = 0
        failed_count = 0

        # Initialize metrics
        self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')

        for organized_file in organized_files:
            try:
                uploaded_data_file = client.upload_data_file(organized_file, collection_id)
                self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
                success_count += 1
                upload_result.append(uploaded_data_file)
            except Exception as e:
                self.run.log_data_file(organized_file, UploadStatus.FAILED)
                self.run.log_message(f'Failed to upload file: {str(e)}')
                failed_count += 1

            current_progress += 1
            self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
            self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')

        # Finish progress
        self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')

        return upload_result

    def _generate_data_units(self, uploaded_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Generate data units for the uploaded data.

        TODO: make batch size configurable.

        Returns:
            Dict: The result of the generate data units process.
        """
        # Initialize progress
        upload_result_count = len(uploaded_files)
        self.run.set_progress(0, upload_result_count, category='generate_data_units')
        self.run.log_message('Generating data units...')

        client = self.run.client
        generated_data_units = []
        current_progress = 0
        success_count = 0
        failed_count = 0

        batches = get_batched_list(uploaded_files, 100)
        batches_count = len(batches)

        # Initialize metrics
        self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')

        for batch in batches:
            try:
                created_data_units = client.create_data_units(batch)
                success_count += len(created_data_units)
                generated_data_units.append(created_data_units)
                for created_data_unit in created_data_units:
                    self.run.log_data_unit(
                        created_data_unit['id'], UploadStatus.SUCCESS, data_unit_meta=created_data_unit.get('meta')
                    )
            except Exception as e:
                failed_count += len(batch)
                self.run.log_message(f'Failed to create data units batch: {str(e)}')
                for _ in batch:
                    self.run.log_data_unit(None, UploadStatus.FAILED, data_unit_meta=None)

            current_progress += 1
            self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')
            self.run.set_progress(current_progress, batches_count, category='generate_data_units')

        # Finish progress
        self.run.set_progress(upload_result_count, upload_result_count, category='generate_data_units')

        return sum(generated_data_units, [])

    def _validate_organized_files(
        self, organized_files: List[Dict[str, Any]], file_specification_template: Dict[str, Any]
    ) -> bool:
        """Validate organized files from Uploader."""
        validator = FileSpecificationValidator(file_specification_template, organized_files)
        return validator.validate()

    def _organize_files(
        self,
        directory: Path,
        file_specification: List[Dict[str, Any]],
        excel_metadata: Optional[Dict[str, Dict[str, Any]]] = None,
    ) -> List[Dict[str, Any]]:
        """Organize files according to the file specification.

        This method handles type-based directory structure where files are organized in
        directories named after file types (e.g., 'image_1/' directory contains image files
        like '1.jpg', '2.jpg'). For each dataset ID found in the primary directory, it attempts
        to find corresponding files in all type directories.

        TODO: Add Logic to handle file specific name patterns and extensions.
            (e.g. pcd:S_DCH_230725_0156_LR_037.pcd, image_1:S_DCH_230725_0156_FC_037,
            image_2:S_DCH_230725_0156_LF_037.jpg)

        Args:
            directory (Path): Root directory containing files to organize.
            file_specification (List[Dict[str, Any]]): File specification list with metadata.
            excel_metadata (Optional[Dict[str, Dict[str, Any]]]): Dictionary mapping file names
                to their metadata key-value pairs from excel file.

        Returns:
            List[Dict[str, Any]]: List of dictionaries containing organized files with metadata.
        """
        organized_files: List[Dict[str, Any]] = []

        # Check for type-based directory structure (e.g., image_1/, pcd_1/)
        type_dirs: Dict[str, Path] = {}

        for spec in file_specification:
            spec_name = spec['name']
            spec_dir = directory / spec_name
            if spec_dir.exists() and spec_dir.is_dir():
                type_dirs[spec_name] = spec_dir

        if type_dirs:
            self.run.log_message(f'Found type directories: {list(type_dirs.keys())}')

        # If type-based directories don't exist, exit early
        if not type_dirs:
            self.run.log_message('No type-based directory structure found.', context=Context.INFO.value)
            return organized_files

        self.run.log_message('Detected type-based directory structure')

        # Collect and process files in a single pass
        dataset_files = {}
        required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]

        # Process all files from all type directories
        for spec_name, dir_path in type_dirs.items():
            for file_path in dir_path.glob('*'):
                if file_path.is_file():
                    file_name = file_path.stem

                    # Initialize dataset entry if it doesn't exist
                    if file_name not in dataset_files:
                        dataset_files[file_name] = {}

                    # Map this file to its specification (handle duplicates)
                    if spec_name not in dataset_files[file_name]:
                        dataset_files[file_name][spec_name] = file_path
                    else:
                        existing_file = dataset_files[file_name][spec_name]
                        if file_path.stat().st_mtime > existing_file.stat().st_mtime:
                            dataset_files[file_name][spec_name] = file_path

        if not dataset_files:
            self.run.log_message('No files found.', context=Context.WARNING.value)
            return organized_files

        self.run.log_message(f'Discovered {len(dataset_files)} files')

        # Organize datasets - check requirements and create metadata
        for file_name, files_dict in sorted(dataset_files.items()):
            if all(req in files_dict for req in required_specs):
                # Get most common file extension
                file_extensions = {}
                for file_path in files_dict.values():
                    ext = file_path.suffix.lower()
                    if ext:
                        file_extensions[ext] = file_extensions.get(ext, 0) + 1

                origin_file_extension = max(file_extensions.items(), key=lambda x: x[1])[0] if file_extensions else ''

                # Create metadata for this dataset
                meta_data: Dict[str, Any] = {
                    'origin_file_stem': file_name,
                    'origin_file_extension': origin_file_extension,
                    'created_at': datetime.now().isoformat(),
                }

                # Add excel metadata if available
                if excel_metadata and file_name in excel_metadata:
                    meta_data.update(excel_metadata[file_name])

                # Add the organized dataset
                organized_files.append({'files': files_dict, 'meta': meta_data})
            else:
                missing = [req for req in required_specs if req not in files_dict]
                self.run.log_message(
                    f'{file_name} missing required files: {", ".join(missing)}',
                    context=Context.WARNING.value,
                )

        return organized_files

    def _update_metrics(self, total_count: int, success_count: int, failed_count: int, category: str):
        """Update metrics for upload progress.

        Args:
            total_count (int): Total number of items to process.
            success_count (int): Number of successfully processed items.
            failed_count (int): Number of failed items.
            category (str): The category of the metrics.
        """
        if not self.run:
            raise ValueError('Run instance not properly initialized')

        # Type assertion to help the linter
        assert isinstance(self.run, UploadRun)

        metrics = self.run.MetricsRecord(
            stand_by=total_count - success_count - failed_count, failed=failed_count, success=success_count
        )
        self.run.log_metrics(metrics, category)
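
For context, here is a minimal, self-contained sketch (not part of the package) of the Path-aware JSON encoding pattern the removed upload.py used when serializing data-file info for its logs. The encoder body mirrors the class shown above; the sample payload and values are purely illustrative.

import json
from datetime import datetime
from pathlib import Path


class PathAwareJSONEncoder(json.JSONEncoder):
    """Mirror of the removed encoder: stringify Path-like objects, ISO-format datetimes."""

    def default(self, obj):
        if hasattr(obj, '__fspath__') or hasattr(obj, 'as_posix'):
            return str(obj)
        if hasattr(obj, 'isoformat'):
            return obj.isoformat()
        return super().default(obj)


# Illustrative payload; the real call passes the organized-file dict built by _organize_files.
payload = {'files': {'image_1': Path('/data/image_1/0001.jpg')}, 'created_at': datetime(2025, 12, 3)}
print(json.dumps(payload, ensure_ascii=False, cls=PathAwareJSONEncoder))
# {"files": {"image_1": "/data/image_1/0001.jpg"}, "created_at": "2025-12-03T00:00:00"}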
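
Similarly, a hedged sketch of the grouping rule implemented by the removed _organize_files: files found under type directories (named after file-spec entries) are matched by stem, and a dataset is kept only when every required spec is present. The spec names, directory layout, and stems below are hypothetical.

from pathlib import Path

# Hypothetical file specification and discovered paths; the real code collects paths with dir_path.glob('*').
file_specification = [{'name': 'image_1', 'is_required': True}, {'name': 'pcd', 'is_required': False}]
discovered = {
    'image_1': [Path('image_1/0001.jpg'), Path('image_1/0002.jpg')],
    'pcd': [Path('pcd/0001.pcd')],
}

# Group files by stem across the type directories.
dataset_files = {}
for spec_name, paths in discovered.items():
    for path in paths:
        dataset_files.setdefault(path.stem, {})[spec_name] = path

# Keep only datasets that contain every required spec, mirroring the requirement check above.
required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]
organized_files = [
    {'files': files, 'meta': {'origin_file_stem': stem}}
    for stem, files in sorted(dataset_files.items())
    if all(req in files for req in required_specs)
]
print([entry['meta']['origin_file_stem'] for entry in organized_files])  # ['0001', '0002']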