synapse-sdk 1.0.0b5__py3-none-any.whl → 2025.12.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167)
  1. synapse_sdk/__init__.py +24 -0
  2. synapse_sdk/cli/code_server.py +305 -33
  3. synapse_sdk/clients/agent/__init__.py +2 -1
  4. synapse_sdk/clients/agent/container.py +143 -0
  5. synapse_sdk/clients/agent/ray.py +296 -38
  6. synapse_sdk/clients/backend/annotation.py +1 -1
  7. synapse_sdk/clients/backend/core.py +31 -4
  8. synapse_sdk/clients/backend/data_collection.py +82 -7
  9. synapse_sdk/clients/backend/hitl.py +1 -1
  10. synapse_sdk/clients/backend/ml.py +1 -1
  11. synapse_sdk/clients/base.py +211 -61
  12. synapse_sdk/loggers.py +46 -0
  13. synapse_sdk/plugins/README.md +1340 -0
  14. synapse_sdk/plugins/categories/base.py +59 -9
  15. synapse_sdk/plugins/categories/export/actions/__init__.py +3 -0
  16. synapse_sdk/plugins/categories/export/actions/export/__init__.py +28 -0
  17. synapse_sdk/plugins/categories/export/actions/export/action.py +165 -0
  18. synapse_sdk/plugins/categories/export/actions/export/enums.py +113 -0
  19. synapse_sdk/plugins/categories/export/actions/export/exceptions.py +53 -0
  20. synapse_sdk/plugins/categories/export/actions/export/models.py +74 -0
  21. synapse_sdk/plugins/categories/export/actions/export/run.py +195 -0
  22. synapse_sdk/plugins/categories/export/actions/export/utils.py +187 -0
  23. synapse_sdk/plugins/categories/export/templates/config.yaml +19 -1
  24. synapse_sdk/plugins/categories/export/templates/plugin/__init__.py +390 -0
  25. synapse_sdk/plugins/categories/export/templates/plugin/export.py +153 -177
  26. synapse_sdk/plugins/categories/neural_net/actions/train.py +1130 -32
  27. synapse_sdk/plugins/categories/neural_net/actions/tune.py +157 -4
  28. synapse_sdk/plugins/categories/neural_net/templates/config.yaml +7 -4
  29. synapse_sdk/plugins/categories/pre_annotation/actions/__init__.py +4 -0
  30. synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/__init__.py +3 -0
  31. synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/action.py +10 -0
  32. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/__init__.py +28 -0
  33. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/action.py +148 -0
  34. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/enums.py +269 -0
  35. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/exceptions.py +14 -0
  36. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/factory.py +76 -0
  37. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/models.py +100 -0
  38. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/orchestrator.py +248 -0
  39. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/run.py +64 -0
  40. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/__init__.py +17 -0
  41. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/annotation.py +265 -0
  42. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/base.py +170 -0
  43. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/extraction.py +83 -0
  44. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/metrics.py +92 -0
  45. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/preprocessor.py +243 -0
  46. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/validation.py +143 -0
  47. synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +19 -0
  48. synapse_sdk/plugins/categories/upload/actions/upload/action.py +236 -0
  49. synapse_sdk/plugins/categories/upload/actions/upload/context.py +185 -0
  50. synapse_sdk/plugins/categories/upload/actions/upload/enums.py +493 -0
  51. synapse_sdk/plugins/categories/upload/actions/upload/exceptions.py +36 -0
  52. synapse_sdk/plugins/categories/upload/actions/upload/factory.py +138 -0
  53. synapse_sdk/plugins/categories/upload/actions/upload/models.py +214 -0
  54. synapse_sdk/plugins/categories/upload/actions/upload/orchestrator.py +183 -0
  55. synapse_sdk/plugins/categories/upload/actions/upload/registry.py +113 -0
  56. synapse_sdk/plugins/categories/upload/actions/upload/run.py +179 -0
  57. synapse_sdk/plugins/categories/upload/actions/upload/steps/__init__.py +1 -0
  58. synapse_sdk/plugins/categories/upload/actions/upload/steps/base.py +107 -0
  59. synapse_sdk/plugins/categories/upload/actions/upload/steps/cleanup.py +62 -0
  60. synapse_sdk/plugins/categories/upload/actions/upload/steps/collection.py +63 -0
  61. synapse_sdk/plugins/categories/upload/actions/upload/steps/generate.py +91 -0
  62. synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py +82 -0
  63. synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py +235 -0
  64. synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py +201 -0
  65. synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py +104 -0
  66. synapse_sdk/plugins/categories/upload/actions/upload/steps/validate.py +71 -0
  67. synapse_sdk/plugins/categories/upload/actions/upload/strategies/__init__.py +1 -0
  68. synapse_sdk/plugins/categories/upload/actions/upload/strategies/base.py +82 -0
  69. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/__init__.py +1 -0
  70. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/batch.py +39 -0
  71. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/single.py +29 -0
  72. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/__init__.py +1 -0
  73. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py +300 -0
  74. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py +287 -0
  75. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/__init__.py +1 -0
  76. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/excel.py +174 -0
  77. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/none.py +16 -0
  78. synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/__init__.py +1 -0
  79. synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/sync.py +84 -0
  80. synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/__init__.py +1 -0
  81. synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py +60 -0
  82. synapse_sdk/plugins/categories/upload/actions/upload/utils.py +250 -0
  83. synapse_sdk/plugins/categories/upload/templates/README.md +470 -0
  84. synapse_sdk/plugins/categories/upload/templates/config.yaml +28 -2
  85. synapse_sdk/plugins/categories/upload/templates/plugin/__init__.py +310 -0
  86. synapse_sdk/plugins/categories/upload/templates/plugin/upload.py +82 -20
  87. synapse_sdk/plugins/models.py +111 -9
  88. synapse_sdk/plugins/templates/plugin-config-schema.json +7 -0
  89. synapse_sdk/plugins/templates/schema.json +7 -0
  90. synapse_sdk/plugins/utils/__init__.py +3 -0
  91. synapse_sdk/plugins/utils/ray_gcs.py +66 -0
  92. synapse_sdk/shared/__init__.py +25 -0
  93. synapse_sdk/utils/converters/dm/__init__.py +42 -41
  94. synapse_sdk/utils/converters/dm/base.py +137 -0
  95. synapse_sdk/utils/converters/dm/from_v1.py +208 -562
  96. synapse_sdk/utils/converters/dm/to_v1.py +258 -304
  97. synapse_sdk/utils/converters/dm/tools/__init__.py +214 -0
  98. synapse_sdk/utils/converters/dm/tools/answer.py +95 -0
  99. synapse_sdk/utils/converters/dm/tools/bounding_box.py +132 -0
  100. synapse_sdk/utils/converters/dm/tools/bounding_box_3d.py +121 -0
  101. synapse_sdk/utils/converters/dm/tools/classification.py +75 -0
  102. synapse_sdk/utils/converters/dm/tools/keypoint.py +117 -0
  103. synapse_sdk/utils/converters/dm/tools/named_entity.py +111 -0
  104. synapse_sdk/utils/converters/dm/tools/polygon.py +122 -0
  105. synapse_sdk/utils/converters/dm/tools/polyline.py +124 -0
  106. synapse_sdk/utils/converters/dm/tools/prompt.py +94 -0
  107. synapse_sdk/utils/converters/dm/tools/relation.py +86 -0
  108. synapse_sdk/utils/converters/dm/tools/segmentation.py +141 -0
  109. synapse_sdk/utils/converters/dm/tools/segmentation_3d.py +83 -0
  110. synapse_sdk/utils/converters/dm/types.py +168 -0
  111. synapse_sdk/utils/converters/dm/utils.py +162 -0
  112. synapse_sdk/utils/converters/dm_legacy/__init__.py +56 -0
  113. synapse_sdk/utils/converters/dm_legacy/from_v1.py +627 -0
  114. synapse_sdk/utils/converters/dm_legacy/to_v1.py +367 -0
  115. synapse_sdk/utils/file/__init__.py +58 -0
  116. synapse_sdk/utils/file/archive.py +32 -0
  117. synapse_sdk/utils/file/checksum.py +56 -0
  118. synapse_sdk/utils/file/chunking.py +31 -0
  119. synapse_sdk/utils/file/download.py +385 -0
  120. synapse_sdk/utils/file/encoding.py +40 -0
  121. synapse_sdk/utils/file/io.py +22 -0
  122. synapse_sdk/utils/file/upload.py +165 -0
  123. synapse_sdk/utils/file/video/__init__.py +29 -0
  124. synapse_sdk/utils/file/video/transcode.py +307 -0
  125. synapse_sdk/utils/{file.py → file.py.backup} +77 -0
  126. synapse_sdk/utils/network.py +272 -0
  127. synapse_sdk/utils/storage/__init__.py +6 -2
  128. synapse_sdk/utils/storage/providers/file_system.py +6 -0
  129. {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/METADATA +19 -2
  130. {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/RECORD +134 -74
  131. synapse_sdk/devtools/docs/.gitignore +0 -20
  132. synapse_sdk/devtools/docs/README.md +0 -41
  133. synapse_sdk/devtools/docs/blog/2019-05-28-first-blog-post.md +0 -12
  134. synapse_sdk/devtools/docs/blog/2019-05-29-long-blog-post.md +0 -44
  135. synapse_sdk/devtools/docs/blog/2021-08-01-mdx-blog-post.mdx +0 -24
  136. synapse_sdk/devtools/docs/blog/2021-08-26-welcome/docusaurus-plushie-banner.jpeg +0 -0
  137. synapse_sdk/devtools/docs/blog/2021-08-26-welcome/index.md +0 -29
  138. synapse_sdk/devtools/docs/blog/authors.yml +0 -25
  139. synapse_sdk/devtools/docs/blog/tags.yml +0 -19
  140. synapse_sdk/devtools/docs/docusaurus.config.ts +0 -138
  141. synapse_sdk/devtools/docs/package-lock.json +0 -17455
  142. synapse_sdk/devtools/docs/package.json +0 -47
  143. synapse_sdk/devtools/docs/sidebars.ts +0 -44
  144. synapse_sdk/devtools/docs/src/components/HomepageFeatures/index.tsx +0 -71
  145. synapse_sdk/devtools/docs/src/components/HomepageFeatures/styles.module.css +0 -11
  146. synapse_sdk/devtools/docs/src/css/custom.css +0 -30
  147. synapse_sdk/devtools/docs/src/pages/index.module.css +0 -23
  148. synapse_sdk/devtools/docs/src/pages/index.tsx +0 -21
  149. synapse_sdk/devtools/docs/src/pages/markdown-page.md +0 -7
  150. synapse_sdk/devtools/docs/static/.nojekyll +0 -0
  151. synapse_sdk/devtools/docs/static/img/docusaurus-social-card.jpg +0 -0
  152. synapse_sdk/devtools/docs/static/img/docusaurus.png +0 -0
  153. synapse_sdk/devtools/docs/static/img/favicon.ico +0 -0
  154. synapse_sdk/devtools/docs/static/img/logo.png +0 -0
  155. synapse_sdk/devtools/docs/static/img/undraw_docusaurus_mountain.svg +0 -171
  156. synapse_sdk/devtools/docs/static/img/undraw_docusaurus_react.svg +0 -170
  157. synapse_sdk/devtools/docs/static/img/undraw_docusaurus_tree.svg +0 -40
  158. synapse_sdk/devtools/docs/tsconfig.json +0 -8
  159. synapse_sdk/plugins/categories/export/actions/export.py +0 -346
  160. synapse_sdk/plugins/categories/export/enums.py +0 -7
  161. synapse_sdk/plugins/categories/neural_net/actions/gradio.py +0 -151
  162. synapse_sdk/plugins/categories/pre_annotation/actions/to_task.py +0 -943
  163. synapse_sdk/plugins/categories/upload/actions/upload.py +0 -954
  164. {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/WHEEL +0 -0
  165. {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/entry_points.txt +0 -0
  166. {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/licenses/LICENSE +0 -0
  167. {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/top_level.txt +0 -0
@@ -1,954 +0,0 @@
1
- import json
2
- import os
3
- from datetime import datetime
4
- from enum import Enum
5
- from io import BytesIO
6
- from pathlib import Path
7
- from typing import Annotated, Any, Dict, List, Optional
8
-
9
- from openpyxl import load_workbook
10
- from openpyxl.utils.exceptions import InvalidFileException
11
- from pydantic import AfterValidator, BaseModel, field_validator
12
- from pydantic_core import PydanticCustomError
13
-
14
- from synapse_sdk.clients.exceptions import ClientError
15
- from synapse_sdk.clients.utils import get_batched_list
16
- from synapse_sdk.clients.validators.collections import FileSpecificationValidator
17
- from synapse_sdk.i18n import gettext as _
18
- from synapse_sdk.plugins.categories.base import Action
19
- from synapse_sdk.plugins.categories.decorators import register_action
20
- from synapse_sdk.plugins.enums import PluginCategory, RunMethod
21
- from synapse_sdk.plugins.exceptions import ActionError
22
- from synapse_sdk.plugins.models import Run
23
- from synapse_sdk.shared.enums import Context
24
- from synapse_sdk.utils.pydantic.validators import non_blank
25
- from synapse_sdk.utils.storage import get_pathlib
26
-
27
-
28
- class PathAwareJSONEncoder(json.JSONEncoder):
29
- """Custom JSON encoder that handles Path-like objects."""
30
-
31
- def default(self, obj):
32
- if hasattr(obj, '__fspath__') or hasattr(obj, 'as_posix'):
33
- # Handle Path-like objects (including UPath, SFTPPath, pathlib.Path, etc.)
34
- return str(obj)
35
- elif hasattr(obj, 'isoformat'):
36
- # Handle datetime objects
37
- return obj.isoformat()
38
- # Let the base class handle other types
39
- return super().default(obj)
40
-
41
-
42
- class UploadStatus(str, Enum):
43
- SUCCESS = 'success'
44
- FAILED = 'failed'
45
-
46
-
47
- class UploadRun(Run):
48
- class DataFileLog(BaseModel):
49
- """Data file log model."""
50
-
51
- data_file_info: str | None
52
- status: UploadStatus
53
- created: str
54
-
55
- class DataUnitLog(BaseModel):
56
- """Data unit log model."""
57
-
58
- data_unit_id: int | None
59
- status: UploadStatus
60
- created: str
61
- data_unit_meta: dict | None
62
-
63
- class TaskLog(BaseModel):
64
- """Task log model."""
65
-
66
- task_id: int | None
67
- status: UploadStatus
68
- created: str
69
-
70
- class MetricsRecord(BaseModel):
71
- """Metrics record model."""
72
-
73
- stand_by: int
74
- failed: int
75
- success: int
76
-
77
- def log_data_file(self, data_file_info: dict, status: UploadStatus):
78
- """Upload data_file log.
79
-
80
- Args:
81
- data_file_info (dict): The json info of the data file.
82
- status (UploadStatus): The status of the data file.
83
- """
84
- now = datetime.now().isoformat()
85
- # Use custom JSON encoder to handle Path-like objects
86
- data_file_info_str = json.dumps(data_file_info, ensure_ascii=False, cls=PathAwareJSONEncoder)
87
- self.log(
88
- 'upload_data_file',
89
- self.DataFileLog(data_file_info=data_file_info_str, status=status.value, created=now).model_dump(),
90
- )
91
-
92
- def log_data_unit(self, data_unit_id: int, status: UploadStatus, data_unit_meta: dict | None = None):
93
- """Upload data_unit log.
94
-
95
- Args:
96
- data_unit_id (int): The ID of the data unit.
97
- status (UploadStatus): The status of the data unit.
98
- data_unit_meta (dict | None): The metadata of the data unit.
99
- """
100
- now = datetime.now().isoformat()
101
- self.log(
102
- 'upload_data_unit',
103
- self.DataUnitLog(
104
- data_unit_id=data_unit_id, status=status.value, created=now, data_unit_meta=data_unit_meta
105
- ).model_dump(),
106
- )
107
-
108
- def log_task(self, task_id: int, status: UploadStatus):
109
- """Upload task log.
110
-
111
- Args:
112
- task_id (int): The ID of the task.
113
- status (UploadStatus): The status of the task.
114
- """
115
- now = datetime.now().isoformat()
116
- self.log('upload_task', self.TaskLog(task_id=task_id, status=status.value, created=now).model_dump())
117
-
118
- def log_metrics(self, record: MetricsRecord, category: str):
119
- """Log upload metrics.
120
- Args:
121
- record (MetricsRecord): The metrics record to log.
122
- category (str): The category of the metrics.
123
- """
124
- record = self.MetricsRecord.model_validate(record)
125
- self.set_metrics(value=record.model_dump(), category=category)
126
-
127
-
128
- class ExcelSecurityError(Exception):
129
- """Custom exception for Excel security validation errors."""
130
-
131
- pass
132
-
133
-
134
- class ExcelParsingError(Exception):
135
- """Custom exception for Excel parsing errors."""
136
-
137
- pass
138
-
139
-
140
- class ExcelSecurityConfig:
141
- """Configuration class for Excel security settings."""
142
-
143
- def __init__(self):
144
- # File size limits
145
- self.MAX_FILE_SIZE_MB = int(os.getenv('EXCEL_MAX_FILE_SIZE_MB', '10'))
146
- self.MAX_FILE_SIZE_BYTES = self.MAX_FILE_SIZE_MB * 1024 * 1024
147
-
148
- # Memory limits
149
- self.MAX_MEMORY_USAGE_MB = int(os.getenv('EXCEL_MAX_MEMORY_MB', '30'))
150
- self.MAX_MEMORY_USAGE_BYTES = self.MAX_MEMORY_USAGE_MB * 1024 * 1024
151
-
152
- # Content limits
153
- self.MAX_ROWS = int(os.getenv('EXCEL_MAX_ROWS', '10000'))
154
- self.MAX_COLUMNS = int(os.getenv('EXCEL_MAX_COLUMNS', '50'))
155
-
156
- # String length limits
157
- self.MAX_FILENAME_LENGTH = int(os.getenv('EXCEL_MAX_FILENAME_LENGTH', '255'))
158
- self.MAX_COLUMN_NAME_LENGTH = int(os.getenv('EXCEL_MAX_COLUMN_NAME_LENGTH', '100'))
159
- self.MAX_METADATA_VALUE_LENGTH = int(os.getenv('EXCEL_MAX_METADATA_VALUE_LENGTH', '1000'))
160
-
161
-
162
- class ExcelMetadataUtils:
163
- """Utility class for Excel metadata processing with shared validation logic."""
164
-
165
- def __init__(self, config: ExcelSecurityConfig):
166
- self.config = config
167
-
168
- def validate_and_truncate_string(self, value: str, max_length: int) -> str:
169
- """Validate and truncate string to specified maximum length.
170
-
171
- Args:
172
- value: String value to validate and truncate
173
- max_length: Maximum allowed length
174
-
175
- Returns:
176
- str: Validated and potentially truncated string
177
- """
178
- if not isinstance(value, str):
179
- value = str(value)
180
-
181
- value = value.strip()
182
-
183
- if len(value) > max_length:
184
- return value[:max_length]
185
-
186
- return value
187
-
188
- def is_valid_filename_length(self, filename: str) -> bool:
189
- """Check if filename length is within acceptable limits.
190
-
191
- Args:
192
- filename: Filename to check
193
-
194
- Returns:
195
- bool: True if filename length is acceptable
196
- """
197
- return len(filename.strip()) <= self.config.MAX_FILENAME_LENGTH
198
-
199
-
200
- class UploadParams(BaseModel):
201
- """Upload action parameters.
202
-
203
- Args:
204
- name (str): The name of the action.
205
- description (str | None): The description of the action.
206
- checkpoint (int | None): The checkpoint of the action.
207
- path (str): The path of the action.
208
- storage (int): The storage of the action.
209
- collection (int): The collection of the action.
210
- project (int | None): The project of the action.
211
- excel_metadata_path (str | None): Path to excel file containing metadata.
212
- Defaults to 'meta.xlsx' or 'meta.xls' in the path directory.
213
- """
214
-
215
- name: Annotated[str, AfterValidator(non_blank)]
216
- description: str | None
217
- path: str
218
- storage: int
219
- collection: int
220
- project: int | None
221
- excel_metadata_path: str | None = None
222
-
223
- @field_validator('storage', mode='before')
224
- @classmethod
225
- def check_storage_exists(cls, value: str, info) -> str:
226
- """Validate synapse-backend storage exists.
227
-
228
- TODO: Need to define validation method naming convention.
229
- TODO: Need to make validation method reusable.
230
- """
231
- action = info.context['action']
232
- client = action.client
233
- try:
234
- client.get_storage(value)
235
- except ClientError:
236
- raise PydanticCustomError('client_error', _('Error occurred while checking storage exists.'))
237
- return value
238
-
239
- @field_validator('collection', mode='before')
240
- @classmethod
241
- def check_collection_exists(cls, value: str, info) -> str:
242
- """Validate synapse-backend collection exists."""
243
- action = info.context['action']
244
- client = action.client
245
- try:
246
- client.get_data_collection(value)
247
- except ClientError:
248
- raise PydanticCustomError('client_error', _('Error occurred while checking collection exists.'))
249
- return value
250
-
251
- @field_validator('project', mode='before')
252
- @classmethod
253
- def check_project_exists(cls, value: str, info) -> str:
254
- """Validate synapse-backend project exists."""
255
- if not value:
256
- return value
257
-
258
- action = info.context['action']
259
- client = action.client
260
- try:
261
- client.get_project(value)
262
- except ClientError:
263
- raise PydanticCustomError('client_error', _('Error occurred while checking project exists.'))
264
- return value
265
-
266
- @field_validator('excel_metadata_path', mode='before')
267
- @classmethod
268
- def check_excel_metadata_path(cls, value: str, info) -> str:
269
- """Validate excel metadata file exists and is secure if provided.
270
-
271
- This validator performs comprehensive security checks including:
272
- - File existence and format validation
273
- - File size limits (max 10MB)
274
- - Basic security checks for file content
275
-
276
- Args:
277
- value: The excel file path to validate
278
- info: Validation context information
279
-
280
- Returns:
281
- str: The validated file path
282
-
283
- Raises:
284
- PydanticCustomError: If validation fails
285
- """
286
- if not value:
287
- return value
288
-
289
- excel_path = Path(value)
290
-
291
- # Check file existence
292
- if not excel_path.exists():
293
- raise PydanticCustomError('file_not_found', _('Excel metadata file not found.'))
294
-
295
- # Check file extension
296
- if excel_path.suffix.lower() not in ['.xlsx', '.xls']:
297
- raise PydanticCustomError('invalid_file_type', _('Excel metadata file must be .xlsx or .xls format.'))
298
-
299
- # Security check: file size limit
300
- file_size = excel_path.stat().st_size
301
- excel_config = ExcelSecurityConfig()
302
- if file_size > excel_config.MAX_FILE_SIZE_BYTES:
303
- raise PydanticCustomError(
304
- 'file_too_large',
305
- _('Excel metadata file is too large. Maximum size is {}MB.').format(excel_config.MAX_FILE_SIZE_MB),
306
- )
307
-
308
- # Basic security check: ensure file is readable and not corrupted
309
- try:
310
- with open(excel_path, 'rb') as f:
311
- # Read first few bytes to check if it's a valid Excel file
312
- header = f.read(8)
313
- if not header:
314
- raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be empty.'))
315
-
316
- # Check for valid Excel file signatures
317
- if excel_path.suffix.lower() == '.xlsx':
318
- # XLSX files start with PK (ZIP signature)
319
- if not header.startswith(b'PK'):
320
- raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be corrupted.'))
321
- elif excel_path.suffix.lower() == '.xls':
322
- # XLS files have specific OLE signatures
323
- if not (header.startswith(b'\xd0\xcf\x11\xe0') or header.startswith(b'\x09\x08')):
324
- raise PydanticCustomError('invalid_file', _('Excel metadata file appears to be corrupted.'))
325
-
326
- except (OSError, IOError):
327
- raise PydanticCustomError('file_access_error', _('Cannot access Excel metadata file.'))
328
-
329
- return value
330
-
331
-
332
- @register_action
333
- class UploadAction(Action):
334
- """Upload action class.
335
-
336
- Attrs:
337
- name (str): The name of the action.
338
- category (PluginCategory): The category of the action.
339
- method (RunMethod): The method to run of the action.
340
-
341
- Progress Categories:
342
- analyze_collection: The progress category for the analyze collection process.
343
- data_file_upload: The progress category for the upload process.
344
- generate_data_units: The progress category for the generate data units process.
345
- generate_tasks: The progress category for the generate tasks process.
346
- generate_ground_truths: The progress category for the generate ground truths process.
347
-
348
- Metrics Categories:
349
- data_file: The metrics category for the data file.
350
- data_unit: The metrics category for the data unit.
351
- """
352
-
353
- name = 'upload'
354
- category = PluginCategory.UPLOAD
355
- method = RunMethod.JOB
356
- run_class = UploadRun
357
- progress_categories = {
358
- 'analyze_collection': {
359
- 'proportion': 2,
360
- },
361
- 'upload_data_files': {
362
- 'proportion': 38,
363
- },
364
- 'generate_data_units': {
365
- 'proportion': 60,
366
- },
367
- }
368
- metrics_categories = {
369
- 'data_files': {
370
- 'stand_by': 0,
371
- 'failed': 0,
372
- 'success': 0,
373
- },
374
- 'data_units': {
375
- 'stand_by': 0,
376
- 'failed': 0,
377
- 'success': 0,
378
- },
379
- }
380
-
381
- def __init__(self, *args, **kwargs):
382
- super().__init__(*args, **kwargs)
383
- self.excel_config = ExcelSecurityConfig()
384
- self.excel_utils = ExcelMetadataUtils(self.excel_config)
385
-
386
- def get_uploader(self, path, file_specification, organized_files):
387
- """Get uploader from entrypoint."""
388
- return self.entrypoint(self.run, path, file_specification, organized_files)
389
-
390
- def _validate_excel_security(self, excel_path: Path) -> None:
391
- """Validate Excel file security constraints.
392
-
393
- Performs comprehensive security validation including:
394
- - File size limits
395
- - Memory usage constraints
396
- - Basic malicious content detection
397
-
398
- Args:
399
- excel_path: Path to the Excel file to validate
400
-
401
- Raises:
402
- ExcelSecurityError: If security validation fails
403
- """
404
- # File size check (already done in validator, but double-check)
405
- file_size = excel_path.stat().st_size
406
- if file_size > self.excel_config.MAX_FILE_SIZE_BYTES:
407
- raise ExcelSecurityError(
408
- f'Excel file too large: {file_size} bytes (max: {self.excel_config.MAX_FILE_SIZE_BYTES})'
409
- )
410
-
411
- # Memory usage estimation (rough estimate: file_size * 3 for processing)
412
- estimated_memory = file_size * 3
413
- if estimated_memory > self.excel_config.MAX_MEMORY_USAGE_BYTES:
414
- raise ExcelSecurityError(
415
- f'Excel file may consume too much memory: ~{estimated_memory} bytes '
416
- f'(max: {self.excel_config.MAX_MEMORY_USAGE_BYTES})'
417
- )
418
-
419
- def _prepare_excel_file(self, excel_path: Path) -> BytesIO:
420
- """Prepare Excel file for reading with security validation.
421
-
422
- Args:
423
- excel_path: Path to the Excel file
424
-
425
- Returns:
426
- BytesIO: Excel file stream ready for reading
427
-
428
- Raises:
429
- ExcelSecurityError: If security validation fails
430
- """
431
- self._validate_excel_security(excel_path)
432
- excel_bytes = excel_path.read_bytes()
433
- return BytesIO(excel_bytes)
434
-
435
- def _process_excel_headers(self, headers: tuple) -> tuple:
436
- """Process and validate Excel headers.
437
-
438
- Args:
439
- headers: Raw header tuple from Excel
440
-
441
- Returns:
442
- tuple: Validated headers
443
-
444
- Raises:
445
- ExcelParsingError: If headers are invalid
446
- """
447
- if len(headers) < 2:
448
- raise ExcelParsingError('Excel file must have at least 2 columns (file name and metadata)')
449
- self._validate_excel_content(headers, 0) # Validate column count
450
- return headers
451
-
452
- def _process_excel_data_row(self, row: tuple, headers: tuple) -> Optional[Dict[str, Any]]:
453
- """Process a single Excel data row.
454
-
455
- Args:
456
- row: Raw row data from Excel
457
- headers: Excel headers
458
-
459
- Returns:
460
- Optional[Dict[str, Any]]: Processed row data or None if row should be skipped
461
- """
462
- # Skip empty rows
463
- if not row[0] or str(row[0]).strip() == '':
464
- return None
465
-
466
- file_name = str(row[0]).strip()
467
- if not self.excel_utils.is_valid_filename_length(file_name):
468
- self.run.log_message(
469
- f'Skipping file with overly long name: {file_name[:50]}...', context=Context.WARNING.value
470
- )
471
- return None
472
-
473
- # Create metadata dictionary from remaining columns
474
- file_metadata: Dict[str, Any] = {}
475
- for i, value in enumerate(row[1:], start=1):
476
- if value is not None and i < len(headers):
477
- header_value = headers[i]
478
- column_name = str(header_value).strip() if header_value is not None else f'column_{i}'
479
-
480
- # Validate and truncate column name and value
481
- column_name = self.excel_utils.validate_and_truncate_string(
482
- column_name, self.excel_config.MAX_COLUMN_NAME_LENGTH
483
- )
484
- str_value = self.excel_utils.validate_and_truncate_string(
485
- str(value), self.excel_config.MAX_METADATA_VALUE_LENGTH
486
- )
487
- file_metadata[column_name] = str_value
488
-
489
- return {file_name: file_metadata} if file_metadata else None
490
-
491
- def _process_excel_worksheet(self, worksheet) -> Dict[str, Dict[str, Any]]:
492
- """Process Excel worksheet and extract metadata.
493
-
494
- Args:
495
- worksheet: openpyxl worksheet object
496
-
497
- Returns:
498
- Dict[str, Dict[str, Any]]: Extracted metadata dictionary
499
-
500
- Raises:
501
- ExcelParsingError: If worksheet processing fails
502
- """
503
- if worksheet is None:
504
- raise ExcelParsingError('Excel file has no active worksheet')
505
-
506
- metadata_dict: Dict[str, Dict[str, Any]] = {}
507
- headers: Optional[tuple] = None
508
- data_row_count = 0
509
- validation_interval = getattr(self.excel_config, 'VALIDATION_CHECK_INTERVAL', 1000)
510
-
511
- # Process rows one by one for memory efficiency
512
- for row_idx, row in enumerate(worksheet.iter_rows(values_only=True)):
513
- if not row or all(cell is None or str(cell).strip() == '' for cell in row):
514
- continue # Skip completely empty rows
515
-
516
- if row_idx == 0: # Header row
517
- headers = self._process_excel_headers(row)
518
- continue
519
-
520
- # Data rows
521
- if headers is None:
522
- raise ExcelParsingError('Excel file missing header row')
523
-
524
- data_row_count += 1
525
-
526
- # Validate row count periodically
527
- if data_row_count % validation_interval == 0:
528
- self._validate_excel_content(headers, data_row_count)
529
-
530
- # Process individual row
531
- row_result = self._process_excel_data_row(row, headers)
532
- if row_result:
533
- metadata_dict.update(row_result)
534
-
535
- # Final validation
536
- self._validate_excel_content(headers or (), data_row_count)
537
-
538
- return metadata_dict
539
-
540
- def _validate_excel_content(self, headers: tuple, row_count: int) -> None:
541
- """Validate Excel content constraints.
542
-
543
- Args:
544
- headers: Tuple of header values from the first row
545
- row_count: Total number of data rows processed
546
-
547
- Raises:
548
- ExcelParsingError: If content validation fails
549
- """
550
- # Limit number of columns to prevent memory exhaustion
551
- if len(headers) > self.excel_config.MAX_COLUMNS:
552
- raise ExcelParsingError(f'Too many columns: {len(headers)} (max: {self.excel_config.MAX_COLUMNS})')
553
-
554
- # Limit number of rows to prevent excessive processing
555
- if row_count > self.excel_config.MAX_ROWS:
556
- raise ExcelParsingError(f'Too many rows: {row_count} (max: {self.excel_config.MAX_ROWS})')
557
-
558
- def _find_excel_metadata_file(self, pathlib_cwd: Path) -> Optional[Path]:
559
- """Find Excel metadata file in the directory.
560
-
561
- Checks for meta.xlsx and meta.xls in the given directory.
562
-
563
- Args:
564
- pathlib_cwd (Path): The pathlib object representing the current working directory.
565
-
566
- Returns:
567
- Optional[Path]: Path to the Excel metadata file if found, None otherwise.
568
- """
569
- # Check for xlsx first, then xls
570
- for extension in ['.xlsx', '.xls']:
571
- excel_path = pathlib_cwd / f'meta{extension}'
572
- if excel_path.exists() and excel_path.is_file():
573
- return excel_path
574
- return None
575
-
576
    def _read_excel_metadata(self, pathlib_cwd: Path) -> Dict[str, Dict[str, Any]]:
        """Read metadata from excel file with comprehensive security validation.

        This method orchestrates the Excel metadata reading process by delegating
        to specialized methods for each step: locating the workbook, security
        validation (`_prepare_excel_file`), and worksheet parsing
        (`_process_excel_worksheet`). Low-level failures (openpyxl, OS, memory)
        are translated into the plugin's own exception types so callers only
        need to handle ExcelSecurityError / ExcelParsingError.

        Args:
            pathlib_cwd (Path): The pathlib object representing the current working directory.

        Returns:
            Dict[str, Dict[str, Any]]: Dictionary mapping file names to their metadata key-value pairs.
                Empty dict if no metadata is configured or if reading fails.

        Raises:
            ExcelSecurityError: If security validation fails
            ExcelParsingError: If Excel content is invalid or exceeds limits
        """
        excel_path = None

        # Check if user provided a specific excel_metadata_path
        if self.params.get('excel_metadata_path'):
            excel_path = pathlib_cwd / self.params['excel_metadata_path']
            # An explicitly configured path that is missing is logged but not
            # raised here; start() decides whether that aborts the upload.
            if not excel_path.exists():
                self.run.log_message(f'Excel metadata file not found: {excel_path}', context=Context.WARNING.value)
                return {}
        else:
            # Look for default meta.xlsx or meta.xls
            excel_path = self._find_excel_metadata_file(pathlib_cwd)
            if not excel_path:
                # No Excel metadata file found, return empty dict (not an error)
                return {}

        try:
            # Prepare Excel file with security validation
            excel_stream = self._prepare_excel_file(excel_path)

            # Load and process workbook.
            # read_only streams rows for memory efficiency; data_only returns
            # cached formula results instead of formula strings.
            workbook = load_workbook(excel_stream, read_only=True, data_only=True)
            try:
                return self._process_excel_worksheet(workbook.active)
            finally:
                # read_only workbooks hold the stream open until closed.
                workbook.close()

        # Order matters below: the two project exceptions are logged and
        # re-raised as-is, then progressively broader low-level errors are
        # wrapped. The final `Exception` catch is the boundary handler.
        except ExcelSecurityError as e:
            self.run.log_message(f'Excel security validation failed: {str(e)}', context=Context.ERROR.value)
            raise
        except ExcelParsingError as e:
            self.run.log_message(f'Excel parsing failed: {str(e)}', context=Context.ERROR.value)
            raise
        except InvalidFileException as e:
            # openpyxl rejected the file (wrong format / corrupt container).
            self.run.log_message(f'Invalid Excel file format: {str(e)}', context=Context.ERROR.value)
            raise ExcelParsingError(f'Invalid Excel file format: {str(e)}')
        except MemoryError:
            self.run.log_message('Excel file too large to process (memory limit exceeded)', context=Context.ERROR.value)
            raise ExcelSecurityError('Excel file exceeds memory limits')
        except (OSError, IOError) as e:
            self.run.log_message(f'File access error reading excel metadata: {str(e)}', context=Context.ERROR.value)
            raise ExcelParsingError(f'File access error: {str(e)}')
        except Exception as e:
            self.run.log_message(f'Unexpected error reading excel metadata: {str(e)}', context=Context.ERROR.value)
            raise ExcelParsingError(f'Unexpected error: {str(e)}')
637
-
638
    def start(self) -> Dict[str, Any]:
        """Start upload process.

        Pipeline: resolve storage path -> read optional Excel metadata ->
        analyze collection file specifications -> organize files -> validate ->
        upload files -> generate data units.

        Returns:
            Dict: The result of the upload process. Contains
                'uploaded_files_count' and 'generated_data_units_count' when the
                corresponding stages run; empty when aborted on Excel errors.
        """
        # Setup result dict early for error handling
        result: Dict[str, Any] = {}

        # Setup path object with path and storage.
        storage = self.client.get_storage(self.params['storage'])
        pathlib_cwd = get_pathlib(storage, self.params['path'])

        # Read excel metadata if configured or default file exists
        excel_metadata: Dict[str, Dict[str, Any]] = {}
        try:
            excel_metadata = self._read_excel_metadata(pathlib_cwd)
            if excel_metadata:
                self.run.log_message(f'Excel metadata loaded for {len(excel_metadata)} files')
        except ExcelSecurityError as e:
            # Security violations should stop the process entirely.
            # NOTE(review): this aborts by returning the empty result rather
            # than raising ActionError like the later stages do — confirm
            # callers treat an empty dict as failure.
            self.run.log_message(f'Excel security validation failed: {str(e)}', context=Context.ERROR.value)
            self.run.log_message('Upload aborted due to Excel security concerns.', context=Context.ERROR.value)
            return result
        except ExcelParsingError as e:
            # Parsing errors can be non-critical if user didn't explicitly provide Excel file
            if self.params.get('excel_metadata_path'):
                # User explicitly provided Excel file, treat as error
                self.run.log_message(f'Excel parsing failed: {str(e)}', context=Context.ERROR.value)
                self.run.log_message('Upload aborted due to Excel parsing failure.', context=Context.ERROR.value)
                return result
            else:
                # Default Excel file found but failed to parse, treat as warning and continue
                self.run.log_message(f'Excel parsing failed (continuing): {str(e)}', context=Context.WARNING.value)
                excel_metadata = {}

        # Analyze Collection file specifications to determine the data structure for upload.
        file_specification_template = self._analyze_collection()
        organized_files = self._organize_files(pathlib_cwd, file_specification_template, excel_metadata)

        # Initialize uploader.
        uploader = self.get_uploader(pathlib_cwd, file_specification_template, organized_files)

        # Get organized files from the uploader (plugin developer's custom implementation)
        # or use the default organization method if uploader doesn't provide valid files.
        # Intentionally rebinds organized_files: the uploader has the final say.
        organized_files = uploader.handle_upload_files()

        # Validate the organized files
        if not self._validate_organized_files(organized_files, file_specification_template):
            self.run.log_message('Validation failed.', context=Context.ERROR.value)
            raise ActionError('Upload is aborted due to validation errors.')

        # Upload files to synapse-backend.
        if not organized_files:
            self.run.log_message('Files not found on the path.', context=Context.WARNING.value)
            raise ActionError('Upload is aborted due to missing files.')
        uploaded_files = self._upload_files(organized_files)
        result['uploaded_files_count'] = len(uploaded_files)

        # Generate data units for the uploaded data.
        if not uploaded_files:
            self.run.log_message('No files were uploaded.', context=Context.WARNING.value)
            raise ActionError('Upload is aborted due to no uploaded files.')
        generated_data_units = self._generate_data_units(uploaded_files)
        result['generated_data_units_count'] = len(generated_data_units)

        # Setup task with uploaded synapse-backend data units.
        if not generated_data_units:
            self.run.log_message('No data units were generated.', context=Context.WARNING.value)
            raise ActionError('Upload is aborted due to no generated data units.')

        self.run.log_message('Import completed.')
        return result
711
-
712
- def _analyze_collection(self) -> Dict[str, Any]:
713
- """Analyze Synapse Collection Specifications.
714
-
715
- Returns:
716
- Dict: The file specifications of the collection.
717
- """
718
-
719
- # Initialize progress
720
- self.run.set_progress(0, 2, category='analyze_collection')
721
-
722
- collection_id = self.params['data_collection']
723
- self.run.set_progress(1, 2, category='analyze_collection')
724
-
725
- collection = self.run.client.get_data_collection(collection_id)
726
- self.run.set_progress(2, 2, category='analyze_collection')
727
-
728
- return collection['file_specifications']
729
-
730
- def _upload_files(self, organized_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
731
- """Upload files to synapse-backend.
732
-
733
- Returns:
734
- Dict: The result of the upload.
735
- """
736
- # Initialize progress
737
- organized_files_count = len(organized_files)
738
- self.run.set_progress(0, organized_files_count, category='upload_data_files')
739
- self.run.log_message('Uploading data files...')
740
-
741
- client = self.run.client
742
- collection_id = self.params['data_collection']
743
- upload_result = []
744
- current_progress = 0
745
- success_count = 0
746
- failed_count = 0
747
-
748
- # Initialize metrics
749
- self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
750
-
751
- for organized_file in organized_files:
752
- try:
753
- uploaded_data_file = client.upload_data_file(organized_file, collection_id)
754
- self.run.log_data_file(organized_file, UploadStatus.SUCCESS)
755
- success_count += 1
756
- upload_result.append(uploaded_data_file)
757
- except Exception as e:
758
- self.run.log_data_file(organized_file, UploadStatus.FAILED)
759
- self.run.log_message(f'Failed to upload file: {str(e)}')
760
- failed_count += 1
761
-
762
- current_progress += 1
763
- self._update_metrics(organized_files_count, success_count, failed_count, 'data_files')
764
- self.run.set_progress(current_progress, organized_files_count, category='upload_data_files')
765
-
766
- # Finish progress
767
- self.run.set_progress(organized_files_count, organized_files_count, category='upload_data_files')
768
-
769
- return upload_result
770
-
771
- def _generate_data_units(self, uploaded_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
772
- """Generate data units for the uploaded data.
773
-
774
- TODO: make batch size configurable.
775
-
776
- Returns:
777
- Dict: The result of the generate data units process.
778
- """
779
- # Initialize progress
780
- upload_result_count = len(uploaded_files)
781
- self.run.set_progress(0, upload_result_count, category='generate_data_units')
782
- self.run.log_message('Generating data units...')
783
-
784
- client = self.run.client
785
- generated_data_units = []
786
- current_progress = 0
787
- success_count = 0
788
- failed_count = 0
789
-
790
- batches = get_batched_list(uploaded_files, 100)
791
- batches_count = len(batches)
792
-
793
- # Initialize metrics
794
- self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')
795
-
796
- for batch in batches:
797
- try:
798
- created_data_units = client.create_data_units(batch)
799
- success_count += len(created_data_units)
800
- generated_data_units.append(created_data_units)
801
- for created_data_unit in created_data_units:
802
- self.run.log_data_unit(
803
- created_data_unit['id'], UploadStatus.SUCCESS, data_unit_meta=created_data_unit.get('meta')
804
- )
805
- except Exception as e:
806
- failed_count += len(batch)
807
- self.run.log_message(f'Failed to create data units batch: {str(e)}')
808
- for _ in batch:
809
- self.run.log_data_unit(None, UploadStatus.FAILED, data_unit_meta=None)
810
-
811
- current_progress += 1
812
- self._update_metrics(upload_result_count, success_count, failed_count, 'data_units')
813
- self.run.set_progress(current_progress, batches_count, category='generate_data_units')
814
-
815
- # Finish progress
816
- self.run.set_progress(upload_result_count, upload_result_count, category='generate_data_units')
817
-
818
- return sum(generated_data_units, [])
819
-
820
- def _validate_organized_files(
821
- self, organized_files: List[Dict[str, Any]], file_specification_template: Dict[str, Any]
822
- ) -> bool:
823
- """Validate organized files from Uploader."""
824
- validator = FileSpecificationValidator(file_specification_template, organized_files)
825
- return validator.validate()
826
-
827
- def _organize_files(
828
- self,
829
- directory: Path,
830
- file_specification: List[Dict[str, Any]],
831
- excel_metadata: Optional[Dict[str, Dict[str, Any]]] = None,
832
- ) -> List[Dict[str, Any]]:
833
- """Organize files according to the file specification.
834
-
835
- This method handles type-based directory structure where files are organized in
836
- directories named after file types (e.g., 'image_1/' directory contains image files
837
- like '1.jpg', '2.jpg'). For each dataset ID found in the primary directory, it attempts
838
- to find corresponding files in all type directories.
839
-
840
- TODO: Add Logic to handle file specific name patterns and extensions.
841
- (e.g. pcd:S_DCH_230725_0156_LR_037.pcd, image_1:S_DCH_230725_0156_FC_037,
842
- image_2:S_DCH_230725_0156_LF_037.jpg)
843
-
844
- Args:
845
- directory (Path): Root directory containing files to organize.
846
- file_specification (List[Dict[str, Any]]): File specification list with metadata.
847
- excel_metadata (Optional[Dict[str, Dict[str, Any]]]): Dictionary mapping file names
848
- to their metadata key-value pairs from excel file.
849
-
850
- Returns:
851
- List[Dict[str, Any]]: List of dictionaries containing organized files with metadata.
852
- """
853
- organized_files: List[Dict[str, Any]] = []
854
-
855
- # Check for type-based directory structure (e.g., image_1/, pcd_1/)
856
- type_dirs: Dict[str, Path] = {}
857
-
858
- for spec in file_specification:
859
- spec_name = spec['name']
860
- spec_dir = directory / spec_name
861
- if spec_dir.exists() and spec_dir.is_dir():
862
- type_dirs[spec_name] = spec_dir
863
-
864
- if type_dirs:
865
- self.run.log_message(f'Found type directories: {list(type_dirs.keys())}')
866
-
867
- # If type-based directories don't exist, exit early
868
- if not type_dirs:
869
- self.run.log_message('No type-based directory structure found.', context=Context.INFO.value)
870
- return organized_files
871
-
872
- self.run.log_message('Detected type-based directory structure')
873
-
874
- # Collect and process files in a single pass
875
- dataset_files = {}
876
- required_specs = [spec['name'] for spec in file_specification if spec.get('is_required', False)]
877
-
878
- # Process all files from all type directories
879
- for spec_name, dir_path in type_dirs.items():
880
- for file_path in dir_path.glob('*'):
881
- if file_path.is_file():
882
- file_name = file_path.stem
883
-
884
- # Initialize dataset entry if it doesn't exist
885
- if file_name not in dataset_files:
886
- dataset_files[file_name] = {}
887
-
888
- # Map this file to its specification (handle duplicates)
889
- if spec_name not in dataset_files[file_name]:
890
- dataset_files[file_name][spec_name] = file_path
891
- else:
892
- existing_file = dataset_files[file_name][spec_name]
893
- if file_path.stat().st_mtime > existing_file.stat().st_mtime:
894
- dataset_files[file_name][spec_name] = file_path
895
-
896
- if not dataset_files:
897
- self.run.log_message('No files found.', context=Context.WARNING.value)
898
- return organized_files
899
-
900
- self.run.log_message(f'Discovered {len(dataset_files)} files')
901
-
902
- # Organize datasets - check requirements and create metadata
903
- for file_name, files_dict in sorted(dataset_files.items()):
904
- if all(req in files_dict for req in required_specs):
905
- # Get most common file extension
906
- file_extensions = {}
907
- for file_path in files_dict.values():
908
- ext = file_path.suffix.lower()
909
- if ext:
910
- file_extensions[ext] = file_extensions.get(ext, 0) + 1
911
-
912
- origin_file_extension = max(file_extensions.items(), key=lambda x: x[1])[0] if file_extensions else ''
913
-
914
- # Create metadata for this dataset
915
- meta_data: Dict[str, Any] = {
916
- 'origin_file_stem': file_name,
917
- 'origin_file_extension': origin_file_extension,
918
- 'created_at': datetime.now().isoformat(),
919
- }
920
-
921
- # Add excel metadata if available
922
- if excel_metadata and file_name in excel_metadata:
923
- meta_data.update(excel_metadata[file_name])
924
-
925
- # Add the organized dataset
926
- organized_files.append({'files': files_dict, 'meta': meta_data})
927
- else:
928
- missing = [req for req in required_specs if req not in files_dict]
929
- self.run.log_message(
930
- f'{file_name} missing required files: {", ".join(missing)}',
931
- context=Context.WARNING.value,
932
- )
933
-
934
- return organized_files
935
-
936
- def _update_metrics(self, total_count: int, success_count: int, failed_count: int, category: str):
937
- """Update metrics for upload progress.
938
-
939
- Args:
940
- total_count (int): Total number of items to process.
941
- success_count (int): Number of successfully processed items.
942
- failed_count (int): Number of failed items.
943
- category (str): The category of the metrics.
944
- """
945
- if not self.run:
946
- raise ValueError('Run instance not properly initialized')
947
-
948
- # Type assertion to help the linter
949
- assert isinstance(self.run, UploadRun)
950
-
951
- metrics = self.run.MetricsRecord(
952
- stand_by=total_count - success_count - failed_count, failed=failed_count, success=success_count
953
- )
954
- self.run.log_metrics(metrics, category)