deriva-ml 1.17.9__py3-none-any.whl → 1.17.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +43 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +21 -0
- deriva_ml/catalog/clone.py +1199 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +817 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +186 -105
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +545 -244
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +224 -35
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -5
- deriva_ml-1.17.11.dist-info/RECORD +77 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +2 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.9.dist-info/RECORD +0 -45
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
deriva_ml/core/exceptions.py
CHANGED
|
@@ -1,28 +1,311 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
1
|
+
"""Custom exceptions for the DerivaML package.
|
|
2
|
+
|
|
3
|
+
This module defines the exception hierarchy for DerivaML. All DerivaML-specific
|
|
4
|
+
exceptions inherit from DerivaMLException, making it easy to catch all library
|
|
5
|
+
errors with a single except clause.
|
|
6
|
+
|
|
7
|
+
Exception Hierarchy:
|
|
8
|
+
DerivaMLException (base class for all DerivaML errors)
|
|
9
|
+
│
|
|
10
|
+
├── DerivaMLConfigurationError (configuration and initialization)
|
|
11
|
+
│ ├── DerivaMLSchemaError (schema/catalog structure issues)
|
|
12
|
+
│ └── DerivaMLAuthenticationError (authentication failures)
|
|
13
|
+
│
|
|
14
|
+
├── DerivaMLDataError (data access and validation)
|
|
15
|
+
│ ├── DerivaMLNotFoundError (entity not found)
|
|
16
|
+
│ │ ├── DerivaMLDatasetNotFound (dataset lookup failures)
|
|
17
|
+
│ │ ├── DerivaMLTableNotFound (table lookup failures)
|
|
18
|
+
│ │ └── DerivaMLInvalidTerm (vocabulary term not found)
|
|
19
|
+
│ ├── DerivaMLTableTypeError (wrong table type)
|
|
20
|
+
│ ├── DerivaMLValidationError (data validation failures)
|
|
21
|
+
│ └── DerivaMLCycleError (cycle detected in relationships)
|
|
22
|
+
│
|
|
23
|
+
├── DerivaMLExecutionError (execution lifecycle)
|
|
24
|
+
│ ├── DerivaMLWorkflowError (workflow issues)
|
|
25
|
+
│ └── DerivaMLUploadError (asset upload failures)
|
|
26
|
+
│
|
|
27
|
+
└── DerivaMLReadOnlyError (write operation on read-only resource)
|
|
28
|
+
|
|
29
|
+
Example:
|
|
30
|
+
>>> from deriva_ml.core.exceptions import DerivaMLDatasetNotFound, DerivaMLException, DerivaMLNotFoundError
|
|
31
|
+
>>> try:
|
|
32
|
+
... dataset = ml.lookup_dataset("invalid_rid")
|
|
33
|
+
... except DerivaMLDatasetNotFound as e:
|
|
34
|
+
... print(f"Dataset not found: {e}")
|
|
35
|
+
... except DerivaMLNotFoundError as e:
|
|
36
|
+
... print(f"Entity not found: {e}")
|
|
37
|
+
... except DerivaMLException as e:
|
|
38
|
+
... print(f"DerivaML error: {e}")
|
|
3
39
|
"""
|
|
4
40
|
|
|
5
41
|
|
|
6
42
|
class DerivaMLException(Exception):
|
|
7
|
-
"""
|
|
43
|
+
"""Base exception class for all DerivaML errors.
|
|
44
|
+
|
|
45
|
+
This is the root exception for all DerivaML-specific errors. Catching this
|
|
46
|
+
exception will catch any error raised by the DerivaML library.
|
|
47
|
+
|
|
48
|
+
Attributes:
|
|
49
|
+
_msg: The error message stored for later access.
|
|
8
50
|
|
|
9
51
|
Args:
|
|
10
|
-
msg
|
|
52
|
+
msg: Descriptive error message. Defaults to empty string.
|
|
53
|
+
|
|
54
|
+
Example:
|
|
55
|
+
>>> raise DerivaMLException("Failed to connect to catalog")
|
|
56
|
+
DerivaMLException: Failed to connect to catalog
|
|
11
57
|
"""
|
|
12
58
|
|
|
13
|
-
def __init__(self, msg=""):
|
|
59
|
+
def __init__(self, msg: str = "") -> None:
|
|
14
60
|
super().__init__(msg)
|
|
15
61
|
self._msg = msg
|
|
16
62
|
|
|
17
63
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
64
|
+
# =============================================================================
|
|
65
|
+
# Configuration and Initialization Errors
|
|
66
|
+
# =============================================================================
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class DerivaMLConfigurationError(DerivaMLException):
|
|
70
|
+
"""Exception raised for configuration and initialization errors.
|
|
71
|
+
|
|
72
|
+
Raised when there are issues with DerivaML configuration, catalog
|
|
73
|
+
initialization, or schema setup.
|
|
74
|
+
|
|
75
|
+
Example:
|
|
76
|
+
>>> raise DerivaMLConfigurationError("Invalid catalog configuration")
|
|
77
|
+
"""
|
|
78
|
+
|
|
79
|
+
pass
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class DerivaMLSchemaError(DerivaMLConfigurationError):
|
|
83
|
+
"""Exception raised for schema or catalog structure issues.
|
|
84
|
+
|
|
85
|
+
Raised when the catalog schema is invalid, missing required tables,
|
|
86
|
+
or has structural problems that prevent normal operation.
|
|
87
|
+
|
|
88
|
+
Example:
|
|
89
|
+
>>> raise DerivaMLSchemaError("Ambiguous domain schema: ['Schema1', 'Schema2']")
|
|
90
|
+
"""
|
|
91
|
+
|
|
92
|
+
pass
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class DerivaMLAuthenticationError(DerivaMLConfigurationError):
|
|
96
|
+
"""Exception raised for authentication failures.
|
|
97
|
+
|
|
98
|
+
Raised when authentication with the catalog fails or credentials are invalid.
|
|
99
|
+
|
|
100
|
+
Example:
|
|
101
|
+
>>> raise DerivaMLAuthenticationError("Failed to authenticate with catalog")
|
|
102
|
+
"""
|
|
103
|
+
|
|
104
|
+
pass
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# =============================================================================
|
|
108
|
+
# Data Access and Validation Errors
|
|
109
|
+
# =============================================================================
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class DerivaMLDataError(DerivaMLException):
|
|
113
|
+
"""Exception raised for data access and validation issues.
|
|
114
|
+
|
|
115
|
+
Base class for errors related to data lookup, validation, and integrity.
|
|
116
|
+
|
|
117
|
+
Example:
|
|
118
|
+
>>> raise DerivaMLDataError("Invalid data format")
|
|
119
|
+
"""
|
|
120
|
+
|
|
121
|
+
pass
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class DerivaMLNotFoundError(DerivaMLDataError):
|
|
125
|
+
"""Exception raised when an entity cannot be found.
|
|
126
|
+
|
|
127
|
+
Raised when a lookup operation fails to find the requested entity
|
|
128
|
+
(dataset, table, term, etc.) in the catalog or bag.
|
|
129
|
+
|
|
130
|
+
Example:
|
|
131
|
+
>>> raise DerivaMLNotFoundError("Entity '1-ABC' not found in catalog")
|
|
132
|
+
"""
|
|
133
|
+
|
|
134
|
+
pass
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class DerivaMLDatasetNotFound(DerivaMLNotFoundError):
|
|
138
|
+
"""Exception raised when a dataset cannot be found.
|
|
139
|
+
|
|
140
|
+
Raised when attempting to look up a dataset that doesn't exist in the
|
|
141
|
+
catalog or downloaded bag.
|
|
142
|
+
|
|
143
|
+
Args:
|
|
144
|
+
dataset_rid: The RID of the dataset that was not found.
|
|
145
|
+
msg: Additional context. Defaults to "Dataset not found".
|
|
146
|
+
|
|
147
|
+
Example:
|
|
148
|
+
>>> raise DerivaMLDatasetNotFound("1-ABC")
|
|
149
|
+
DerivaMLDatasetNotFound: Dataset not found: 1-ABC
|
|
150
|
+
"""
|
|
151
|
+
|
|
152
|
+
def __init__(self, dataset_rid: str, msg: str = "Dataset not found") -> None:
|
|
153
|
+
super().__init__(f"{msg}: {dataset_rid}")
|
|
154
|
+
self.dataset_rid = dataset_rid
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class DerivaMLTableNotFound(DerivaMLNotFoundError):
|
|
158
|
+
"""Exception raised when a table cannot be found.
|
|
159
|
+
|
|
160
|
+
Raised when attempting to access a table that doesn't exist in the
|
|
161
|
+
catalog schema or downloaded bag.
|
|
162
|
+
|
|
163
|
+
Args:
|
|
164
|
+
table_name: The name of the table that was not found.
|
|
165
|
+
msg: Additional context. Defaults to "Table not found".
|
|
166
|
+
|
|
167
|
+
Example:
|
|
168
|
+
>>> raise DerivaMLTableNotFound("MyTable")
|
|
169
|
+
DerivaMLTableNotFound: Table not found: MyTable
|
|
170
|
+
"""
|
|
171
|
+
|
|
172
|
+
def __init__(self, table_name: str, msg: str = "Table not found") -> None:
|
|
173
|
+
super().__init__(f"{msg}: {table_name}")
|
|
174
|
+
self.table_name = table_name
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class DerivaMLInvalidTerm(DerivaMLNotFoundError):
|
|
178
|
+
"""Exception raised when a vocabulary term is not found or invalid.
|
|
179
|
+
|
|
180
|
+
Raised when attempting to look up or use a term that doesn't exist in
|
|
181
|
+
a controlled vocabulary table, or when a term name/synonym cannot be resolved.
|
|
182
|
+
|
|
183
|
+
Args:
|
|
184
|
+
vocabulary: Name of the vocabulary table being searched.
|
|
185
|
+
term: The term name that was not found.
|
|
186
|
+
msg: Additional context about the error. Defaults to "Term doesn't exist".
|
|
187
|
+
|
|
188
|
+
Example:
|
|
189
|
+
>>> raise DerivaMLInvalidTerm("Diagnosis", "unknown_condition")
|
|
190
|
+
DerivaMLInvalidTerm: Invalid term unknown_condition in vocabulary Diagnosis: Term doesn't exist.
|
|
191
|
+
"""
|
|
192
|
+
|
|
193
|
+
def __init__(self, vocabulary: str, term: str, msg: str = "Term doesn't exist") -> None:
|
|
22
194
|
super().__init__(f"Invalid term {term} in vocabulary {vocabulary}: {msg}.")
|
|
195
|
+
self.vocabulary = vocabulary
|
|
196
|
+
self.term = term
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class DerivaMLTableTypeError(DerivaMLDataError):
|
|
200
|
+
"""Exception raised when a RID or table is not of the expected type.
|
|
201
|
+
|
|
202
|
+
Raised when an operation requires a specific table type (e.g., Dataset,
|
|
203
|
+
Execution) but receives a RID or table reference of a different type.
|
|
204
|
+
|
|
205
|
+
Args:
|
|
206
|
+
table_type: The expected table type (e.g., "Dataset", "Execution").
|
|
207
|
+
table: The actual table name or RID that was provided.
|
|
208
|
+
|
|
209
|
+
Example:
|
|
210
|
+
>>> raise DerivaMLTableTypeError("Dataset", "1-ABC123")
|
|
211
|
+
DerivaMLTableTypeError: Table 1-ABC123 is not of type Dataset.
|
|
212
|
+
"""
|
|
213
|
+
|
|
214
|
+
def __init__(self, table_type: str, table: str) -> None:
|
|
215
|
+
super().__init__(f"Table {table} is not of type {table_type}.")
|
|
216
|
+
self.table_type = table_type
|
|
217
|
+
self.table = table
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
class DerivaMLValidationError(DerivaMLDataError):
|
|
221
|
+
"""Exception raised when data validation fails.
|
|
222
|
+
|
|
223
|
+
Raised when input data fails validation, such as invalid RID format,
|
|
224
|
+
mismatched metadata, or constraint violations.
|
|
225
|
+
|
|
226
|
+
Example:
|
|
227
|
+
>>> raise DerivaMLValidationError("Invalid RID format: ABC")
|
|
228
|
+
"""
|
|
229
|
+
|
|
230
|
+
pass
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class DerivaMLCycleError(DerivaMLDataError):
|
|
234
|
+
"""Exception raised when a cycle is detected in relationships.
|
|
235
|
+
|
|
236
|
+
Raised when creating dataset hierarchies or other relationships that
|
|
237
|
+
would result in a circular dependency.
|
|
238
|
+
|
|
239
|
+
Args:
|
|
240
|
+
cycle_nodes: List of nodes involved in the cycle.
|
|
241
|
+
msg: Additional context. Defaults to "Cycle detected".
|
|
242
|
+
|
|
243
|
+
Example:
|
|
244
|
+
>>> raise DerivaMLCycleError(["Dataset1", "Dataset2", "Dataset1"])
|
|
245
|
+
"""
|
|
246
|
+
|
|
247
|
+
def __init__(self, cycle_nodes: list[str], msg: str = "Cycle detected") -> None:
|
|
248
|
+
super().__init__(f"{msg}: {cycle_nodes}")
|
|
249
|
+
self.cycle_nodes = cycle_nodes
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
# =============================================================================
|
|
253
|
+
# Execution Lifecycle Errors
|
|
254
|
+
# =============================================================================
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
class DerivaMLExecutionError(DerivaMLException):
|
|
258
|
+
"""Exception raised for execution lifecycle issues.
|
|
259
|
+
|
|
260
|
+
Base class for errors related to workflow execution, asset management,
|
|
261
|
+
and provenance tracking.
|
|
262
|
+
|
|
263
|
+
Example:
|
|
264
|
+
>>> raise DerivaMLExecutionError("Execution failed to initialize")
|
|
265
|
+
"""
|
|
266
|
+
|
|
267
|
+
pass
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
class DerivaMLWorkflowError(DerivaMLExecutionError):
|
|
271
|
+
"""Exception raised for workflow-related issues.
|
|
272
|
+
|
|
273
|
+
Raised when there are problems with workflow lookup, creation, or
|
|
274
|
+
Git integration for workflow tracking.
|
|
275
|
+
|
|
276
|
+
Example:
|
|
277
|
+
>>> raise DerivaMLWorkflowError("Not executing in a Git repository")
|
|
278
|
+
"""
|
|
279
|
+
|
|
280
|
+
pass
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
class DerivaMLUploadError(DerivaMLExecutionError):
|
|
284
|
+
"""Exception raised for asset upload failures.
|
|
285
|
+
|
|
286
|
+
Raised when uploading assets to the catalog fails, including file
|
|
287
|
+
uploads, metadata insertion, and provenance recording.
|
|
288
|
+
|
|
289
|
+
Example:
|
|
290
|
+
>>> raise DerivaMLUploadError("Failed to upload execution assets")
|
|
291
|
+
"""
|
|
292
|
+
|
|
293
|
+
pass
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
# =============================================================================
|
|
297
|
+
# Read-Only Resource Errors
|
|
298
|
+
# =============================================================================
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
class DerivaMLReadOnlyError(DerivaMLException):
|
|
302
|
+
"""Exception raised when attempting write operations on read-only resources.
|
|
303
|
+
|
|
304
|
+
Raised when attempting to modify data in a downloaded bag or other
|
|
305
|
+
read-only context where write operations are not supported.
|
|
306
|
+
|
|
307
|
+
Example:
|
|
308
|
+
>>> raise DerivaMLReadOnlyError("Cannot create datasets in a downloaded bag")
|
|
309
|
+
"""
|
|
23
310
|
|
|
24
|
-
|
|
25
|
-
"""RID for table is not of correct type."""
|
|
26
|
-
def __init__(self, table_type, table: str):
|
|
27
|
-
"""Exception indicating undefined term type"""
|
|
28
|
-
super().__init__(f"Table {table} is not of type {table_type}.")
|
|
311
|
+
pass
|
deriva_ml/core/filespec.py
CHANGED
|
@@ -1,5 +1,26 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
1
|
+
"""File specification utilities for DerivaML.
|
|
2
|
+
|
|
3
|
+
This module provides the FileSpec class for creating and managing file metadata
|
|
4
|
+
in the Deriva catalog. FileSpec objects represent files with their checksums,
|
|
5
|
+
sizes, and type classifications, ready for insertion into the File table.
|
|
6
|
+
|
|
7
|
+
Key Features:
|
|
8
|
+
- Automatic MD5 checksum computation
|
|
9
|
+
- URL normalization (local paths converted to tag URIs)
|
|
10
|
+
- Support for file type classification
|
|
11
|
+
- Batch processing of directories
|
|
12
|
+
- JSONL serialization/deserialization
|
|
13
|
+
|
|
14
|
+
Example:
|
|
15
|
+
Create FileSpec from a local file:
|
|
16
|
+
>>> specs = list(FileSpec.create_filespecs(
|
|
17
|
+
... path="/data/images/sample.png",
|
|
18
|
+
... description="Sample image",
|
|
19
|
+
... file_types=["Image", "PNG"]
|
|
20
|
+
... ))
|
|
21
|
+
|
|
22
|
+
Read FileSpecs from a JSONL file:
|
|
23
|
+
>>> specs = list(FileSpec.read_filespec("files.jsonl"))
|
|
3
24
|
"""
|
|
4
25
|
|
|
5
26
|
from __future__ import annotations
|
|
@@ -12,25 +33,44 @@ from typing import Callable, Generator
|
|
|
12
33
|
from urllib.parse import urlparse
|
|
13
34
|
|
|
14
35
|
import deriva.core.utils.hash_utils as hash_utils
|
|
15
|
-
from pydantic import BaseModel, Field,
|
|
36
|
+
from pydantic import BaseModel, Field, field_validator, validate_call
|
|
16
37
|
|
|
17
38
|
|
|
18
39
|
class FileSpec(BaseModel):
|
|
19
|
-
"""
|
|
40
|
+
"""Specification for a file to be added to the Deriva catalog.
|
|
41
|
+
|
|
42
|
+
Represents file metadata required for creating entries in the File table.
|
|
43
|
+
Handles URL normalization, ensuring local file paths are converted to
|
|
44
|
+
tag URIs that uniquely identify the file's origin.
|
|
20
45
|
|
|
21
46
|
Attributes:
|
|
22
|
-
url:
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
file_types:
|
|
47
|
+
url: File location as URL or local path. Local paths are converted to tag URIs.
|
|
48
|
+
md5: MD5 checksum for integrity verification.
|
|
49
|
+
length: File size in bytes.
|
|
50
|
+
description: Optional description of the file's contents or purpose.
|
|
51
|
+
file_types: List of file type classifications from the Asset_Type vocabulary.
|
|
52
|
+
|
|
53
|
+
Note:
|
|
54
|
+
The 'File' type is automatically added to file_types if not present when
|
|
55
|
+
using create_filespecs().
|
|
56
|
+
|
|
57
|
+
Example:
|
|
58
|
+
>>> spec = FileSpec(
|
|
59
|
+
... url="/data/results.csv",
|
|
60
|
+
... md5="d41d8cd98f00b204e9800998ecf8427e",
|
|
61
|
+
... length=1024,
|
|
62
|
+
... description="Analysis results",
|
|
63
|
+
... file_types=["CSV", "Data"]
|
|
64
|
+
... )
|
|
27
65
|
"""
|
|
28
66
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
67
|
+
model_config = {"populate_by_name": True}
|
|
68
|
+
|
|
69
|
+
url: str = Field(alias="URL")
|
|
70
|
+
md5: str = Field(alias="MD5")
|
|
71
|
+
length: int = Field(alias="Length")
|
|
72
|
+
description: str | None = Field(default="", alias="Description")
|
|
73
|
+
file_types: list[str] | None = Field(default_factory=list)
|
|
34
74
|
|
|
35
75
|
@field_validator("url")
|
|
36
76
|
@classmethod
|
|
@@ -61,22 +101,39 @@ class FileSpec(BaseModel):
|
|
|
61
101
|
def create_filespecs(
|
|
62
102
|
cls, path: Path | str, description: str, file_types: list[str] | Callable[[Path], list[str]] | None = None
|
|
63
103
|
) -> Generator[FileSpec, None, None]:
|
|
64
|
-
"""
|
|
104
|
+
"""Generate FileSpec objects for a file or directory.
|
|
105
|
+
|
|
106
|
+
Creates FileSpec objects with computed MD5 checksums for each file found.
|
|
107
|
+
For directories, recursively processes all files. The 'File' type is
|
|
108
|
+
automatically prepended to file_types if not already present.
|
|
65
109
|
|
|
66
110
|
Args:
|
|
67
|
-
path: Path to
|
|
68
|
-
description:
|
|
69
|
-
file_types:
|
|
111
|
+
path: Path to a file or directory. If directory, all files are processed recursively.
|
|
112
|
+
description: Description to apply to all generated FileSpecs.
|
|
113
|
+
file_types: Either a static list of file types, or a callable that takes a Path
|
|
114
|
+
and returns a list of types for that specific file. Allows dynamic type
|
|
115
|
+
assignment based on file extension, content, etc.
|
|
70
116
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
117
|
+
Yields:
|
|
118
|
+
FileSpec: A specification for each file with computed checksums and metadata.
|
|
119
|
+
|
|
120
|
+
Example:
|
|
121
|
+
Static file types:
|
|
122
|
+
>>> specs = FileSpec.create_filespecs("/data/images", "Images", ["Image"])
|
|
74
123
|
|
|
124
|
+
Dynamic file types based on extension:
|
|
125
|
+
>>> def get_types(path):
|
|
126
|
+
... ext = path.suffix.lower()
|
|
127
|
+
... return {".png": ["PNG", "Image"], ".jpg": ["JPEG", "Image"]}.get(ext, [])
|
|
128
|
+
>>> specs = FileSpec.create_filespecs("/data", "Mixed files", get_types)
|
|
129
|
+
"""
|
|
75
130
|
path = Path(path)
|
|
76
131
|
file_types = file_types or []
|
|
132
|
+
# Convert static list to callable for uniform handling
|
|
77
133
|
file_types_fn = file_types if callable(file_types) else lambda _x: file_types
|
|
78
134
|
|
|
79
135
|
def create_spec(file_path: Path) -> FileSpec:
|
|
136
|
+
"""Create a FileSpec for a single file with computed hashes."""
|
|
80
137
|
hashes = hash_utils.compute_file_hashes(file_path, hashes=frozenset(["md5", "sha256"]))
|
|
81
138
|
md5 = hashes["md5"][0]
|
|
82
139
|
type_list = file_types_fn(file_path)
|
|
@@ -85,21 +142,31 @@ class FileSpec(BaseModel):
|
|
|
85
142
|
md5=md5,
|
|
86
143
|
description=description,
|
|
87
144
|
url=file_path.as_posix(),
|
|
145
|
+
# Ensure 'File' type is always included
|
|
88
146
|
file_types=type_list if "File" in type_list else ["File"] + type_list,
|
|
89
147
|
)
|
|
90
148
|
|
|
149
|
+
# Handle both single files and directories (recursive)
|
|
91
150
|
files = [path] if path.is_file() else [f for f in Path(path).rglob("*") if f.is_file()]
|
|
92
151
|
return (create_spec(file) for file in files)
|
|
93
152
|
|
|
94
153
|
@staticmethod
|
|
95
154
|
def read_filespec(path: Path | str) -> Generator[FileSpec, None, None]:
|
|
96
|
-
"""
|
|
155
|
+
"""Read FileSpec objects from a JSON Lines file.
|
|
156
|
+
|
|
157
|
+
Parses a JSONL file where each line is a JSON object representing a FileSpec.
|
|
158
|
+
Empty lines are skipped. This is useful for batch processing pre-computed
|
|
159
|
+
file specifications.
|
|
97
160
|
|
|
98
161
|
Args:
|
|
99
|
-
|
|
162
|
+
path: Path to the .jsonl file containing FileSpec data.
|
|
100
163
|
|
|
101
164
|
Yields:
|
|
102
|
-
|
|
165
|
+
FileSpec: Parsed FileSpec object for each valid line.
|
|
166
|
+
|
|
167
|
+
Example:
|
|
168
|
+
>>> for spec in FileSpec.read_filespec("files.jsonl"):
|
|
169
|
+
... print(f"{spec.url}: {spec.md5}")
|
|
103
170
|
"""
|
|
104
171
|
path = Path(path)
|
|
105
172
|
with path.open("r", encoding="utf-8") as f:
|
|
@@ -110,7 +177,11 @@ class FileSpec(BaseModel):
|
|
|
110
177
|
yield FileSpec(**json.loads(line))
|
|
111
178
|
|
|
112
179
|
|
|
113
|
-
#
|
|
114
|
-
|
|
115
|
-
#
|
|
116
|
-
|
|
180
|
+
# =============================================================================
|
|
181
|
+
# Pydantic Workaround
|
|
182
|
+
# =============================================================================
|
|
183
|
+
# Workaround for Pydantic's validate_call decorator not working directly with
|
|
184
|
+
# classmethods that have forward references. We extract the underlying function,
|
|
185
|
+
# wrap it with validate_call, and re-create the classmethod.
|
|
186
|
+
_raw = FileSpec.create_filespecs.__func__ # type: ignore[attr-defined]
|
|
187
|
+
FileSpec.create_filespecs = classmethod(validate_call(_raw)) # type: ignore[arg-type]
|