deriva-ml 1.17.14__py3-none-any.whl → 1.17.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +2 -2
- deriva_ml/asset/asset.py +0 -4
- deriva_ml/catalog/__init__.py +6 -0
- deriva_ml/catalog/clone.py +1591 -38
- deriva_ml/catalog/localize.py +66 -29
- deriva_ml/core/base.py +12 -9
- deriva_ml/core/definitions.py +13 -12
- deriva_ml/core/ermrest.py +11 -12
- deriva_ml/core/mixins/annotation.py +2 -2
- deriva_ml/core/mixins/asset.py +3 -3
- deriva_ml/core/mixins/dataset.py +3 -3
- deriva_ml/core/mixins/execution.py +1 -0
- deriva_ml/core/mixins/feature.py +2 -2
- deriva_ml/core/mixins/file.py +2 -2
- deriva_ml/core/mixins/path_builder.py +2 -2
- deriva_ml/core/mixins/rid_resolution.py +2 -2
- deriva_ml/core/mixins/vocabulary.py +2 -2
- deriva_ml/core/mixins/workflow.py +3 -3
- deriva_ml/dataset/catalog_graph.py +3 -4
- deriva_ml/dataset/dataset.py +5 -3
- deriva_ml/dataset/dataset_bag.py +0 -2
- deriva_ml/dataset/upload.py +2 -2
- deriva_ml/demo_catalog.py +0 -1
- deriva_ml/execution/__init__.py +8 -8
- deriva_ml/execution/base_config.py +2 -2
- deriva_ml/execution/execution.py +5 -3
- deriva_ml/execution/execution_record.py +0 -1
- deriva_ml/execution/model_protocol.py +1 -1
- deriva_ml/execution/multirun_config.py +0 -1
- deriva_ml/execution/runner.py +3 -3
- deriva_ml/experiment/experiment.py +3 -3
- deriva_ml/feature.py +2 -2
- deriva_ml/interfaces.py +2 -2
- deriva_ml/model/__init__.py +45 -24
- deriva_ml/model/annotations.py +0 -1
- deriva_ml/model/catalog.py +3 -2
- deriva_ml/model/data_loader.py +330 -0
- deriva_ml/model/data_sources.py +439 -0
- deriva_ml/model/database.py +216 -32
- deriva_ml/model/fk_orderer.py +379 -0
- deriva_ml/model/handles.py +1 -1
- deriva_ml/model/schema_builder.py +816 -0
- deriva_ml/run_model.py +3 -3
- deriva_ml/schema/annotations.py +2 -1
- deriva_ml/schema/create_schema.py +1 -1
- deriva_ml/schema/validation.py +1 -1
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/METADATA +1 -1
- deriva_ml-1.17.16.dist-info/RECORD +81 -0
- deriva_ml-1.17.14.dist-info/RECORD +0 -77
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/WHEEL +0 -0
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/top_level.txt +0 -0
deriva_ml/catalog/localize.py
CHANGED
|
@@ -7,7 +7,8 @@ import tempfile
|
|
|
7
7
|
from dataclasses import dataclass, field
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
from typing import TYPE_CHECKING
|
|
10
|
-
from urllib.parse import
|
|
10
|
+
from urllib.parse import quote as urlquote
|
|
11
|
+
from urllib.parse import urlparse
|
|
11
12
|
|
|
12
13
|
from deriva.core import ErmrestCatalog, HatracStore, get_credential
|
|
13
14
|
|
|
@@ -44,19 +45,20 @@ def localize_assets(
|
|
|
44
45
|
hatrac_namespace: str | None = None,
|
|
45
46
|
chunk_size: int | None = None,
|
|
46
47
|
dry_run: bool = False,
|
|
48
|
+
source_hostname: str | None = None,
|
|
47
49
|
) -> LocalizeResult:
|
|
48
50
|
"""Localize remote hatrac assets to the local catalog server.
|
|
49
51
|
|
|
50
|
-
Downloads assets from remote hatrac servers
|
|
51
|
-
asset
|
|
52
|
-
table URLs to point to the local copies.
|
|
52
|
+
Downloads assets from remote hatrac servers and uploads them to the local
|
|
53
|
+
hatrac server, updating the asset table URLs to point to the local copies.
|
|
53
54
|
|
|
54
55
|
This is useful after cloning a catalog with asset_mode="refs" where the
|
|
55
|
-
asset URLs still point to the source server
|
|
56
|
-
the assets fully local.
|
|
56
|
+
asset URLs still point to the source server (either as absolute URLs or
|
|
57
|
+
as relative hatrac paths). Use this function to make the assets fully local.
|
|
57
58
|
|
|
58
|
-
The source hatrac server for each asset is determined
|
|
59
|
-
the URL
|
|
59
|
+
The source hatrac server for each asset is determined:
|
|
60
|
+
1. From the URL if it's an absolute URL (e.g., https://source.org/hatrac/...)
|
|
61
|
+
2. From the source_hostname parameter if the URL is relative (e.g., /hatrac/...)
|
|
60
62
|
|
|
61
63
|
This function is optimized for bulk operations:
|
|
62
64
|
- Fetches all asset records in a single query
|
|
@@ -75,6 +77,9 @@ def localize_assets(
|
|
|
75
77
|
chunk_size: Optional chunk size in bytes for large file uploads. If None,
|
|
76
78
|
uses default chunking behavior.
|
|
77
79
|
dry_run: If True, only report what would be done without making changes.
|
|
80
|
+
source_hostname: Hostname to use for assets with relative URLs (e.g.,
|
|
81
|
+
"www.facebase.org"). Required when localizing assets cloned with
|
|
82
|
+
asset_mode="refs" from a different server.
|
|
78
83
|
|
|
79
84
|
Returns:
|
|
80
85
|
LocalizeResult with counts and details of the operation.
|
|
@@ -93,6 +98,15 @@ def localize_assets(
|
|
|
93
98
|
... )
|
|
94
99
|
>>> print(f"Localized {result.assets_processed} assets")
|
|
95
100
|
|
|
101
|
+
Localize assets cloned from another server with relative URLs:
|
|
102
|
+
>>> result = localize_assets(
|
|
103
|
+
... ml,
|
|
104
|
+
... asset_table="file",
|
|
105
|
+
... asset_rids=["TG0", "TG2"],
|
|
106
|
+
... schema_name="isa",
|
|
107
|
+
... source_hostname="www.facebase.org", # Where the hatrac files are
|
|
108
|
+
... )
|
|
109
|
+
|
|
96
110
|
Localize using ErmrestCatalog:
|
|
97
111
|
>>> from deriva.core import DerivaServer
|
|
98
112
|
>>> server = DerivaServer("https", "localhost")
|
|
@@ -131,6 +145,12 @@ def localize_assets(
|
|
|
131
145
|
# Build a map of RID -> record for easy lookup
|
|
132
146
|
records_by_rid = {r["RID"]: r for r in all_records}
|
|
133
147
|
|
|
148
|
+
# Detect URL column name from first record (try URL first, then url)
|
|
149
|
+
url_column = "URL"
|
|
150
|
+
if all_records:
|
|
151
|
+
if "URL" not in all_records[0] and "url" in all_records[0]:
|
|
152
|
+
url_column = "url"
|
|
153
|
+
|
|
134
154
|
# Identify which assets need to be localized
|
|
135
155
|
assets_to_localize = []
|
|
136
156
|
for rid in asset_rids:
|
|
@@ -140,22 +160,29 @@ def localize_assets(
|
|
|
140
160
|
result.assets_skipped += 1
|
|
141
161
|
continue
|
|
142
162
|
|
|
143
|
-
|
|
163
|
+
# Try both URL and url column names (different catalogs use different conventions)
|
|
164
|
+
current_url = record.get("URL") or record.get("url")
|
|
144
165
|
if not current_url:
|
|
145
|
-
logger.warning(f"Asset {rid} has no URL, skipping")
|
|
166
|
+
logger.warning(f"Asset {rid} has no URL column, skipping")
|
|
146
167
|
result.assets_skipped += 1
|
|
147
168
|
continue
|
|
148
169
|
|
|
149
170
|
# Parse the URL to get source hostname
|
|
150
171
|
parsed_url = urlparse(current_url)
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
if not
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
172
|
+
asset_source_hostname = parsed_url.netloc
|
|
173
|
+
|
|
174
|
+
if not asset_source_hostname:
|
|
175
|
+
# URL is relative (e.g., /hatrac/facebase/data/...)
|
|
176
|
+
if source_hostname:
|
|
177
|
+
# Use provided source_hostname for relative URLs
|
|
178
|
+
asset_source_hostname = source_hostname
|
|
179
|
+
logger.info(f"Asset {rid} has relative URL, using source_hostname={source_hostname}")
|
|
180
|
+
else:
|
|
181
|
+
logger.info(f"Asset {rid} has relative URL, already local (specify source_hostname to localize)")
|
|
182
|
+
result.assets_skipped += 1
|
|
183
|
+
continue
|
|
157
184
|
|
|
158
|
-
if
|
|
185
|
+
if asset_source_hostname == hostname:
|
|
159
186
|
logger.info(f"Asset {rid} is already local, skipping")
|
|
160
187
|
result.assets_skipped += 1
|
|
161
188
|
continue
|
|
@@ -170,7 +197,7 @@ def localize_assets(
|
|
|
170
197
|
assets_to_localize.append({
|
|
171
198
|
"rid": rid,
|
|
172
199
|
"record": record,
|
|
173
|
-
"source_hostname":
|
|
200
|
+
"source_hostname": asset_source_hostname,
|
|
174
201
|
"source_path": source_path,
|
|
175
202
|
"current_url": current_url,
|
|
176
203
|
})
|
|
@@ -209,8 +236,9 @@ def localize_assets(
|
|
|
209
236
|
source_hostname = asset_info["source_hostname"]
|
|
210
237
|
source_path = asset_info["source_path"]
|
|
211
238
|
current_url = asset_info["current_url"]
|
|
212
|
-
|
|
213
|
-
|
|
239
|
+
# Handle case variations in column names
|
|
240
|
+
filename = record.get("Filename") or record.get("filename")
|
|
241
|
+
md5 = record.get("MD5") or record.get("md5")
|
|
214
242
|
|
|
215
243
|
logger.info(f"[{i+1}/{len(assets_to_localize)}] Localizing {rid}: {filename} from {source_hostname}")
|
|
216
244
|
|
|
@@ -232,16 +260,22 @@ def localize_assets(
|
|
|
232
260
|
# Upload to local hatrac
|
|
233
261
|
dest_path = f"{hatrac_namespace}/{md5}.{filename}" if md5 and filename else f"{hatrac_namespace}/{rid}"
|
|
234
262
|
|
|
263
|
+
# Enable chunking for large files (> 100MB) by default
|
|
264
|
+
file_size = local_file.stat().st_size
|
|
265
|
+
default_chunk_size = 50 * 1024 * 1024 # 50MB chunks
|
|
266
|
+
use_chunked = chunk_size is not None or file_size > 100 * 1024 * 1024
|
|
267
|
+
actual_chunk_size = chunk_size or default_chunk_size
|
|
268
|
+
|
|
235
269
|
new_url = local_hatrac.put_loc(
|
|
236
270
|
dest_path,
|
|
237
271
|
str(local_file),
|
|
238
272
|
headers={"Content-Disposition": f"filename*=UTF-8''{urlquote(filename or 'asset')}"},
|
|
239
|
-
chunked=
|
|
240
|
-
chunk_size=
|
|
273
|
+
chunked=use_chunked,
|
|
274
|
+
chunk_size=actual_chunk_size if use_chunked else 0,
|
|
241
275
|
)
|
|
242
276
|
|
|
243
|
-
# Queue the catalog update
|
|
244
|
-
catalog_updates.append({"RID": rid,
|
|
277
|
+
# Queue the catalog update using the detected URL column name
|
|
278
|
+
catalog_updates.append({"RID": rid, url_column: new_url})
|
|
245
279
|
|
|
246
280
|
logger.info(f"Localized asset {rid}: {current_url} -> {new_url}")
|
|
247
281
|
result.assets_processed += 1
|
|
@@ -257,20 +291,23 @@ def localize_assets(
|
|
|
257
291
|
result.errors.append(error_msg)
|
|
258
292
|
result.assets_failed += 1
|
|
259
293
|
|
|
260
|
-
# Batch update the catalog records
|
|
294
|
+
# Batch update the catalog records using datapath
|
|
261
295
|
if catalog_updates:
|
|
262
296
|
logger.info(f"Updating {len(catalog_updates)} catalog records...")
|
|
263
297
|
try:
|
|
264
|
-
table_path.
|
|
265
|
-
|
|
298
|
+
# Use datapath update - table_path.update() handles the update correctly
|
|
299
|
+
table_path.update(catalog_updates)
|
|
300
|
+
logger.info(f"Updated {len(catalog_updates)} catalog records successfully")
|
|
266
301
|
except Exception as e:
|
|
267
302
|
# If batch update fails, try individual updates as fallback
|
|
268
303
|
logger.warning(f"Batch update failed ({e}), falling back to individual updates...")
|
|
269
304
|
for update in catalog_updates:
|
|
305
|
+
rid = update["RID"]
|
|
270
306
|
try:
|
|
271
|
-
table_path.
|
|
307
|
+
table_path.update([update])
|
|
308
|
+
logger.info(f"Updated catalog record {rid}")
|
|
272
309
|
except Exception as e2:
|
|
273
|
-
error_msg = f"Failed to update catalog record {
|
|
310
|
+
error_msg = f"Failed to update catalog record {rid}: {e2}"
|
|
274
311
|
logger.error(error_msg)
|
|
275
312
|
result.errors.append(error_msg)
|
|
276
313
|
|
deriva_ml/core/base.py
CHANGED
|
@@ -47,20 +47,20 @@ from deriva_ml.core.config import DerivaMLConfig
|
|
|
47
47
|
from deriva_ml.core.definitions import ML_SCHEMA, RID, Status, TableDefinition, VocabularyTableDef
|
|
48
48
|
from deriva_ml.core.exceptions import DerivaMLException
|
|
49
49
|
from deriva_ml.core.logging_config import apply_logger_overrides, configure_logging
|
|
50
|
-
from deriva_ml.dataset.upload import bulk_upload_configuration
|
|
51
|
-
from deriva_ml.interfaces import DerivaMLCatalog
|
|
52
50
|
from deriva_ml.core.mixins import (
|
|
53
51
|
AnnotationMixin,
|
|
54
|
-
VocabularyMixin,
|
|
55
|
-
RidResolutionMixin,
|
|
56
|
-
PathBuilderMixin,
|
|
57
|
-
WorkflowMixin,
|
|
58
|
-
FeatureMixin,
|
|
59
|
-
DatasetMixin,
|
|
60
52
|
AssetMixin,
|
|
53
|
+
DatasetMixin,
|
|
61
54
|
ExecutionMixin,
|
|
55
|
+
FeatureMixin,
|
|
62
56
|
FileMixin,
|
|
57
|
+
PathBuilderMixin,
|
|
58
|
+
RidResolutionMixin,
|
|
59
|
+
VocabularyMixin,
|
|
60
|
+
WorkflowMixin,
|
|
63
61
|
)
|
|
62
|
+
from deriva_ml.dataset.upload import bulk_upload_configuration
|
|
63
|
+
from deriva_ml.interfaces import DerivaMLCatalog
|
|
64
64
|
|
|
65
65
|
# Optional debug imports
|
|
66
66
|
try:
|
|
@@ -74,6 +74,7 @@ if TYPE_CHECKING:
|
|
|
74
74
|
from deriva_ml.catalog.clone import CatalogProvenance
|
|
75
75
|
from deriva_ml.execution.execution import Execution
|
|
76
76
|
from deriva_ml.model.catalog import DerivaModel
|
|
77
|
+
from deriva_ml.schema.validation import SchemaValidationReport
|
|
77
78
|
|
|
78
79
|
# Stop pycharm from complaining about undefined references.
|
|
79
80
|
ml: DerivaML
|
|
@@ -1098,6 +1099,7 @@ class DerivaML(
|
|
|
1098
1099
|
... print(f"{d['execution_rid']}: {d['size_mb']:.1f} MB")
|
|
1099
1100
|
"""
|
|
1100
1101
|
from datetime import datetime
|
|
1102
|
+
|
|
1101
1103
|
from deriva_ml.dataset.upload import upload_root
|
|
1102
1104
|
|
|
1103
1105
|
results = []
|
|
@@ -1155,6 +1157,7 @@ class DerivaML(
|
|
|
1155
1157
|
"""
|
|
1156
1158
|
import shutil
|
|
1157
1159
|
import time
|
|
1160
|
+
|
|
1158
1161
|
from deriva_ml.dataset.upload import upload_root
|
|
1159
1162
|
|
|
1160
1163
|
stats = {'dirs_removed': 0, 'bytes_freed': 0, 'errors': 0}
|
|
@@ -1292,7 +1295,7 @@ class DerivaML(
|
|
|
1292
1295
|
- deriva_ml.schema.validation.SchemaValidationReport
|
|
1293
1296
|
- deriva_ml.schema.validation.validate_ml_schema
|
|
1294
1297
|
"""
|
|
1295
|
-
from deriva_ml.schema.validation import
|
|
1298
|
+
from deriva_ml.schema.validation import validate_ml_schema
|
|
1296
1299
|
return validate_ml_schema(self, strict=strict)
|
|
1297
1300
|
|
|
1298
1301
|
# Methods moved to mixins:
|
deriva_ml/core/definitions.py
CHANGED
|
@@ -25,6 +25,9 @@ For more specialized imports, you can import directly from submodules:
|
|
|
25
25
|
|
|
26
26
|
from __future__ import annotations
|
|
27
27
|
|
|
28
|
+
# Also export BuiltinType directly (BuiltinTypes is the backwards-compatible alias)
|
|
29
|
+
from deriva.core.typed import BuiltinType
|
|
30
|
+
|
|
28
31
|
# =============================================================================
|
|
29
32
|
# Re-exported Constants
|
|
30
33
|
# =============================================================================
|
|
@@ -58,8 +61,6 @@ from deriva_ml.core.enums import (
|
|
|
58
61
|
Status,
|
|
59
62
|
UploadState,
|
|
60
63
|
)
|
|
61
|
-
# Also export BuiltinType directly (BuiltinTypes is the backwards-compatible alias)
|
|
62
|
-
from deriva.core.typed import BuiltinType
|
|
63
64
|
|
|
64
65
|
# =============================================================================
|
|
65
66
|
# Re-exported ERMrest Models
|
|
@@ -67,24 +68,24 @@ from deriva.core.typed import BuiltinType
|
|
|
67
68
|
# From ermrest.py: Dataclass-based models for catalog structure definitions
|
|
68
69
|
# New typed classes from deriva.core.typed
|
|
69
70
|
from deriva_ml.core.ermrest import (
|
|
70
|
-
# New dataclass-based definitions from deriva.core.typed
|
|
71
|
-
ColumnDef,
|
|
72
|
-
KeyDef,
|
|
73
|
-
ForeignKeyDef,
|
|
74
|
-
TableDef,
|
|
75
|
-
VocabularyTableDef,
|
|
76
71
|
AssetTableDef,
|
|
77
72
|
AssociationTableDef,
|
|
78
|
-
|
|
73
|
+
# New dataclass-based definitions from deriva.core.typed
|
|
74
|
+
ColumnDef,
|
|
79
75
|
# Legacy aliases for backwards compatibility
|
|
80
76
|
ColumnDefinition,
|
|
81
|
-
KeyDefinition,
|
|
82
|
-
ForeignKeyDefinition,
|
|
83
|
-
TableDefinition,
|
|
84
77
|
# DerivaML-specific classes
|
|
85
78
|
FileUploadState,
|
|
79
|
+
ForeignKeyDef,
|
|
80
|
+
ForeignKeyDefinition,
|
|
81
|
+
KeyDef,
|
|
82
|
+
KeyDefinition,
|
|
83
|
+
SchemaDef,
|
|
84
|
+
TableDef,
|
|
85
|
+
TableDefinition,
|
|
86
86
|
UploadCallback,
|
|
87
87
|
UploadProgress,
|
|
88
|
+
VocabularyTableDef,
|
|
88
89
|
VocabularyTerm,
|
|
89
90
|
VocabularyTermHandle,
|
|
90
91
|
)
|
deriva_ml/core/ermrest.py
CHANGED
|
@@ -21,6 +21,17 @@ import warnings
|
|
|
21
21
|
from dataclasses import dataclass
|
|
22
22
|
from typing import Any, Protocol
|
|
23
23
|
|
|
24
|
+
# Import and re-export typed definitions from deriva.core.typed
|
|
25
|
+
from deriva.core.typed import (
|
|
26
|
+
AssetTableDef,
|
|
27
|
+
AssociationTableDef,
|
|
28
|
+
ColumnDef,
|
|
29
|
+
ForeignKeyDef,
|
|
30
|
+
KeyDef,
|
|
31
|
+
SchemaDef,
|
|
32
|
+
TableDef,
|
|
33
|
+
VocabularyTableDef,
|
|
34
|
+
)
|
|
24
35
|
from pydantic import (
|
|
25
36
|
BaseModel,
|
|
26
37
|
Field,
|
|
@@ -31,18 +42,6 @@ from pydantic import (
|
|
|
31
42
|
from .constants import RID
|
|
32
43
|
from .enums import UploadState
|
|
33
44
|
|
|
34
|
-
# Import and re-export typed definitions from deriva.core.typed
|
|
35
|
-
from deriva.core.typed import (
|
|
36
|
-
ColumnDef,
|
|
37
|
-
KeyDef,
|
|
38
|
-
ForeignKeyDef,
|
|
39
|
-
TableDef,
|
|
40
|
-
VocabularyTableDef,
|
|
41
|
-
AssetTableDef,
|
|
42
|
-
AssociationTableDef,
|
|
43
|
-
SchemaDef,
|
|
44
|
-
)
|
|
45
|
-
|
|
46
45
|
# Re-export all typed classes for convenience
|
|
47
46
|
__all__ = [
|
|
48
47
|
# New typed definitions from deriva.core.typed
|
|
@@ -14,10 +14,10 @@ Annotation Tags:
|
|
|
14
14
|
|
|
15
15
|
from __future__ import annotations
|
|
16
16
|
|
|
17
|
-
from typing import TYPE_CHECKING, Any, Callable
|
|
18
|
-
|
|
19
17
|
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
20
18
|
import importlib
|
|
19
|
+
from typing import TYPE_CHECKING, Any, Callable
|
|
20
|
+
|
|
21
21
|
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
|
|
22
22
|
Column = _ermrest_model.Column
|
|
23
23
|
Table = _ermrest_model.Table
|
deriva_ml/core/mixins/asset.py
CHANGED
|
@@ -6,14 +6,14 @@ asset table operations including creating, listing, and looking up assets.
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
-
from typing import TYPE_CHECKING, Any, Callable, Iterable
|
|
10
|
-
|
|
11
9
|
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
12
10
|
import importlib
|
|
11
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterable
|
|
12
|
+
|
|
13
13
|
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
|
|
14
14
|
Table = _ermrest_model.Table
|
|
15
15
|
|
|
16
|
-
from deriva_ml.core.definitions import AssetTableDef, ColumnDefinition, MLVocab,
|
|
16
|
+
from deriva_ml.core.definitions import RID, AssetTableDef, ColumnDefinition, MLVocab, VocabularyTerm
|
|
17
17
|
from deriva_ml.core.exceptions import DerivaMLException
|
|
18
18
|
from deriva_ml.schema.annotations import asset_annotation
|
|
19
19
|
|
deriva_ml/core/mixins/dataset.py
CHANGED
|
@@ -7,16 +7,16 @@ deleting, and managing dataset elements.
|
|
|
7
7
|
|
|
8
8
|
from __future__ import annotations
|
|
9
9
|
|
|
10
|
-
from typing import TYPE_CHECKING, Any, Callable, Iterable
|
|
11
|
-
|
|
12
10
|
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
13
11
|
import importlib
|
|
12
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterable
|
|
13
|
+
|
|
14
14
|
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
|
|
15
15
|
Table = _ermrest_model.Table
|
|
16
16
|
|
|
17
17
|
from pydantic import ConfigDict, validate_call
|
|
18
18
|
|
|
19
|
-
from deriva_ml.core.definitions import RID
|
|
19
|
+
from deriva_ml.core.definitions import RID
|
|
20
20
|
from deriva_ml.core.exceptions import DerivaMLException, DerivaMLTableTypeError
|
|
21
21
|
from deriva_ml.dataset.aux_classes import DatasetSpec
|
|
22
22
|
|
deriva_ml/core/mixins/feature.py
CHANGED
|
@@ -7,11 +7,11 @@ and listing feature values.
|
|
|
7
7
|
|
|
8
8
|
from __future__ import annotations
|
|
9
9
|
|
|
10
|
+
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
11
|
+
import importlib
|
|
10
12
|
from itertools import chain
|
|
11
13
|
from typing import TYPE_CHECKING, Any, Callable, Iterable
|
|
12
14
|
|
|
13
|
-
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
14
|
-
import importlib
|
|
15
15
|
datapath = importlib.import_module("deriva.core.datapath")
|
|
16
16
|
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
|
|
17
17
|
Key = _ermrest_model.Key
|
deriva_ml/core/mixins/file.py
CHANGED
|
@@ -6,14 +6,14 @@ file operations including adding and listing files.
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
+
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
10
|
+
import importlib
|
|
9
11
|
from collections import defaultdict
|
|
10
12
|
from itertools import chain
|
|
11
13
|
from pathlib import Path
|
|
12
14
|
from typing import TYPE_CHECKING, Any, Callable, Iterable
|
|
13
15
|
from urllib.parse import urlsplit
|
|
14
16
|
|
|
15
|
-
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
16
|
-
import importlib
|
|
17
17
|
datapath = importlib.import_module("deriva.core.datapath")
|
|
18
18
|
|
|
19
19
|
from deriva_ml.core.definitions import RID, FileSpec, MLTable, MLVocab, VocabularyTerm
|
|
@@ -6,11 +6,11 @@ catalog path building and table access utilities.
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
+
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
10
|
+
import importlib
|
|
9
11
|
from pathlib import Path
|
|
10
12
|
from typing import TYPE_CHECKING, Any, Iterable
|
|
11
13
|
|
|
12
|
-
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
13
|
-
import importlib
|
|
14
14
|
datapath = importlib.import_module("deriva.core.datapath")
|
|
15
15
|
_ermrest_catalog = importlib.import_module("deriva.core.ermrest_catalog")
|
|
16
16
|
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
|
|
@@ -6,11 +6,11 @@ Resource Identifier (RID) resolution and retrieval operations.
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
+
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
10
|
+
import importlib
|
|
9
11
|
from dataclasses import dataclass
|
|
10
12
|
from typing import TYPE_CHECKING, Any
|
|
11
13
|
|
|
12
|
-
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
13
|
-
import importlib
|
|
14
14
|
_datapath = importlib.import_module("deriva.core.datapath")
|
|
15
15
|
_ermrest_catalog = importlib.import_module("deriva.core.ermrest_catalog")
|
|
16
16
|
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
|
|
@@ -7,10 +7,10 @@ controlled vocabulary tables.
|
|
|
7
7
|
|
|
8
8
|
from __future__ import annotations
|
|
9
9
|
|
|
10
|
-
from typing import TYPE_CHECKING, Any, Callable
|
|
11
|
-
|
|
12
10
|
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
13
11
|
import importlib
|
|
12
|
+
from typing import TYPE_CHECKING, Any, Callable
|
|
13
|
+
|
|
14
14
|
_datapath = importlib.import_module("deriva.core.datapath")
|
|
15
15
|
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
|
|
16
16
|
DataPathException = _datapath.DataPathException
|
|
@@ -7,10 +7,10 @@ and creating workflows.
|
|
|
7
7
|
|
|
8
8
|
from __future__ import annotations
|
|
9
9
|
|
|
10
|
-
from typing import TYPE_CHECKING, Any, Callable
|
|
11
|
-
|
|
12
10
|
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
13
11
|
import importlib
|
|
12
|
+
from typing import TYPE_CHECKING, Any, Callable
|
|
13
|
+
|
|
14
14
|
_deriva_core = importlib.import_module("deriva.core")
|
|
15
15
|
format_exception = _deriva_core.format_exception
|
|
16
16
|
|
|
@@ -19,7 +19,7 @@ from deriva_ml.core.exceptions import DerivaMLException
|
|
|
19
19
|
from deriva_ml.execution.workflow import Workflow
|
|
20
20
|
|
|
21
21
|
if TYPE_CHECKING:
|
|
22
|
-
|
|
22
|
+
pass
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
class WorkflowMixin:
|
|
@@ -9,6 +9,7 @@ from deriva.core.utils.core_utils import tag as deriva_tags
|
|
|
9
9
|
|
|
10
10
|
from deriva_ml.core.constants import RID
|
|
11
11
|
from deriva_ml.interfaces import DatasetLike, DerivaMLCatalog
|
|
12
|
+
from deriva_ml.model.catalog import ASSET_COLUMNS
|
|
12
13
|
|
|
13
14
|
try:
|
|
14
15
|
|
|
@@ -117,8 +118,7 @@ class CatalogGraph:
|
|
|
117
118
|
]
|
|
118
119
|
|
|
119
120
|
# If this table is an asset table, then we need to output the files associated with the asset.
|
|
120
|
-
|
|
121
|
-
if asset_columns.issubset({c.name for c in table.columns}):
|
|
121
|
+
if ASSET_COLUMNS.issubset({c.name for c in table.columns}):
|
|
122
122
|
exports.append(
|
|
123
123
|
{
|
|
124
124
|
"processor": "fetch",
|
|
@@ -168,8 +168,7 @@ class CatalogGraph:
|
|
|
168
168
|
]
|
|
169
169
|
|
|
170
170
|
# If this table is an asset table, then we need to output the files associated with the asset.
|
|
171
|
-
|
|
172
|
-
if asset_columns.issubset({c.name for c in table.columns}):
|
|
171
|
+
if ASSET_COLUMNS.issubset({c.name for c in table.columns}):
|
|
173
172
|
exports.append(
|
|
174
173
|
{
|
|
175
174
|
"source": {
|
deriva_ml/dataset/dataset.py
CHANGED
|
@@ -39,12 +39,15 @@ from pathlib import Path
|
|
|
39
39
|
# Local imports
|
|
40
40
|
from pprint import pformat
|
|
41
41
|
from tempfile import TemporaryDirectory
|
|
42
|
-
from typing import Any, Generator, Iterable, Self
|
|
42
|
+
from typing import TYPE_CHECKING, Any, Generator, Iterable, Self
|
|
43
43
|
from urllib.parse import urlparse
|
|
44
44
|
|
|
45
45
|
# Deriva imports
|
|
46
46
|
import deriva.core.utils.hash_utils as hash_utils
|
|
47
47
|
|
|
48
|
+
if TYPE_CHECKING:
|
|
49
|
+
from deriva_ml.execution.execution import Execution
|
|
50
|
+
|
|
48
51
|
# Third-party imports
|
|
49
52
|
import pandas as pd
|
|
50
53
|
import requests
|
|
@@ -581,7 +584,7 @@ class Dataset:
|
|
|
581
584
|
>>> ds = ml.lookup_dataset("4HM")
|
|
582
585
|
>>> ds.display_markdown(show_children=True)
|
|
583
586
|
"""
|
|
584
|
-
from IPython.display import
|
|
587
|
+
from IPython.display import Markdown, display
|
|
585
588
|
|
|
586
589
|
display(Markdown(self.to_markdown(show_children, indent)))
|
|
587
590
|
|
|
@@ -1331,7 +1334,6 @@ class Dataset:
|
|
|
1331
1334
|
... print(f"Execution {exe.execution_rid}: {exe.status}")
|
|
1332
1335
|
"""
|
|
1333
1336
|
# Import here to avoid circular dependency
|
|
1334
|
-
from deriva_ml.execution.execution import Execution
|
|
1335
1337
|
|
|
1336
1338
|
pb = self._ml_instance.pathBuilder()
|
|
1337
1339
|
dataset_execution_path = pb.schemas[self._ml_instance.ml_schema].Dataset_Execution
|
deriva_ml/dataset/dataset_bag.py
CHANGED
deriva_ml/dataset/upload.py
CHANGED
|
@@ -35,6 +35,8 @@ Here is the directory layout we support:
|
|
|
35
35
|
file1.jsonl, file2.jsonl
|
|
36
36
|
"""
|
|
37
37
|
|
|
38
|
+
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
39
|
+
import importlib
|
|
38
40
|
import json
|
|
39
41
|
import os
|
|
40
42
|
from pathlib import Path
|
|
@@ -43,8 +45,6 @@ from typing import Any, Callable, Optional
|
|
|
43
45
|
|
|
44
46
|
import regex as re
|
|
45
47
|
|
|
46
|
-
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
47
|
-
import importlib
|
|
48
48
|
_deriva_core = importlib.import_module("deriva.core")
|
|
49
49
|
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
|
|
50
50
|
_hatrac_store = importlib.import_module("deriva.core.hatrac_store")
|
deriva_ml/demo_catalog.py
CHANGED
|
@@ -20,7 +20,6 @@ from random import choice, randint, random
|
|
|
20
20
|
from tempfile import TemporaryDirectory
|
|
21
21
|
|
|
22
22
|
from deriva.core import BaseCLI, ErmrestCatalog
|
|
23
|
-
from deriva.core.ermrest_model import Schema, Table
|
|
24
23
|
from deriva.core.typed import BuiltinType, ColumnDef, SchemaDef, TableDef
|
|
25
24
|
from pydantic import BaseModel, ConfigDict
|
|
26
25
|
from requests.exceptions import HTTPError
|
deriva_ml/execution/__init__.py
CHANGED
|
@@ -4,27 +4,27 @@ from typing import TYPE_CHECKING
|
|
|
4
4
|
from deriva_ml.execution.base_config import (
|
|
5
5
|
BaseConfig,
|
|
6
6
|
DerivaBaseConfig,
|
|
7
|
+
# Config metadata helpers
|
|
8
|
+
DescribedList,
|
|
7
9
|
base_defaults,
|
|
8
10
|
get_notebook_configuration,
|
|
11
|
+
load_configs,
|
|
9
12
|
# New simplified API
|
|
10
13
|
notebook_config,
|
|
11
|
-
load_configs,
|
|
12
14
|
run_notebook,
|
|
13
|
-
# Config metadata helpers
|
|
14
|
-
DescribedList,
|
|
15
15
|
with_description,
|
|
16
16
|
)
|
|
17
|
+
from deriva_ml.execution.execution_configuration import AssetRID, ExecutionConfiguration
|
|
18
|
+
from deriva_ml.execution.model_protocol import DerivaMLModel
|
|
17
19
|
from deriva_ml.execution.multirun_config import (
|
|
18
20
|
MultirunSpec,
|
|
19
|
-
|
|
21
|
+
get_all_multirun_configs,
|
|
20
22
|
get_multirun_config,
|
|
21
23
|
list_multirun_configs,
|
|
22
|
-
|
|
24
|
+
multirun_config,
|
|
23
25
|
)
|
|
24
|
-
from deriva_ml.execution.
|
|
26
|
+
from deriva_ml.execution.runner import create_model_config, reset_multirun_state, run_model
|
|
25
27
|
from deriva_ml.execution.workflow import Workflow
|
|
26
|
-
from deriva_ml.execution.runner import run_model, create_model_config, reset_multirun_state
|
|
27
|
-
from deriva_ml.execution.model_protocol import DerivaMLModel
|
|
28
28
|
|
|
29
29
|
if TYPE_CHECKING:
|
|
30
30
|
from deriva_ml.execution.execution import Execution
|
|
@@ -44,9 +44,9 @@ import os
|
|
|
44
44
|
import pkgutil
|
|
45
45
|
from dataclasses import dataclass, field
|
|
46
46
|
from pathlib import Path
|
|
47
|
-
from typing import Any, TypeVar
|
|
47
|
+
from typing import TYPE_CHECKING, Any, TypeVar
|
|
48
48
|
|
|
49
|
-
from hydra_zen import builds, instantiate, launch,
|
|
49
|
+
from hydra_zen import builds, instantiate, launch, store
|
|
50
50
|
|
|
51
51
|
if TYPE_CHECKING:
|
|
52
52
|
from deriva_ml import DerivaML
|