deriva-ml 1.14.47__py3-none-any.whl → 1.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/.DS_Store +0 -0
- deriva_ml/__init__.py +59 -30
- deriva_ml/core/__init__.py +2 -2
- deriva_ml/core/base.py +28 -16
- deriva_ml/core/config.py +67 -0
- deriva_ml/dataset/__init__.py +10 -2
- deriva_ml/dataset/aux_classes.py +31 -2
- deriva_ml/dataset/dataset.py +7 -5
- deriva_ml/dataset/dataset_bag.py +214 -106
- deriva_ml/dataset/upload.py +7 -4
- deriva_ml/demo_catalog.py +17 -3
- deriva_ml/execution/__init__.py +26 -0
- deriva_ml/execution/execution.py +50 -28
- deriva_ml/execution/execution_configuration.py +26 -31
- deriva_ml/execution/workflow.py +8 -0
- deriva_ml/model/catalog.py +119 -2
- deriva_ml/model/database.py +457 -83
- deriva_ml/protocols/dataset.py +19 -0
- deriva_ml/run_notebook.py +55 -50
- deriva_ml/schema/annotations.py +7 -5
- deriva_ml/test.py +94 -0
- {deriva_ml-1.14.47.dist-info → deriva_ml-1.17.0.dist-info}/METADATA +10 -7
- deriva_ml-1.17.0.dist-info/RECORD +45 -0
- deriva_ml/model/sql_mapper.py +0 -44
- deriva_ml-1.14.47.dist-info/RECORD +0 -42
- {deriva_ml-1.14.47.dist-info → deriva_ml-1.17.0.dist-info}/WHEEL +0 -0
- {deriva_ml-1.14.47.dist-info → deriva_ml-1.17.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.14.47.dist-info → deriva_ml-1.17.0.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.14.47.dist-info → deriva_ml-1.17.0.dist-info}/top_level.txt +0 -0
deriva_ml/.DS_Store
ADDED
Binary file
deriva_ml/__init__.py
CHANGED
@@ -1,45 +1,74 @@
-__all__ = [
-    "DerivaML",
-    "DerivaMLException",
-    "DerivaMLInvalidTerm",
-    "DerivaMLTableTypeError",
-    "Execution",
-    "ExecAssetType",
-    "ExecMetadataType",
-    "Workflow",
-    "DatasetBag",
-    "DatasetVersion",
-    "DatasetSpec",
-    "FileSpec",
-    "VersionPart",
-    "RID",
-    "BuiltinTypes",
-    "ColumnDefinition",
-    "MLVocab",
-    "MLAsset",
-    "TableDefinition",
-    "ExecutionConfiguration",
-]
-
 from importlib.metadata import PackageNotFoundError, version
+from typing import TYPE_CHECKING
 
-
+# Safe imports - no circular dependencies
+from deriva_ml.core.config import DerivaMLConfig
+from deriva_ml.core.definitions import (
     RID,
     BuiltinTypes,
     ColumnDefinition,
-
+    DerivaAssetColumns,
+    DerivaSystemColumns,
     ExecAssetType,
     ExecMetadataType,
     FileSpec,
+    FileUploadState,
+    ForeignKeyDefinition,
+    KeyDefinition,
     MLAsset,
     MLVocab,
     TableDefinition,
+    UploadState,
 )
-from deriva_ml.core.exceptions import
-
-
-
-
+from deriva_ml.core.exceptions import (
+    DerivaMLException,
+    DerivaMLInvalidTerm,
+    DerivaMLTableTypeError,
+)
+
+# Type-checking only - avoid circular import at runtime
+if TYPE_CHECKING:
+    from deriva_ml.core.base import DerivaML
+
+
+# Lazy import function for runtime usage
+def __getattr__(name):
+    """Lazy import to avoid circular dependencies."""
+    if name == "DerivaML":
+        from deriva_ml.core.base import DerivaML
+
+        return DerivaML
+    elif name == "Execution":
+        from deriva_ml.execution.execution import Execution
+
+        return Execution
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+__all__ = [
+    "DerivaML",  # Lazy-loaded
+    "DerivaMLConfig",
+    # Exceptions
+    "DerivaMLException",
+    "DerivaMLInvalidTerm",
+    "DerivaMLTableTypeError",
+    # Definitions
+    "RID",
+    "BuiltinTypes",
+    "ColumnDefinition",
+    "DerivaSystemColumns",
+    "DerivaAssetColumns",
+    "ExecAssetType",
+    "ExecMetadataType",
+    "FileSpec",
+    "FileUploadState",
+    "ForeignKeyDefinition",
+    "KeyDefinition",
+    "MLAsset",
+    "MLVocab",
+    "TableDefinition",
+    "UploadState",
+]
 
 try:
     __version__ = version("deriva_ml")
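The new module-level `__getattr__` is the PEP 562 lazy-import pattern: importing `deriva_ml` no longer loads `deriva_ml.core.base` (and its Deriva dependencies) until `DerivaML` or `Execution` is first accessed. A minimal sketch of the caller-side behavior, assuming deriva_ml and its dependencies are installed:

```python
import deriva_ml

DerivaML = deriva_ml.DerivaML                # first access triggers __getattr__("DerivaML")
Execution = getattr(deriva_ml, "Execution")  # likewise resolved lazily

try:
    getattr(deriva_ml, "NoSuchName")         # any other unknown name raises AttributeError
except AttributeError as err:
    print(err)
```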
deriva_ml/core/__init__.py
CHANGED
@@ -1,4 +1,5 @@
 from deriva_ml.core.base import DerivaML
+from deriva_ml.core.config import DerivaMLConfig
 from deriva_ml.core.definitions import (
     RID,
     BuiltinTypes,
@@ -17,12 +18,11 @@ from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm, De
 
 __all__ = [
     "DerivaML",
-
+    "DerivaMLConfig",
     # Exceptions
     "DerivaMLException",
     "DerivaMLInvalidTerm",
     "DerivaMLTableTypeError",
-
     # Definitions
     "RID",
     "BuiltinTypes",
deriva_ml/core/base.py
CHANGED
@@ -15,12 +15,11 @@ from __future__ import annotations # noqa: I001
 
 # Standard library imports
 from collections import defaultdict
-import getpass
 import logging
 from datetime import datetime
 from itertools import chain
 from pathlib import Path
-from typing import Dict, Iterable, List, cast, TYPE_CHECKING, Any
+from typing import Dict, Iterable, List, cast, TYPE_CHECKING, Any, Self
 from urllib.parse import urlsplit
 
 
@@ -29,18 +28,14 @@ import requests
 from pydantic import ConfigDict, validate_call
 
 # Deriva imports
-from deriva.core import (
-    DEFAULT_SESSION_CONFIG,
-    format_exception,
-    get_credential,
-    urlquote,
-)
+from deriva.core import DEFAULT_SESSION_CONFIG, format_exception, get_credential, urlquote
 
 import deriva.core.datapath as datapath
 from deriva.core.datapath import DataPathException, _SchemaWrapper as SchemaWrapper
 from deriva.core.deriva_server import DerivaServer
 from deriva.core.ermrest_catalog import ResolveRidResult
 from deriva.core.ermrest_model import Key, Table
+from deriva.core.utils.core_utils import DEFAULT_LOGGER_OVERRIDES
 from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
 
 from deriva_ml.core.exceptions import DerivaMLInvalidTerm
@@ -55,6 +50,7 @@ from deriva_ml.core.definitions import (
     TableDefinition,
     VocabularyTerm,
 )
+from deriva_ml.core.config import DerivaMLConfig
 from deriva_ml.core.exceptions import DerivaMLTableTypeError, DerivaMLException
 from deriva_ml.dataset.aux_classes import DatasetSpec
 from deriva_ml.dataset.dataset import Dataset
@@ -108,6 +104,10 @@ class DerivaML(Dataset):
    >>> ml.add_term('vocabulary_table', 'new_term', description='Description of term')
    """
 
+    @classmethod
+    def instantiate(cls, config: DerivaMLConfig) -> Self:
+        return cls(**config.model_dump())
+
    def __init__(
        self,
        hostname: str,
@@ -116,8 +116,10 @@ class DerivaML(Dataset):
        project_name: str | None = None,
        cache_dir: str | Path | None = None,
        working_dir: str | Path | None = None,
+        hydra_runtime_output_dir: str | Path | None = None,
        ml_schema: str = ML_SCHEMA,
        logging_level=logging.WARNING,
+        deriva_logging_level=logging.WARNING,
        credential=None,
        use_minid: bool = True,
        check_auth: bool = True,
@@ -152,7 +154,6 @@ class DerivaML(Dataset):
            credentials=self.credential,
            session_config=self._get_session_config(),
        )
-
        try:
            if check_auth and server.get_authn_session():
                pass
@@ -161,17 +162,14 @@ class DerivaML(Dataset):
                "You are not authorized to access this catalog. "
                "Please check your credentials and make sure you have logged in."
            )
-
        self.catalog = server.connect_ermrest(catalog_id)
        self.model = DerivaModel(self.catalog.getCatalogModel(), domain_schema=domain_schema)
 
        # Set up working and cache directories
-
-        self.working_dir = (
-            Path(working_dir) / getpass.getuser() if working_dir else Path.home() / "deriva-ml"
-        ) / default_workdir
-
+        self.working_dir = DerivaMLConfig.compute_workdir(working_dir)
        self.working_dir.mkdir(parents=True, exist_ok=True)
+        self.hydra_runtime_output_dir = hydra_runtime_output_dir
+
        self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
        self.cache_dir.mkdir(parents=True, exist_ok=True)
 
@@ -181,6 +179,15 @@ class DerivaML(Dataset):
        # Set up logging
        self._logger = logging.getLogger("deriva_ml")
        self._logger.setLevel(logging_level)
+        self._logging_level = logging_level
+        self._deriva_logging_level = deriva_logging_level
+
+        # Configure deriva logging level
+        logger_config = DEFAULT_LOGGER_OVERRIDES
+        # allow for reconfiguration of module-specific logging levels
+        [logging.getLogger(name).setLevel(level) for name, level in logger_config.items()]
+        logging.getLogger("bagit").setLevel(deriva_logging_level)
+        logging.getLogger("bdbag").setLevel(deriva_logging_level)
 
        # Store instance configuration
        self.host_name = hostname
@@ -1081,7 +1088,12 @@ class DerivaML(Dataset):
        return self._download_dataset_bag(
            dataset=dataset,
            execution_rid=execution_rid,
-            snapshot_catalog=DerivaML(
+            snapshot_catalog=DerivaML(
+                self.host_name,
+                self._version_snapshot(dataset),
+                logging_level=self._logging_level,
+                deriva_logging_level=self._deriva_logging_level,
+            ),
        )
 
    def _update_status(self, new_status: Status, status_detail: str, execution_rid: RID):
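The constructor gains `hydra_runtime_output_dir` and `deriva_logging_level` parameters, and the new `instantiate()` classmethod builds a `DerivaML` from a `DerivaMLConfig`. A short sketch of a direct constructor call using the new logging options; the hostname and catalog_id are placeholders for a real Deriva deployment, and a valid credential/login session is assumed:

```python
import logging

from deriva_ml import DerivaML

ml = DerivaML(
    hostname="deriva.example.org",        # hypothetical host
    catalog_id=1,
    logging_level=logging.INFO,           # level for the deriva_ml logger
    deriva_logging_level=logging.ERROR,   # applied to bagit/bdbag and deriva module loggers
)
```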
deriva_ml/core/config.py
ADDED
@@ -0,0 +1,67 @@
+import logging
+from pathlib import Path
+from typing import Any
+
+from hydra.conf import HydraConf, RunDir
+from hydra.core.hydra_config import HydraConfig
+from hydra_zen import store
+from omegaconf import OmegaConf
+from pydantic import BaseModel, model_validator
+
+from deriva_ml.core.definitions import ML_SCHEMA
+
+
+class DerivaMLConfig(BaseModel):
+    hostname: str
+    catalog_id: str | int = 1
+    domain_schema: str | None = None
+    project_name: str | None = None
+    cache_dir: str | Path | None = None
+    working_dir: str | Path | None = None
+    hydra_runtime_output_dir: str | Path | None = None
+    ml_schema: str = ML_SCHEMA
+    logging_level: Any = logging.WARNING
+    deriva_logging_level: Any = logging.WARNING
+    credential: Any = None
+    use_minid: bool = True
+    check_auth: bool = True
+
+    @model_validator(mode="after")
+    def init_working_dir(self):
+        """
+        Sets up the working directory for the model.
+
+        This method configures the working directory, ensuring that all required
+        file operations are performed in the appropriate location. If the user does not
+        specify a directory, a default directory based on the user's home directory
+        or username will be used.
+
+        This is a repeat of what is in the DerivaML.__init__ bu we put this here so that the working
+        directory is available to hydra.
+
+        Returns:
+            Self: The object instance with the working directory initialized.
+        """
+
+        self.working_dir = DerivaMLConfig.compute_workdir(self.working_dir)
+        self.hydra_runtime_output_dir = Path(HydraConfig.get().runtime.output_dir)
+        return self
+
+    @staticmethod
+    def compute_workdir(working_dir) -> Path:
+        # Create a default working directory if none is provided
+        working_dir = Path(working_dir) if working_dir else Path.home() / "deriva-ml"
+        return working_dir.absolute()
+
+
+OmegaConf.register_new_resolver("compute_workdir", DerivaMLConfig.compute_workdir, replace=True)
+store(
+    HydraConf(
+        run=RunDir("${compute_workdir:${deriva_ml.working_dir}}/hydra/${now:%Y-%m-%d_%H-%M-%S}"),
+        output_subdir="hydra-config",
+    ),
+    group="hydra",
+    name="config",
+)
+
+store.add_to_hydra_store()
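`compute_workdir` is a pure path helper, also registered as the `compute_workdir` OmegaConf resolver used in the Hydra run-dir template above. A minimal sketch of the default it produces, assuming hydra, hydra-zen, and omegaconf are installed so the module imports cleanly:

```python
from deriva_ml.core.config import DerivaMLConfig

print(DerivaMLConfig.compute_workdir(None))       # <home>/deriva-ml, as an absolute Path
print(DerivaMLConfig.compute_workdir("scratch"))  # absolute path to ./scratch
```

Note that the `init_working_dir` validator calls `HydraConfig.get()`, which is only populated inside a Hydra run, so constructing a `DerivaMLConfig` directly outside a Hydra application appears to fail at validation time.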
deriva_ml/dataset/__init__.py
CHANGED
@@ -1,4 +1,12 @@
-from .aux_classes import DatasetSpec
+from .aux_classes import DatasetSpec, DatasetSpecConfig, DatasetVersion, VersionPart
 from .dataset import Dataset
+from .dataset_bag import DatasetBag
 
-__all__ = [
+__all__ = [
+    "Dataset",
+    "DatasetSpec",
+    "DatasetSpecConfig",
+    "DatasetBag",
+    "DatasetVersion",
+    "VersionPart",
+]
deriva_ml/dataset/aux_classes.py
CHANGED
@@ -3,8 +3,9 @@ THis module defines the DataSet class with is used to manipulate n
 """
 
 from enum import Enum
-from typing import Any, Optional, SupportsInt
+from typing import Any, Optional, SupportsInt, overload
 
+from hydra_zen import hydrated_dataclass
 from pydantic import (
     BaseModel,
     ConfigDict,
@@ -42,6 +43,9 @@ class DatasetVersion(Version):
        replace(major, minor, patch): Replace the major and minor versions
    """
 
+    @overload
+    def __init__(self, version: str): ...
+    @overload
    def __init__(self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0):
        """Initialize a DatasetVersion object.
 
@@ -50,6 +54,21 @@ class DatasetVersion(Version):
            minor: Minor version number. Used to indicate additional members added, or change in member values.
            patch: Patch number of the dataset. Used to indicate minor clean-up and edits
        """
+        ...
+
+    def __init__(self, *args):
+        """Initialize a DatasetVersion object.
+
+        Args:
+            major: Major version number. Used to indicate schema changes.
+            minor: Minor version number. Used to indicate additional members added, or change in member values.
+            patch: Patch number of the dataset. Used to indicate minor clean-up and edits
+        """
+        if len(args) == 1 and isinstance(args[0], str):
+            v = Version.parse(args[0])
+            major, minor, patch = v.major, v.minor, v.patch
+        else:
+            major, minor, patch = args
        super().__init__(major, minor, patch)
 
    def to_dict(self) -> dict[str, Any]:
@@ -182,8 +201,9 @@ class DatasetSpec(BaseModel):
    """
 
    rid: RID
-    materialize: bool = True
    version: DatasetVersion | conlist(item_type=int, min_length=3, max_length=3) | tuple[int, int, int] | str
+    materialize: bool = True
+    description: str = ""
 
    model_config = ConfigDict(arbitrary_types_allowed=True)
 
@@ -208,3 +228,12 @@ class DatasetSpec(BaseModel):
    @field_serializer("version")
    def serialize_version(self, version: DatasetVersion) -> dict[str, Any]:
        return version.to_dict()
+
+
+# Interface for hydra-zen
+@hydrated_dataclass(DatasetSpec)
+class DatasetSpecConfig:
+    rid: str
+    version: str
+    materialize: bool = True
+    description: str = ""
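With the overloaded constructor, a `DatasetVersion` can now be built from either three integers or a version string, and `DatasetSpec` gains a `description` field. A small sketch of both forms; the RID value is hypothetical:

```python
from deriva_ml.dataset.aux_classes import DatasetSpec, DatasetVersion

v_parts = DatasetVersion(1, 2, 3)
v_string = DatasetVersion("1.2.3")   # parsed via Version.parse
assert str(v_parts) == str(v_string)

spec = DatasetSpec(rid="1-ABCD", version="1.2.3", description="training split")
```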
deriva_ml/dataset/dataset.py
CHANGED
@@ -22,14 +22,16 @@ Typical usage example:
 
 from __future__ import annotations
 
-# Standard library imports
 import json
 import logging
 from collections import defaultdict
+
+# Standard library imports
 from graphlib import TopologicalSorter
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator
+from urllib.parse import urlparse
 
 import deriva.core.utils.hash_utils as hash_utils
 import requests
@@ -1039,7 +1041,6 @@ class Dataset:
                envars={"RID": dataset.rid},
            )
            minid_page_url = exporter.export()[0]  # Get the MINID launch page
-
        except (
            DerivaDownloadError,
            DerivaDownloadConfigurationError,
@@ -1095,7 +1096,8 @@ class Dataset:
 
        # Check or create MINID
        minid_url = version_record.minid
-
+        # If we either don't have a MINID, or we have a MINID, but we don't want to use it, generate a new one.
+        if (not minid_url) or (not self._use_minid):
            if not create:
                raise DerivaMLException(f"Minid for dataset {rid} doesn't exist")
            if self._use_minid:
@@ -1105,7 +1107,6 @@ class Dataset:
        # Return based on MINID usage
        if self._use_minid:
            return self._fetch_minid_metadata(minid_url, dataset.version)
-
        return DatasetMinid(
            dataset_version=dataset.version,
            RID=f"{rid}@{version_record.snapshot}",
@@ -1138,7 +1139,8 @@ class Dataset:
        with TemporaryDirectory() as tmp_dir:
            if self._use_minid:
                # Get bag from S3
-
+                bag_path = Path(tmp_dir) / Path(urlparse(minid.bag_url).path).name
+                archive_path = fetch_single_file(minid.bag_url, output_path=bag_path)
            else:
                exporter = DerivaExport(host=self._model.catalog.deriva_server.server, output_dir=tmp_dir)
                archive_path = exporter.retrieve_file(minid.bag_url)
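The MINID download path now derives the local archive name from the bag URL with `urlparse` before fetching it into the temporary directory. A stand-alone illustration of that path logic; the URL and directory are made up for the example:

```python
from pathlib import Path
from urllib.parse import urlparse

bag_url = "https://bags.example.org/minid/Dataset_1-ABCD_1.2.0.zip"
tmp_dir = "/tmp/bag-download"
bag_path = Path(tmp_dir) / Path(urlparse(bag_url).path).name
print(bag_path)  # /tmp/bag-download/Dataset_1-ABCD_1.2.0.zip
```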