deriva-ml 1.12.1__py3-none-any.whl → 1.12.3__py3-none-any.whl
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
- deriva_ml/dataset.py +9 -2
- deriva_ml/deriva_definitions.py +1 -0
- deriva_ml/execution.py +71 -29
- deriva_ml/upload.py +11 -12
- {deriva_ml-1.12.1.dist-info → deriva_ml-1.12.3.dist-info}/METADATA +1 -1
- {deriva_ml-1.12.1.dist-info → deriva_ml-1.12.3.dist-info}/RECORD +10 -10
- {deriva_ml-1.12.1.dist-info → deriva_ml-1.12.3.dist-info}/WHEEL +0 -0
- {deriva_ml-1.12.1.dist-info → deriva_ml-1.12.3.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.12.1.dist-info → deriva_ml-1.12.3.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.12.1.dist-info → deriva_ml-1.12.3.dist-info}/top_level.txt +0 -0
deriva_ml/dataset.py
CHANGED
|
@@ -41,7 +41,14 @@ from tempfile import TemporaryDirectory, NamedTemporaryFile
|
|
|
41
41
|
from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
|
|
42
42
|
|
|
43
43
|
from deriva_ml import DatasetBag
|
|
44
|
-
from .deriva_definitions import
|
|
44
|
+
from .deriva_definitions import (
|
|
45
|
+
ML_SCHEMA,
|
|
46
|
+
DerivaMLException,
|
|
47
|
+
MLVocab,
|
|
48
|
+
Status,
|
|
49
|
+
RID,
|
|
50
|
+
DRY_RUN_RID,
|
|
51
|
+
)
|
|
45
52
|
from .history import iso_to_snap
|
|
46
53
|
from .deriva_model import DerivaModel
|
|
47
54
|
from .database_model import DatabaseModel
|
|
@@ -957,7 +964,7 @@ class Dataset:
|
|
|
957
964
|
for the dataset.
|
|
958
965
|
"""
|
|
959
966
|
if (
|
|
960
|
-
execution_rid
|
|
967
|
+
execution_rid != DRY_RUN_RID
|
|
961
968
|
and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
|
|
962
969
|
):
|
|
963
970
|
raise DerivaMLException(f"RID {execution_rid} is not an execution")
|
deriva_ml/deriva_definitions.py
CHANGED
deriva_ml/execution.py
CHANGED
|
@@ -5,21 +5,30 @@ This module defined the Execution class which is used to interact with the state
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
7
|
from collections import defaultdict
|
|
8
|
+
from datetime import datetime
|
|
8
9
|
import json
|
|
9
10
|
import logging
|
|
10
11
|
import os
|
|
11
|
-
import shutil
|
|
12
|
-
from datetime import datetime
|
|
13
12
|
from pathlib import Path
|
|
14
|
-
from typing import Iterable, Any, Optional
|
|
15
13
|
|
|
16
|
-
from deriva.core import format_exception
|
|
17
14
|
from pydantic import validate_call, ConfigDict
|
|
15
|
+
import regex as re
|
|
18
16
|
import sys
|
|
19
|
-
|
|
17
|
+
import shutil
|
|
18
|
+
from typing import Iterable, Any, Optional
|
|
20
19
|
|
|
20
|
+
from deriva.core import format_exception
|
|
21
|
+
from deriva.core.datapath import DataPathException
|
|
22
|
+
from deriva.core.hatrac_store import HatracStore
|
|
21
23
|
from .deriva_definitions import ExecMetadataVocab
|
|
22
|
-
from .deriva_definitions import
|
|
24
|
+
from .deriva_definitions import (
|
|
25
|
+
RID,
|
|
26
|
+
Status,
|
|
27
|
+
FileUploadState,
|
|
28
|
+
DerivaMLException,
|
|
29
|
+
MLVocab,
|
|
30
|
+
DRY_RUN_RID,
|
|
31
|
+
)
|
|
23
32
|
from .deriva_ml_base import DerivaML, FeatureRecord
|
|
24
33
|
from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
|
|
25
34
|
from .dataset_bag import DatasetBag
|
|
@@ -167,7 +176,7 @@ class Execution:
|
|
|
167
176
|
self.workflow_rid = (
|
|
168
177
|
self._ml_object.add_workflow(self.configuration.workflow)
|
|
169
178
|
if not self._dry_run
|
|
170
|
-
else
|
|
179
|
+
else DRY_RUN_RID
|
|
171
180
|
)
|
|
172
181
|
else:
|
|
173
182
|
self.workflow_rid = self.configuration.workflow
|
|
@@ -195,10 +204,10 @@ class Execution:
|
|
|
195
204
|
schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
|
|
196
205
|
if reload:
|
|
197
206
|
self.execution_rid = reload
|
|
198
|
-
if self.execution_rid ==
|
|
207
|
+
if self.execution_rid == DRY_RUN_RID:
|
|
199
208
|
self._dry_run = True
|
|
200
209
|
elif self._dry_run:
|
|
201
|
-
self.execution_rid =
|
|
210
|
+
self.execution_rid = DRY_RUN_RID
|
|
202
211
|
else:
|
|
203
212
|
self.execution_rid = schema_path.Execution.insert(
|
|
204
213
|
[
|
|
@@ -625,9 +634,20 @@ class Execution:
|
|
|
625
634
|
with open(feature_file, "r") as feature_values:
|
|
626
635
|
entities = [json.loads(line.strip()) for line in feature_values]
|
|
627
636
|
# Update the asset columns in the feature and add to the catalog.
|
|
628
|
-
|
|
629
|
-
[
|
|
630
|
-
|
|
637
|
+
try:
|
|
638
|
+
self._ml_object.domain_path.tables[feature_table].insert(
|
|
639
|
+
[map_path(e) for e in entities]
|
|
640
|
+
)
|
|
641
|
+
except DataPathException as e:
|
|
642
|
+
if re.match(
|
|
643
|
+
rf'DETAIL: +Key +\("Execution", +"{target_table}", +"Feature_Name"\)=\(.*\) already exists',
|
|
644
|
+
e.message,
|
|
645
|
+
):
|
|
646
|
+
self._logger.info(
|
|
647
|
+
f"Skipping reload of feature values for {feature_table}"
|
|
648
|
+
)
|
|
649
|
+
else:
|
|
650
|
+
raise e
|
|
631
651
|
|
|
632
652
|
def _update_asset_execution_table(
|
|
633
653
|
self,
|
|
@@ -652,16 +672,27 @@ class Execution:
|
|
|
652
672
|
asset_exe = self._model.find_association(asset_table_name, "Execution")
|
|
653
673
|
asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]
|
|
654
674
|
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
675
|
+
try:
|
|
676
|
+
asset_exe_path.insert(
|
|
677
|
+
[
|
|
678
|
+
{
|
|
679
|
+
asset_table_name: asset_path.asset_rid,
|
|
680
|
+
"Execution": self.execution_rid,
|
|
681
|
+
"Asset_Role": asset_role,
|
|
682
|
+
}
|
|
683
|
+
for asset_path in asset_list
|
|
684
|
+
]
|
|
685
|
+
)
|
|
686
|
+
except DataPathException as e:
|
|
687
|
+
if re.match(
|
|
688
|
+
rf'DETAIL: +Key +\("{asset_table_name}", +"Execution"\)=\(.*\) already exists',
|
|
689
|
+
e.message,
|
|
690
|
+
):
|
|
691
|
+
self._logger.info(
|
|
692
|
+
f"Skipping reload of execution assocations for {asset_table_name}"
|
|
693
|
+
)
|
|
694
|
+
else:
|
|
695
|
+
raise e
|
|
665
696
|
|
|
666
697
|
# Now add in the type names via the asset_asset_type association table.
|
|
667
698
|
# Get the list of types for each file in the asset.
|
|
@@ -687,13 +718,24 @@ class Execution:
|
|
|
687
718
|
type_path = pb.schemas[asset_asset_type.schema.name].tables[
|
|
688
719
|
asset_asset_type.name
|
|
689
720
|
]
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
721
|
+
try:
|
|
722
|
+
type_path.insert(
|
|
723
|
+
[
|
|
724
|
+
{asset_table_name: asset.asset_rid, "Asset_Type": t}
|
|
725
|
+
for asset in asset_list
|
|
726
|
+
for t in asset_type_map[asset.file_name]
|
|
727
|
+
]
|
|
728
|
+
)
|
|
729
|
+
except DataPathException as e:
|
|
730
|
+
if re.match(
|
|
731
|
+
rf'DETAIL: +Key +\("{asset_table_name}", +"Asset_Type"\)=\(.*\) already exists',
|
|
732
|
+
e.message,
|
|
733
|
+
):
|
|
734
|
+
self._logger.info(
|
|
735
|
+
f"Skipping reload of execution asset types for {asset_table_name}"
|
|
736
|
+
)
|
|
737
|
+
else:
|
|
738
|
+
raise e
|
|
697
739
|
|
|
698
740
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
699
741
|
def asset_file_path(
|
deriva_ml/upload.py
CHANGED
|
@@ -74,17 +74,16 @@ feature_table_dir_regex = (
|
|
|
74
74
|
+ r"/(?P<schema>[-\w]+)/(?P<target_table>[-\w]+)/(?P<feature_name>[-\w]+)"
|
|
75
75
|
)
|
|
76
76
|
feature_value_regex = (
|
|
77
|
-
feature_table_dir_regex + r"/(?P=feature_name)[.](?P<
|
|
77
|
+
feature_table_dir_regex + r"/(?P=feature_name)[.](?P<ext>[(csv|json)]*)$"
|
|
78
78
|
)
|
|
79
79
|
feature_asset_dir_regex = feature_table_dir_regex + r"/asset/(?P<asset_table>[-\w]+)"
|
|
80
80
|
feature_asset_regex = (
|
|
81
|
-
feature_asset_dir_regex
|
|
82
|
-
+ r"/(?P<file_name>[A-Za-z0-9_-]+)[.](?P<file_ext>[a-z0-9]*)$"
|
|
81
|
+
feature_asset_dir_regex + r"/(?P<file>[A-Za-z0-9_-]+)[.](?P<ext>[a-z0-9]*)$"
|
|
83
82
|
)
|
|
84
83
|
|
|
85
84
|
asset_path_regex = exec_dir_regex + r"/asset/(?P<schema>[-\w]+)/(?P<asset_table>[-\w]*)"
|
|
86
85
|
|
|
87
|
-
asset_file_regex = r"(?P<
|
|
86
|
+
asset_file_regex = r"(?P<file>[-\w]+)[.](?P<ext>[a-z0-9]*)$"
|
|
88
87
|
|
|
89
88
|
table_regex = (
|
|
90
89
|
exec_dir_regex
|
|
@@ -211,16 +210,16 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
|
|
|
211
210
|
"Filename": "{file_name}",
|
|
212
211
|
}
|
|
213
212
|
| {c: f"{{{c}}}" for c in metadata_columns},
|
|
214
|
-
"file_pattern": asset_path, # Sets schema, asset_table,
|
|
213
|
+
"file_pattern": asset_path, # Sets schema, asset_table, file
|
|
215
214
|
"asset_type": "file",
|
|
216
215
|
"target_table": [schema, asset_table.name],
|
|
217
216
|
"checksum_types": ["sha256", "md5"],
|
|
218
217
|
"hatrac_options": {"versioned_urls": True},
|
|
219
218
|
"hatrac_templates": {
|
|
220
|
-
"hatrac_uri": f"/hatrac/{asset_table.name}/{{md5}}.{{file_name}}
|
|
221
|
-
"content-disposition": "filename*=UTF-8''{file_name}
|
|
219
|
+
"hatrac_uri": f"/hatrac/{asset_table.name}/{{md5}}.{{file_name}}",
|
|
220
|
+
"content-disposition": "filename*=UTF-8''{file_name}",
|
|
222
221
|
},
|
|
223
|
-
"record_query_template": "/entity/{target_table}/MD5={
|
|
222
|
+
"record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}",
|
|
224
223
|
}
|
|
225
224
|
|
|
226
225
|
|
|
@@ -249,14 +248,14 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
|
|
|
249
248
|
"target_table": ["{schema}", "{asset_table}"],
|
|
250
249
|
"file_pattern": asset_path_regex
|
|
251
250
|
+ "/"
|
|
252
|
-
+ asset_file_regex, # Sets schema, asset_table,
|
|
251
|
+
+ asset_file_regex, # Sets schema, asset_table, name, ext
|
|
253
252
|
"checksum_types": ["sha256", "md5"],
|
|
254
253
|
"hatrac_options": {"versioned_urls": True},
|
|
255
254
|
"hatrac_templates": {
|
|
256
|
-
"hatrac_uri": "/hatrac/{asset_table}/{md5}.{file_name}
|
|
257
|
-
"content-disposition": "filename*=UTF-8''{file_name}
|
|
255
|
+
"hatrac_uri": "/hatrac/{asset_table}/{md5}.{file_name}",
|
|
256
|
+
"content-disposition": "filename*=UTF-8''{file_name}",
|
|
258
257
|
},
|
|
259
|
-
"record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}
|
|
258
|
+
"record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}",
|
|
260
259
|
},
|
|
261
260
|
# {
|
|
262
261
|
# Upload the records into a table
|
|
@@ -1,27 +1,27 @@
|
|
|
1
1
|
deriva_ml/__init__.py,sha256=r1Z9N5vtZkAET7emqhpAx2bf_xJUp5wHOc4_DIplsG8,1082
|
|
2
2
|
deriva_ml/database_model.py,sha256=lMbAEqn4n0m7h_JstMX_LX9gbvBIEydG3sRilPn3eLU,14885
|
|
3
|
-
deriva_ml/dataset.py,sha256=
|
|
3
|
+
deriva_ml/dataset.py,sha256=OyWUKWnYeP0ctimSBQ4em-uJrzCNOohx4GPT2uIl6R4,60649
|
|
4
4
|
deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
|
|
5
5
|
deriva_ml/dataset_bag.py,sha256=yS8oYVshfFtRDyhGPRqtbvxjyd3ZFF29lrB783OP4vM,11849
|
|
6
6
|
deriva_ml/demo_catalog.py,sha256=9Qo3JD4bUIwnL3ngPctc2QBeWApvMR_5UyaK9ockTrY,11536
|
|
7
|
-
deriva_ml/deriva_definitions.py,sha256=
|
|
7
|
+
deriva_ml/deriva_definitions.py,sha256=HLaQ0zWO-Yd17Yp8hvqFSGkvjANJ52Ws5yHCVYMhfGA,8918
|
|
8
8
|
deriva_ml/deriva_ml_base.py,sha256=rrImShp1RXvMuXVLft5GfTnxf_PfF1LONHgV1Ee_E9I,46517
|
|
9
9
|
deriva_ml/deriva_model.py,sha256=wytGCAHutiUaRfnRKr80Ks_P6ci0_wXRU3vq3lthfYU,13260
|
|
10
|
-
deriva_ml/execution.py,sha256=
|
|
10
|
+
deriva_ml/execution.py,sha256=xYS4wYRYcksNjUZ-Rwys_H4jZchW3YVu-uWg7ySJMjk,37510
|
|
11
11
|
deriva_ml/execution_configuration.py,sha256=XQeXzPz9Gh_AGa_iYW8zF95niwHed3ojv4gnibB0thA,4082
|
|
12
12
|
deriva_ml/execution_environment.py,sha256=bCRKrCELDbGQDo7_FKfw7e8iMzVjSRZK3baKkqH5-_0,3264
|
|
13
13
|
deriva_ml/feature.py,sha256=07g0uSrhumdopJluWuWSRMrzagaikAOihqB09bzXBP4,5475
|
|
14
14
|
deriva_ml/history.py,sha256=qTDLDs8Ow_6r7mDO0gZm0Fg81SWKOAgtCU5pzZoDRgM,2828
|
|
15
15
|
deriva_ml/test_functions.py,sha256=-eqLHjjCQCLBNAr1ofbZekNiCOfMISSACRxT_YHER8I,4396
|
|
16
|
-
deriva_ml/upload.py,sha256=
|
|
16
|
+
deriva_ml/upload.py,sha256=gHTGXAVlf56EwNzmw5zY0gbBf8h08eU2q2GBbb2FdVc,16087
|
|
17
17
|
deriva_ml/schema_setup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
18
|
deriva_ml/schema_setup/annotations.py,sha256=v0gTpmWYxRqsQ-bcnQzsr8WowGv2pi9pZUsO3WWnu1U,9528
|
|
19
19
|
deriva_ml/schema_setup/create_schema.py,sha256=hNMc-v5tferd0UjfdB6nBw7Rc_o-Mg6NkPqQGie9YOw,11700
|
|
20
20
|
deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
|
|
21
21
|
deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
|
|
22
|
-
deriva_ml-1.12.
|
|
23
|
-
deriva_ml-1.12.
|
|
24
|
-
deriva_ml-1.12.
|
|
25
|
-
deriva_ml-1.12.
|
|
26
|
-
deriva_ml-1.12.
|
|
27
|
-
deriva_ml-1.12.
|
|
22
|
+
deriva_ml-1.12.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
23
|
+
deriva_ml-1.12.3.dist-info/METADATA,sha256=CNoKyLpxijU8MrLj4VQzrOQLAU3oIT232DF9RI-eFbw,974
|
|
24
|
+
deriva_ml-1.12.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
|
25
|
+
deriva_ml-1.12.3.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
|
|
26
|
+
deriva_ml-1.12.3.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
|
|
27
|
+
deriva_ml-1.12.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|