deriva-ml 1.12.1__tar.gz → 1.12.3__tar.gz

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (75)
  1. {deriva_ml-1.12.1/src/deriva_ml.egg-info → deriva_ml-1.12.3}/PKG-INFO +1 -1
  2. deriva_ml-1.12.3/docs/user-guide/file-assets.md +3 -0
  3. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/dataset.py +9 -2
  4. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/deriva_definitions.py +1 -0
  5. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/execution.py +71 -29
  6. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/upload.py +11 -12
  7. {deriva_ml-1.12.1 → deriva_ml-1.12.3/src/deriva_ml.egg-info}/PKG-INFO +1 -1
  8. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml.egg-info/SOURCES.txt +1 -0
  9. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/.github/workflows/publish-docs.yml +0 -0
  10. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/.gitignore +0 -0
  11. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/LICENSE +0 -0
  12. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/README.md +0 -0
  13. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/.DS_Store +0 -0
  14. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/Notebooks/DerivaML Create Notes.ipynb +0 -0
  15. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/Notebooks/DerivaML Dataset.ipynb +0 -0
  16. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/Notebooks/DerivaML Execution.ipynb +0 -0
  17. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/Notebooks/DerivaML Features.ipynb +0 -0
  18. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/Notebooks/DerivaML Vocabulary.ipynb +0 -0
  19. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/assets/ERD.png +0 -0
  20. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/assets/Launcher.png +0 -0
  21. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/assets/copy_minid.png +0 -0
  22. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/assets/deriva-logo.png +0 -0
  23. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/assets/deriva-ml.pdf +0 -0
  24. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/assets/sharing-at-home.pdf +0 -0
  25. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/code-docs/dataset.md +0 -0
  26. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/code-docs/dataset_aux_classes.md +0 -0
  27. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/code-docs/dataset_bag.md +0 -0
  28. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/code-docs/deriva_ml_base.md +0 -0
  29. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/code-docs/deriva_model.md +0 -0
  30. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/code-docs/execution.md +0 -0
  31. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/code-docs/execution_configuration.md +0 -0
  32. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/code-docs/feature.md +0 -0
  33. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/code-docs/upload.md +0 -0
  34. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/deriva_ml_structure.md +0 -0
  35. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/index.md +0 -0
  36. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/release-notes.md +0 -0
  37. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/user-guide/datasets.md +0 -0
  38. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/user-guide/execution-configuration.md +0 -0
  39. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/user-guide/identifiers.md +0 -0
  40. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/user-guide/install.md +0 -0
  41. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/docs/user-guide/ml_workflow_instruction.md +0 -0
  42. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/mkdocs.yml +0 -0
  43. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/pyproject.toml +0 -0
  44. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/release.sh +0 -0
  45. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/setup.cfg +0 -0
  46. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/__init__.py +0 -0
  47. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/database_model.py +0 -0
  48. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/dataset_aux_classes.py +0 -0
  49. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/dataset_bag.py +0 -0
  50. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/demo_catalog.py +0 -0
  51. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/deriva_ml_base.py +0 -0
  52. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/deriva_model.py +0 -0
  53. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/execution_configuration.py +0 -0
  54. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/execution_environment.py +0 -0
  55. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/feature.py +0 -0
  56. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/history.py +0 -0
  57. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/schema_setup/__init__.py +0 -0
  58. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/schema_setup/annotations.py +0 -0
  59. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/schema_setup/create_schema.py +0 -0
  60. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/schema_setup/policy.json +0 -0
  61. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/schema_setup/table_comments_utils.py +0 -0
  62. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml/test_functions.py +0 -0
  63. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
  64. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml.egg-info/entry_points.txt +0 -0
  65. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml.egg-info/requires.txt +0 -0
  66. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/src/deriva_ml.egg-info/top_level.txt +0 -0
  67. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/tests/__init__.py +0 -0
  68. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/tests/derivaml_test.py +0 -0
  69. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/tests/runner.py +0 -0
  70. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/tests/test_basic_tables.py +0 -0
  71. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/tests/test_dataset.py +0 -0
  72. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/tests/test_download.py +0 -0
  73. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/tests/test_execution.py +0 -0
  74. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/tests/test_features.py +0 -0
  75. {deriva_ml-1.12.1 → deriva_ml-1.12.3}/tests/test_upload.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deriva-ml
3
- Version: 1.12.1
3
+ Version: 1.12.3
4
4
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
5
5
  Author-email: ISRD <isrd-dev@isi.edu>
6
6
  Requires-Python: >=3.10
@@ -0,0 +1,3 @@
1
+ # File Assets
2
+
3
+
@@ -41,7 +41,14 @@ from tempfile import TemporaryDirectory, NamedTemporaryFile
41
41
  from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
42
42
 
43
43
  from deriva_ml import DatasetBag
44
- from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
44
+ from .deriva_definitions import (
45
+ ML_SCHEMA,
46
+ DerivaMLException,
47
+ MLVocab,
48
+ Status,
49
+ RID,
50
+ DRY_RUN_RID,
51
+ )
45
52
  from .history import iso_to_snap
46
53
  from .deriva_model import DerivaModel
47
54
  from .database_model import DatabaseModel
@@ -957,7 +964,7 @@ class Dataset:
957
964
  for the dataset.
958
965
  """
959
966
  if (
960
- execution_rid
967
+ execution_rid != DRY_RUN_RID
961
968
  and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
962
969
  ):
963
970
  raise DerivaMLException(f"RID {execution_rid} is not an execution")
@@ -21,6 +21,7 @@ from pydantic import (
21
21
  from socket import gethostname
22
22
 
23
23
  ML_SCHEMA = "deriva-ml"
24
+ DRY_RUN_RID = "0000"
24
25
 
25
26
  # We are going to use schema as a field name and this collides with method in pydantic base class
26
27
  warnings.filterwarnings(
@@ -5,21 +5,30 @@ This module defined the Execution class which is used to interact with the state
5
5
  from __future__ import annotations
6
6
 
7
7
  from collections import defaultdict
8
+ from datetime import datetime
8
9
  import json
9
10
  import logging
10
11
  import os
11
- import shutil
12
- from datetime import datetime
13
12
  from pathlib import Path
14
- from typing import Iterable, Any, Optional
15
13
 
16
- from deriva.core import format_exception
17
14
  from pydantic import validate_call, ConfigDict
15
+ import regex as re
18
16
  import sys
19
- from deriva.core.hatrac_store import HatracStore
17
+ import shutil
18
+ from typing import Iterable, Any, Optional
20
19
 
20
+ from deriva.core import format_exception
21
+ from deriva.core.datapath import DataPathException
22
+ from deriva.core.hatrac_store import HatracStore
21
23
  from .deriva_definitions import ExecMetadataVocab
22
- from .deriva_definitions import RID, Status, FileUploadState, DerivaMLException, MLVocab
24
+ from .deriva_definitions import (
25
+ RID,
26
+ Status,
27
+ FileUploadState,
28
+ DerivaMLException,
29
+ MLVocab,
30
+ DRY_RUN_RID,
31
+ )
23
32
  from .deriva_ml_base import DerivaML, FeatureRecord
24
33
  from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
25
34
  from .dataset_bag import DatasetBag
@@ -167,7 +176,7 @@ class Execution:
167
176
  self.workflow_rid = (
168
177
  self._ml_object.add_workflow(self.configuration.workflow)
169
178
  if not self._dry_run
170
- else "0000"
179
+ else DRY_RUN_RID
171
180
  )
172
181
  else:
173
182
  self.workflow_rid = self.configuration.workflow
@@ -195,10 +204,10 @@ class Execution:
195
204
  schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
196
205
  if reload:
197
206
  self.execution_rid = reload
198
- if self.execution_rid == "0000":
207
+ if self.execution_rid == DRY_RUN_RID:
199
208
  self._dry_run = True
200
209
  elif self._dry_run:
201
- self.execution_rid = "0000"
210
+ self.execution_rid = DRY_RUN_RID
202
211
  else:
203
212
  self.execution_rid = schema_path.Execution.insert(
204
213
  [
@@ -625,9 +634,20 @@ class Execution:
625
634
  with open(feature_file, "r") as feature_values:
626
635
  entities = [json.loads(line.strip()) for line in feature_values]
627
636
  # Update the asset columns in the feature and add to the catalog.
628
- self._ml_object.domain_path.tables[feature_table].insert(
629
- [map_path(e) for e in entities]
630
- )
637
+ try:
638
+ self._ml_object.domain_path.tables[feature_table].insert(
639
+ [map_path(e) for e in entities]
640
+ )
641
+ except DataPathException as e:
642
+ if re.match(
643
+ rf'DETAIL: +Key +\("Execution", +"{target_table}", +"Feature_Name"\)=\(.*\) already exists',
644
+ e.message,
645
+ ):
646
+ self._logger.info(
647
+ f"Skipping reload of feature values for {feature_table}"
648
+ )
649
+ else:
650
+ raise e
631
651
 
632
652
  def _update_asset_execution_table(
633
653
  self,
@@ -652,16 +672,27 @@ class Execution:
652
672
  asset_exe = self._model.find_association(asset_table_name, "Execution")
653
673
  asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]
654
674
 
655
- asset_exe_path.insert(
656
- [
657
- {
658
- asset_table_name: asset_path.asset_rid,
659
- "Execution": self.execution_rid,
660
- "Asset_Role": asset_role,
661
- }
662
- for asset_path in asset_list
663
- ]
664
- )
675
+ try:
676
+ asset_exe_path.insert(
677
+ [
678
+ {
679
+ asset_table_name: asset_path.asset_rid,
680
+ "Execution": self.execution_rid,
681
+ "Asset_Role": asset_role,
682
+ }
683
+ for asset_path in asset_list
684
+ ]
685
+ )
686
+ except DataPathException as e:
687
+ if re.match(
688
+ rf'DETAIL: +Key +\("{asset_table_name}", +"Execution"\)=\(.*\) already exists',
689
+ e.message,
690
+ ):
691
+ self._logger.info(
692
+ f"Skipping reload of execution assocations for {asset_table_name}"
693
+ )
694
+ else:
695
+ raise e
665
696
 
666
697
  # Now add in the type names via the asset_asset_type association table.
667
698
  # Get the list of types for each file in the asset.
@@ -687,13 +718,24 @@ class Execution:
687
718
  type_path = pb.schemas[asset_asset_type.schema.name].tables[
688
719
  asset_asset_type.name
689
720
  ]
690
- type_path.insert(
691
- [
692
- {asset_table_name: asset.asset_rid, "Asset_Type": t}
693
- for asset in asset_list
694
- for t in asset_type_map[asset.file_name]
695
- ]
696
- )
721
+ try:
722
+ type_path.insert(
723
+ [
724
+ {asset_table_name: asset.asset_rid, "Asset_Type": t}
725
+ for asset in asset_list
726
+ for t in asset_type_map[asset.file_name]
727
+ ]
728
+ )
729
+ except DataPathException as e:
730
+ if re.match(
731
+ rf'DETAIL: +Key +\("{asset_table_name}", +"Asset_Type"\)=\(.*\) already exists',
732
+ e.message,
733
+ ):
734
+ self._logger.info(
735
+ f"Skipping reload of execution asset types for {asset_table_name}"
736
+ )
737
+ else:
738
+ raise e
697
739
 
698
740
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
699
741
  def asset_file_path(
@@ -74,17 +74,16 @@ feature_table_dir_regex = (
74
74
  + r"/(?P<schema>[-\w]+)/(?P<target_table>[-\w]+)/(?P<feature_name>[-\w]+)"
75
75
  )
76
76
  feature_value_regex = (
77
- feature_table_dir_regex + r"/(?P=feature_name)[.](?P<file_ext>[(csv|json)]*)$"
77
+ feature_table_dir_regex + r"/(?P=feature_name)[.](?P<ext>[(csv|json)]*)$"
78
78
  )
79
79
  feature_asset_dir_regex = feature_table_dir_regex + r"/asset/(?P<asset_table>[-\w]+)"
80
80
  feature_asset_regex = (
81
- feature_asset_dir_regex
82
- + r"/(?P<file_name>[A-Za-z0-9_-]+)[.](?P<file_ext>[a-z0-9]*)$"
81
+ feature_asset_dir_regex + r"/(?P<file>[A-Za-z0-9_-]+)[.](?P<ext>[a-z0-9]*)$"
83
82
  )
84
83
 
85
84
  asset_path_regex = exec_dir_regex + r"/asset/(?P<schema>[-\w]+)/(?P<asset_table>[-\w]*)"
86
85
 
87
- asset_file_regex = r"(?P<file_name>[-\w]+)[.](?P<file_ext>[a-z0-9]*)$"
86
+ asset_file_regex = r"(?P<file>[-\w]+)[.](?P<ext>[a-z0-9]*)$"
88
87
 
89
88
  table_regex = (
90
89
  exec_dir_regex
@@ -211,16 +210,16 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
211
210
  "Filename": "{file_name}",
212
211
  }
213
212
  | {c: f"{{{c}}}" for c in metadata_columns},
214
- "file_pattern": asset_path, # Sets schema, asset_table, file_name, file_ext
213
+ "file_pattern": asset_path, # Sets schema, asset_table, file
215
214
  "asset_type": "file",
216
215
  "target_table": [schema, asset_table.name],
217
216
  "checksum_types": ["sha256", "md5"],
218
217
  "hatrac_options": {"versioned_urls": True},
219
218
  "hatrac_templates": {
220
- "hatrac_uri": f"/hatrac/{asset_table.name}/{{md5}}.{{file_name}}.{{file_ext}}",
221
- "content-disposition": "filename*=UTF-8''{file_name}.{file_ext}",
219
+ "hatrac_uri": f"/hatrac/{asset_table.name}/{{md5}}.{{file_name}}",
220
+ "content-disposition": "filename*=UTF-8''{file_name}",
222
221
  },
223
- "record_query_template": "/entity/{target_table}/MD5={{md5}}&Filename={file_name}.{file_ext}",
222
+ "record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}",
224
223
  }
225
224
 
226
225
 
@@ -249,14 +248,14 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
249
248
  "target_table": ["{schema}", "{asset_table}"],
250
249
  "file_pattern": asset_path_regex
251
250
  + "/"
252
- + asset_file_regex, # Sets schema, asset_table, file_name, file_ext
251
+ + asset_file_regex, # Sets schema, asset_table, name, ext
253
252
  "checksum_types": ["sha256", "md5"],
254
253
  "hatrac_options": {"versioned_urls": True},
255
254
  "hatrac_templates": {
256
- "hatrac_uri": "/hatrac/{asset_table}/{md5}.{file_name}.{file_ext}",
257
- "content-disposition": "filename*=UTF-8''{file_name}.{file_ext}",
255
+ "hatrac_uri": "/hatrac/{asset_table}/{md5}.{file_name}",
256
+ "content-disposition": "filename*=UTF-8''{file_name}",
258
257
  },
259
- "record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}.{file_ext}",
258
+ "record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}",
260
259
  },
261
260
  # {
262
261
  # Upload the records into a table
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deriva-ml
3
- Version: 1.12.1
3
+ Version: 1.12.3
4
4
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
5
5
  Author-email: ISRD <isrd-dev@isi.edu>
6
6
  Requires-Python: >=3.10
@@ -31,6 +31,7 @@ docs/code-docs/feature.md
31
31
  docs/code-docs/upload.md
32
32
  docs/user-guide/datasets.md
33
33
  docs/user-guide/execution-configuration.md
34
+ docs/user-guide/file-assets.md
34
35
  docs/user-guide/identifiers.md
35
36
  docs/user-guide/install.md
36
37
  docs/user-guide/ml_workflow_instruction.md
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes