deriva-ml 1.8.5__py3-none-any.whl → 1.8.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +9 -0
- deriva_ml/deriva_ml_base.py +98 -73
- {deriva_ml-1.8.5.dist-info → deriva_ml-1.8.10.dist-info}/METADATA +3 -2
- {deriva_ml-1.8.5.dist-info → deriva_ml-1.8.10.dist-info}/RECORD +8 -9
- {deriva_ml-1.8.5.dist-info → deriva_ml-1.8.10.dist-info}/WHEEL +1 -1
- deriva_ml/VERSION.py +0 -1
- {deriva_ml-1.8.5.dist-info → deriva_ml-1.8.10.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.8.5.dist-info → deriva_ml-1.8.10.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.8.5.dist-info → deriva_ml-1.8.10.dist-info}/top_level.txt +0 -0
deriva_ml/__init__.py
CHANGED
|
@@ -41,3 +41,12 @@ from .execution_configuration import (
|
|
|
41
41
|
Workflow,
|
|
42
42
|
)
|
|
43
43
|
from .execution import Execution
|
|
44
|
+
|
|
45
|
+
from importlib.metadata import version, PackageNotFoundError
|
|
46
|
+
|
|
47
|
+
try:
    # Resolve the version from the installed distribution's metadata.
    __version__ = version("deriva_ml")
except PackageNotFoundError:
    # Package is not installed (e.g. imported straight from a source checkout).
    # Still bind __version__ so that `deriva_ml.__version__` never raises
    # AttributeError; the local-version suffix marks the value as a placeholder.
    __version__ = "0.0.0+unknown"
|
|
52
|
+
|
deriva_ml/deriva_ml_base.py
CHANGED
|
@@ -13,17 +13,15 @@ from __future__ import annotations
|
|
|
13
13
|
import getpass
|
|
14
14
|
import logging
|
|
15
15
|
from datetime import datetime
|
|
16
|
-
import hashlib
|
|
17
16
|
from itertools import chain
|
|
18
17
|
import inspect
|
|
18
|
+
import setuptools_scm
|
|
19
19
|
from pathlib import Path
|
|
20
20
|
import requests
|
|
21
|
-
from setuptools_git_versioning import get_latest_file_commit
|
|
22
21
|
import subprocess
|
|
23
22
|
import shutil
|
|
24
23
|
from typing import Optional, Any, Iterable, TYPE_CHECKING
|
|
25
24
|
from deriva.core import (
|
|
26
|
-
ErmrestCatalog,
|
|
27
25
|
get_credential,
|
|
28
26
|
urlquote,
|
|
29
27
|
DEFAULT_SESSION_CONFIG,
|
|
@@ -144,6 +142,8 @@ class DerivaML(Dataset):
|
|
|
144
142
|
|
|
145
143
|
# Initialize dataset class.
|
|
146
144
|
super().__init__(self.model, self.cache_dir)
|
|
145
|
+
self._logger = logging.getLogger("deriva_ml")
|
|
146
|
+
self._logger.setLevel(logging_level)
|
|
147
147
|
|
|
148
148
|
self.host_name = hostname
|
|
149
149
|
self.catalog_id = catalog_id
|
|
@@ -151,37 +151,12 @@ class DerivaML(Dataset):
|
|
|
151
151
|
self.version = model_version
|
|
152
152
|
self.configuration = None
|
|
153
153
|
self._execution: Optional[Execution] = None
|
|
154
|
-
self.
|
|
155
|
-
|
|
156
|
-
from IPython import get_ipython
|
|
157
|
-
|
|
158
|
-
ipython = get_ipython()
|
|
159
|
-
# Check if running in Jupyter's ZMQ kernel (used by notebooks)
|
|
160
|
-
if ipython is not None and "IPKernelApp" in ipython.config:
|
|
161
|
-
self._notebook = Path(ipython.user_ns.get("__session__"))
|
|
162
|
-
# Check if running in Jupyter's ZMQ kernel (used by notebooks)
|
|
163
|
-
try:
|
|
164
|
-
if subprocess.run(
|
|
165
|
-
[shutil.which("nbstripout"), "--is-installed"],
|
|
166
|
-
check=False,
|
|
167
|
-
capture_output=True,
|
|
168
|
-
).returncode:
|
|
169
|
-
self._logger.warn(
|
|
170
|
-
"nbstripout is not installed in repository. Please run nbstripout --install"
|
|
171
|
-
)
|
|
172
|
-
except subprocess.CalledProcessError:
|
|
173
|
-
self._logger.error("nbstripout is not found.")
|
|
174
|
-
|
|
175
|
-
except (ImportError, AttributeError):
|
|
176
|
-
pass
|
|
177
|
-
|
|
154
|
+
self._script_path, self._is_notebook = self._get_python_script()
|
|
155
|
+
self._notebook = self._get_python_notebook()
|
|
178
156
|
self.domain_schema = self.model.domain_schema
|
|
179
157
|
self.project_name = project_name or self.domain_schema
|
|
180
|
-
|
|
181
158
|
self.start_time = datetime.now()
|
|
182
159
|
self.status = Status.pending.value
|
|
183
|
-
self._logger = logging.getLogger("deriva_ml")
|
|
184
|
-
self._logger.setLevel(logging_level)
|
|
185
160
|
|
|
186
161
|
logging.basicConfig(
|
|
187
162
|
level=logging_level,
|
|
@@ -204,6 +179,65 @@ class DerivaML(Dataset):
|
|
|
204
179
|
except (AttributeError, requests.HTTPError):
|
|
205
180
|
pass
|
|
206
181
|
|
|
182
|
+
def _get_python_notebook(self) -> Path | None:
|
|
183
|
+
"""Figure out if you are running in a Jupyter notebook
|
|
184
|
+
|
|
185
|
+
Returns:
|
|
186
|
+
A Path to the notebook file that is currently being executed.
|
|
187
|
+
"""
|
|
188
|
+
notebook = None
|
|
189
|
+
try:
|
|
190
|
+
ipython = get_ipython()
|
|
191
|
+
# Check if running in Jupyter's ZMQ kernel (used by notebooks)
|
|
192
|
+
if ipython is not None and "IPKernelApp" in ipython.config:
|
|
193
|
+
notebook = Path(ipython.user_ns.get("__session__"))
|
|
194
|
+
# Check if running in Jupyter's ZMQ kernel (used by notebooks)
|
|
195
|
+
try:
|
|
196
|
+
if subprocess.run(
|
|
197
|
+
[shutil.which("nbstripout"), "--is-installed"],
|
|
198
|
+
check=False,
|
|
199
|
+
capture_output=True,
|
|
200
|
+
).returncode:
|
|
201
|
+
self._logger.warning(
|
|
202
|
+
"nbstripout is not installed in repository. Please run nbstripout --install"
|
|
203
|
+
)
|
|
204
|
+
except subprocess.CalledProcessError:
|
|
205
|
+
self._logger.error("nbstripout is not found.")
|
|
206
|
+
except (ImportError, AttributeError):
|
|
207
|
+
pass
|
|
208
|
+
return notebook
|
|
209
|
+
|
|
210
|
+
def _get_python_script(self) -> tuple[Path, bool]:
|
|
211
|
+
"""Return the path to the currently executing script"""
|
|
212
|
+
is_notebook = False
|
|
213
|
+
if filename := self._get_python_notebook():
|
|
214
|
+
is_notebook = True
|
|
215
|
+
else:
|
|
216
|
+
stack = inspect.stack()
|
|
217
|
+
if len(stack) > 1:
|
|
218
|
+
filename = Path(
|
|
219
|
+
stack[2].filename
|
|
220
|
+
) # Get the caller's filename, which is two up the stack from here.
|
|
221
|
+
else:
|
|
222
|
+
raise DerivaMLException(
|
|
223
|
+
f"Looking for caller failed"
|
|
224
|
+
) # Stack is too shallow
|
|
225
|
+
return filename, is_notebook
|
|
226
|
+
|
|
227
|
+
def _get_git_root(self):
|
|
228
|
+
try:
|
|
229
|
+
result = subprocess.run(
|
|
230
|
+
["git", "rev-parse", "--show-toplevel"],
|
|
231
|
+
cwd=self._script_path.parent,
|
|
232
|
+
stdout=subprocess.PIPE,
|
|
233
|
+
stderr=subprocess.DEVNULL,
|
|
234
|
+
text=True,
|
|
235
|
+
check=True
|
|
236
|
+
)
|
|
237
|
+
return result.stdout.strip()
|
|
238
|
+
except subprocess.CalledProcessError:
|
|
239
|
+
return None # Not in a git repository
|
|
240
|
+
|
|
207
241
|
@staticmethod
|
|
208
242
|
def _get_session_config():
|
|
209
243
|
""" """
|
|
@@ -227,6 +261,9 @@ class DerivaML(Dataset):
|
|
|
227
261
|
"""Get a new instance of a pathBuilder object."""
|
|
228
262
|
return self.catalog.getPathBuilder()
|
|
229
263
|
|
|
264
|
+
def get_version(self) -> str:
    """Return the version that setuptools_scm derives for the script's git repository.

    The repository root handed to setuptools_scm is whatever
    ``_get_git_root()`` reports.

    NOTE(review): ``_get_git_root()`` can return None when the script is not
    inside a git repository; how setuptools_scm treats ``root=None`` is not
    visible here -- confirm.
    """
    repo_root = self._get_git_root()
    return setuptools_scm.get_version(root=repo_root)
|
|
266
|
+
|
|
230
267
|
@property
|
|
231
268
|
def domain_path(self):
|
|
232
269
|
"""Get a new instance of a pathBuilder object to the domain schema"""
|
|
@@ -1028,27 +1065,25 @@ class DerivaML(Dataset):
|
|
|
1028
1065
|
"""
|
|
1029
1066
|
# Make sure type is correct.
|
|
1030
1067
|
self.lookup_term(MLVocab.workflow_type, workflow_type)
|
|
1031
|
-
|
|
1068
|
+
github_url, is_dirty = self._github_url()
|
|
1032
1069
|
|
|
1033
1070
|
if is_dirty:
|
|
1034
1071
|
self._logger.warning(
|
|
1035
|
-
f"File {
|
|
1072
|
+
f"File {self._script_path} has been modified since last commit. Consider commiting before executing"
|
|
1036
1073
|
)
|
|
1037
1074
|
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
sha256_hash.update(f.read())
|
|
1051
|
-
checksum = "SHA-256:" + sha256_hash.hexdigest()
|
|
1075
|
+
# If you are in a notebook, strip out the outputs before computing the checksum.
|
|
1076
|
+
cmd = (
|
|
1077
|
+
f"nbstripout {self._script_path} | git hash-object --stdin"
|
|
1078
|
+
if self._is_notebook
|
|
1079
|
+
else f"git hash-object {self._script_path}"
|
|
1080
|
+
)
|
|
1081
|
+
checksum = subprocess.run(
|
|
1082
|
+
cmd,
|
|
1083
|
+
capture_output=True,
|
|
1084
|
+
text=True,
|
|
1085
|
+
check=True,
|
|
1086
|
+
).stdout.strip()
|
|
1052
1087
|
|
|
1053
1088
|
workflow = Workflow(
|
|
1054
1089
|
name=name,
|
|
@@ -1059,54 +1094,36 @@ class DerivaML(Dataset):
|
|
|
1059
1094
|
)
|
|
1060
1095
|
return self.add_workflow(workflow) if create else None
|
|
1061
1096
|
|
|
1062
|
-
def _github_url(self) -> tuple[
|
|
1097
|
+
def _github_url(self) -> tuple[str, bool]:
|
|
1063
1098
|
"""Return a GitHUB URL for the latest commit of the script from which this routine is called.
|
|
1064
1099
|
|
|
1065
1100
|
This routine is used to be called from a script or notebook (e.g. python -m file). It assumes that
|
|
1066
1101
|
the file is in a gitHUB repository and commited. It returns a URL to the last commited version of this
|
|
1067
1102
|
file in GitHUB.
|
|
1068
1103
|
|
|
1069
|
-
Returns: A tuple with the
|
|
1104
|
+
Returns: A tuple with the gethub_url and a boolean to indicated if uncommited changes
|
|
1070
1105
|
have been made to the file.
|
|
1071
1106
|
|
|
1072
1107
|
"""
|
|
1073
1108
|
|
|
1074
|
-
# Get the name of the script that is calling this function.
|
|
1075
|
-
if self._notebook:
|
|
1076
|
-
# Try to get the __session__ variable from the user namespace.
|
|
1077
|
-
filename = Path("").absolute().parent / self._notebook
|
|
1078
|
-
else:
|
|
1079
|
-
stack = inspect.stack()
|
|
1080
|
-
if len(stack) > 1:
|
|
1081
|
-
filename = Path(
|
|
1082
|
-
stack[2].filename
|
|
1083
|
-
) # Get the caller's filename, which is two up the stack from here.
|
|
1084
|
-
else:
|
|
1085
|
-
raise DerivaMLException(
|
|
1086
|
-
f"Looking for caller failed"
|
|
1087
|
-
) # Stack is too shallow
|
|
1088
|
-
|
|
1089
1109
|
# Get repo URL from local github repo.
|
|
1090
1110
|
try:
|
|
1091
1111
|
result = subprocess.run(
|
|
1092
|
-
["git", "remote", "get-url", "origin"], capture_output=True, text=True
|
|
1112
|
+
["git", "remote", "get-url", "origin"], capture_output=True, text=True,
|
|
1113
|
+
cwd=self._script_path.parent,
|
|
1093
1114
|
)
|
|
1094
1115
|
github_url = result.stdout.strip().removesuffix(".git")
|
|
1095
1116
|
except subprocess.CalledProcessError:
|
|
1096
1117
|
raise DerivaMLException(f"No GIT remote found")
|
|
1097
1118
|
|
|
1098
1119
|
# Find the root directory for the repository
|
|
1099
|
-
repo_root =
|
|
1100
|
-
while repo_root != repo_root.root:
|
|
1101
|
-
if (repo_root / ".git").exists():
|
|
1102
|
-
break
|
|
1103
|
-
else:
|
|
1104
|
-
repo_root = repo_root.parent
|
|
1120
|
+
repo_root = self._get_git_root()
|
|
1105
1121
|
|
|
1106
1122
|
# Now check to see if file has been modified since the last commit.
|
|
1107
1123
|
try:
|
|
1108
1124
|
result = subprocess.run(
|
|
1109
1125
|
["git", "status", "--porcelain"],
|
|
1126
|
+
cwd=self._script_path.parent,
|
|
1110
1127
|
capture_output=True,
|
|
1111
1128
|
text=True,
|
|
1112
1129
|
check=True,
|
|
@@ -1117,9 +1134,17 @@ class DerivaML(Dataset):
|
|
|
1117
1134
|
except subprocess.CalledProcessError:
|
|
1118
1135
|
is_dirty = False # If Git command fails, assume no changes
|
|
1119
1136
|
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1137
|
+
"""Get SHA-1 hash of latest commit of the file in the repository"""
|
|
1138
|
+
result = subprocess.run(
|
|
1139
|
+
["git", "log", "-n", "1", "--pretty=format:%H" "--", self._script_path],
|
|
1140
|
+
cwd=self._script_path.parent,
|
|
1141
|
+
capture_output=True,
|
|
1142
|
+
text=True,
|
|
1143
|
+
check=True,
|
|
1144
|
+
)
|
|
1145
|
+
sha = result.stdout.strip()
|
|
1146
|
+
url = f"{github_url}/blob/{sha}/{self._script_path.relative_to(repo_root)}"
|
|
1147
|
+
return url, is_dirty
|
|
1123
1148
|
|
|
1124
1149
|
# @validate_call
|
|
1125
1150
|
def create_execution(self, configuration: ExecutionConfiguration) -> "Execution":
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: deriva-ml
|
|
3
|
-
Version: 1.8.5
|
|
3
|
+
Version: 1.8.10
|
|
4
4
|
Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
|
|
5
5
|
Author-email: ISRD <isrd-dev@isi.edu>
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -11,7 +11,8 @@ Requires-Dist: pandas
|
|
|
11
11
|
Requires-Dist: regex~=2024.7.24
|
|
12
12
|
Requires-Dist: pydantic>=2.10.6
|
|
13
13
|
Requires-Dist: semver>3.0.0
|
|
14
|
-
Requires-Dist: setuptools
|
|
14
|
+
Requires-Dist: setuptools>=64
|
|
15
|
+
Requires-Dist: setuptools-scm<=6.0
|
|
15
16
|
Requires-Dist: nbstripout
|
|
16
17
|
Dynamic: license-file
|
|
17
18
|
|
|
@@ -1,12 +1,11 @@
|
|
|
1
|
-
deriva_ml/
|
|
2
|
-
deriva_ml/__init__.py,sha256=DyHiqklSer7q7oPGAemkzg5Qcq2swMZf9ALwJhGf6Jo,905
|
|
1
|
+
deriva_ml/__init__.py,sha256=r1Z9N5vtZkAET7emqhpAx2bf_xJUp5wHOc4_DIplsG8,1082
|
|
3
2
|
deriva_ml/database_model.py,sha256=uhoyVyd8MQmY8J9ovCH8fjxhZDxxXNkdJyYdeyEGPXA,13898
|
|
4
3
|
deriva_ml/dataset.py,sha256=xC6QPUp4MZcJiEnOEU3NnzoLBL9RcJWtPTyzIQP0Ivw,60666
|
|
5
4
|
deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
|
|
6
5
|
deriva_ml/dataset_bag.py,sha256=e6IHv3saZUnZRfl0EjfnlV2NnmPeOagYYv3PuZqS1l0,11501
|
|
7
6
|
deriva_ml/demo_catalog.py,sha256=xQPhFlflqwJskNQrQ-jdBSnGzBm2-aONBgcRxfsdNKM,11045
|
|
8
7
|
deriva_ml/deriva_definitions.py,sha256=pZLPoUxiuJ-uGglmQ6sF9oVXsSUuOnPEqywoec78XNM,8893
|
|
9
|
-
deriva_ml/deriva_ml_base.py,sha256=
|
|
8
|
+
deriva_ml/deriva_ml_base.py,sha256=aVyGsFERZtpjNxfaVYzvKa7J0Ma-U3DEibfjnbr7lFQ,43817
|
|
10
9
|
deriva_ml/deriva_model.py,sha256=LV3FjIhIlz13ckZSmu0aOJhT9EVE0-M9oVMudfkxb0g,12004
|
|
11
10
|
deriva_ml/execution.py,sha256=VlapQGPDQI2MOmYnA5-hpf-XM6Fu4hPLpFjNN5q9Udo,29889
|
|
12
11
|
deriva_ml/execution_configuration.py,sha256=bjnZwXN6M7YPy5dFQwoGEBU8YjhQRSe1FW0rL0V9TaM,3422
|
|
@@ -26,9 +25,9 @@ deriva_ml/schema_setup/annotations.py,sha256=Uogm9YkRtoKSdgfQlICqRywbCATppwBO-Xr
|
|
|
26
25
|
deriva_ml/schema_setup/create_schema.py,sha256=jwziMWJPbjRgjiRBT-KtidnXI8YNEFO74A9fwfptjHY,10626
|
|
27
26
|
deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
|
|
28
27
|
deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
|
|
29
|
-
deriva_ml-1.8.
|
|
30
|
-
deriva_ml-1.8.
|
|
31
|
-
deriva_ml-1.8.
|
|
32
|
-
deriva_ml-1.8.
|
|
33
|
-
deriva_ml-1.8.
|
|
34
|
-
deriva_ml-1.8.
|
|
28
|
+
deriva_ml-1.8.10.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
29
|
+
deriva_ml-1.8.10.dist-info/METADATA,sha256=Mhx0joyR1gPEX8G6ZoEpvxNVW4sUG9C_S5TIA6ueZKk,670
|
|
30
|
+
deriva_ml-1.8.10.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
|
|
31
|
+
deriva_ml-1.8.10.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
|
|
32
|
+
deriva_ml-1.8.10.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
|
|
33
|
+
deriva_ml-1.8.10.dist-info/RECORD,,
|
deriva_ml/VERSION.py
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.8.5"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|