deriva-ml 1.8.5__py3-none-any.whl → 1.8.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/__init__.py CHANGED
@@ -41,3 +41,12 @@ from .execution_configuration import (
41
41
  Workflow,
42
42
  )
43
43
  from .execution import Execution
44
+
45
+ from importlib.metadata import version, PackageNotFoundError
46
+
47
+ try:
48
+ __version__ = version("deriva_ml")
49
+ except PackageNotFoundError:
50
+ # package is not installed
51
+ pass
52
+
deriva_ml/deriva_ml_base.py CHANGED
@@ -13,17 +13,14 @@ from __future__ import annotations
13
13
  import getpass
14
14
  import logging
15
15
  from datetime import datetime
16
- import hashlib
17
16
  from itertools import chain
18
17
  import inspect
18
+ import setuptools_scm
19
19
  from pathlib import Path
20
20
  import requests
21
- from setuptools_git_versioning import get_latest_file_commit
22
21
  import subprocess
23
- import shutil
24
22
  from typing import Optional, Any, Iterable, TYPE_CHECKING
25
23
  from deriva.core import (
26
- ErmrestCatalog,
27
24
  get_credential,
28
25
  urlquote,
29
26
  DEFAULT_SESSION_CONFIG,
@@ -36,6 +33,7 @@ from deriva.core.ermrest_catalog import ResolveRidResult
36
33
  from deriva.core.ermrest_model import Key, Table
37
34
  from deriva.core.hatrac_store import HatracStore
38
35
  from pydantic import validate_call, ConfigDict
36
+ from requests import RequestException
39
37
 
40
38
  from .execution_configuration import ExecutionConfiguration, Workflow
41
39
  from .feature import Feature, FeatureRecord
@@ -74,6 +72,15 @@ try:
74
72
  except ImportError: # Graceful fallback if IPython isn't installed.
75
73
  get_ipython = lambda: None
76
74
 
75
+ try:
76
+ from jupyter_server.serverapp import list_running_servers
77
+ except ImportError:
78
+ list_running_servers = lambda: []
79
+
80
+ try:
81
+ from ipykernel import get_connection_file
82
+ except ImportError:
83
+ get_connection_file = lambda: ""
77
84
 
78
85
  if TYPE_CHECKING:
79
86
  from .execution import Execution
@@ -144,6 +151,8 @@ class DerivaML(Dataset):
144
151
 
145
152
  # Initialize dataset class.
146
153
  super().__init__(self.model, self.cache_dir)
154
+ self._logger = logging.getLogger("deriva_ml")
155
+ self._logger.setLevel(logging_level)
147
156
 
148
157
  self.host_name = hostname
149
158
  self.catalog_id = catalog_id
@@ -151,37 +160,11 @@ class DerivaML(Dataset):
151
160
  self.version = model_version
152
161
  self.configuration = None
153
162
  self._execution: Optional[Execution] = None
154
- self._notebook = None
155
- try:
156
- from IPython import get_ipython
157
-
158
- ipython = get_ipython()
159
- # Check if running in Jupyter's ZMQ kernel (used by notebooks)
160
- if ipython is not None and "IPKernelApp" in ipython.config:
161
- self._notebook = Path(ipython.user_ns.get("__session__"))
162
- # Check if running in Jupyter's ZMQ kernel (used by notebooks)
163
- try:
164
- if subprocess.run(
165
- [shutil.which("nbstripout"), "--is-installed"],
166
- check=False,
167
- capture_output=True,
168
- ).returncode:
169
- self._logger.warn(
170
- "nbstripout is not installed in repository. Please run nbstripout --install"
171
- )
172
- except subprocess.CalledProcessError:
173
- self._logger.error("nbstripout is not found.")
174
-
175
- except (ImportError, AttributeError):
176
- pass
177
-
163
+ self.executable_path, self._is_notebook = self._get_python_script()
178
164
  self.domain_schema = self.model.domain_schema
179
165
  self.project_name = project_name or self.domain_schema
180
-
181
166
  self.start_time = datetime.now()
182
167
  self.status = Status.pending.value
183
- self._logger = logging.getLogger("deriva_ml")
184
- self._logger.setLevel(logging_level)
185
168
 
186
169
  logging.basicConfig(
187
170
  level=logging_level,
@@ -204,6 +187,104 @@ class DerivaML(Dataset):
204
187
  except (AttributeError, requests.HTTPError):
205
188
  pass
206
189
 
190
+ def _check_nbstrip_status(self) -> None:
191
+ """Figure out if you are running in a Jupyter notebook
192
+
193
+ Returns:
194
+ A Path to the notebook file that is currently being executed.
195
+ """
196
+ try:
197
+ if subprocess.run(
198
+ ["nbstripout", "--is-installed"],
199
+ check=False,
200
+ capture_output=True,
201
+ ).returncode:
202
+ self._logger.warning(
203
+ "nbstripout is not installed in repository. Please run nbstripout --install"
204
+ )
205
+ except subprocess.CalledProcessError:
206
+ self._logger.error("nbstripout is not found.")
207
+
208
+ def _get_notebook_session(
209
+ self,
210
+ ) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
211
+ """Return the absolute path of the current notebook."""
212
+ # Get the kernel's connection file and extract the kernel ID
213
+ try:
214
+ if not (connection_file := Path(get_connection_file()).name):
215
+ return None, None
216
+ except RuntimeError:
217
+ return None, None
218
+
219
+ kernel_id = connection_file.split("-", 1)[1].split(".")[0]
220
+
221
+ # Look through the running server sessions to find the matching kernel ID
222
+ for server in list_running_servers():
223
+ try:
224
+ # If a token is required for authentication, include it in headers
225
+ token = server.get("token", "")
226
+ headers = {}
227
+ if token:
228
+ headers["Authorization"] = f"token {token}"
229
+
230
+ try:
231
+ sessions_url = server["url"] + "api/sessions"
232
+ response = requests.get(sessions_url, headers=headers)
233
+ response.raise_for_status()
234
+ sessions = response.json()
235
+ except RequestException as e:
236
+ raise e
237
+ for sess in sessions:
238
+ if sess["kernel"]["id"] == kernel_id:
239
+ return server, sess
240
+ except Exception as _e:
241
+ # Ignore servers we can't connect to.
242
+ pass
243
+ return None, None
244
+
245
+ def _get_notebook_path(self) -> Path | None:
246
+ """Return the absolute path of the current notebook."""
247
+
248
+ server, session = self._get_notebook_session()
249
+ if server and session:
250
+ self._check_nbstrip_status()
251
+ relative_path = session["notebook"]["path"]
252
+ # Join the notebook directory with the relative path
253
+ return Path(server["root_dir"]) / relative_path
254
+ else:
255
+ return None
256
+
257
+ def _get_python_script(self) -> tuple[Path, bool]:
258
+ """Return the path to the currently executing script"""
259
+ is_notebook = False
260
+ if filename := self._get_notebook_path():
261
+ is_notebook = True
262
+ else:
263
+ stack = inspect.stack()
264
+ if len(stack) > 1:
265
+ filename = Path(
266
+ stack[2].filename
267
+ ) # Get the caller's filename, which is two up the stack from here.
268
+ else:
269
+ raise DerivaMLException(
270
+ f"Looking for caller failed"
271
+ ) # Stack is too shallow
272
+ return filename, is_notebook
273
+
274
+ def _get_git_root(self):
275
+ try:
276
+ result = subprocess.run(
277
+ ["git", "rev-parse", "--show-toplevel"],
278
+ cwd=self.executable_path.parent,
279
+ stdout=subprocess.PIPE,
280
+ stderr=subprocess.DEVNULL,
281
+ text=True,
282
+ check=True,
283
+ )
284
+ return result.stdout.strip()
285
+ except subprocess.CalledProcessError:
286
+ return None # Not in a git repository
287
+
207
288
  @staticmethod
208
289
  def _get_session_config():
209
290
  """ """
@@ -227,6 +308,10 @@ class DerivaML(Dataset):
227
308
  """Get a new instance of a pathBuilder object."""
228
309
  return self.catalog.getPathBuilder()
229
310
 
311
+ def get_version(self) -> str:
312
+ """Return the version number of the executable"""
313
+ return setuptools_scm.get_version(root=self._get_git_root())
314
+
230
315
  @property
231
316
  def domain_path(self):
232
317
  """Get a new instance of a pathBuilder object to the domain schema"""
@@ -1003,6 +1088,7 @@ class DerivaML(Dataset):
1003
1088
  return workflow_rid
1004
1089
 
1005
1090
  def lookup_workflow(self, url: str) -> Optional[RID]:
1091
+ """Given a URL, look in the workflow table to find a matching workflow."""
1006
1092
  workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
1007
1093
  try:
1008
1094
  url_column = workflow_path.URL
@@ -1028,27 +1114,26 @@ class DerivaML(Dataset):
1028
1114
  """
1029
1115
  # Make sure type is correct.
1030
1116
  self.lookup_term(MLVocab.workflow_type, workflow_type)
1031
- filename, github_url, is_dirty = self._github_url()
1117
+ github_url, is_dirty = self._github_url()
1032
1118
 
1033
1119
  if is_dirty:
1034
1120
  self._logger.warning(
1035
- f"File {filename} has been modified since last commit. Consider commiting before executing"
1121
+ f"File {self.executable_path} has been modified since last commit. Consider commiting before executing"
1036
1122
  )
1037
1123
 
1038
- sha256_hash = hashlib.sha256()
1039
- if self._notebook:
1040
- # If you are in a notebook, strip out the outputs before computing the checksum.
1041
- result = subprocess.run(
1042
- ["nbstripout", "-t", filename],
1043
- capture_output=True,
1044
- text=False,
1045
- check=True,
1046
- )
1047
- sha256_hash.update(result.stdout)
1048
- else:
1049
- with open(filename, "rb") as f:
1050
- sha256_hash.update(f.read())
1051
- checksum = "SHA-256:" + sha256_hash.hexdigest()
1124
+ # If you are in a notebook, strip out the outputs before computing the checksum.
1125
+ cmd = (
1126
+ f"nbstripout {self.executable_path} | git hash-object --stdin"
1127
+ if self._is_notebook
1128
+ else f"git hash-object {self.executable_path}"
1129
+ )
1130
+ checksum = subprocess.run(
1131
+ cmd,
1132
+ capture_output=True,
1133
+ text=True,
1134
+ check=True,
1135
+ shell=True,
1136
+ ).stdout.strip()
1052
1137
 
1053
1138
  workflow = Workflow(
1054
1139
  name=name,
@@ -1059,54 +1144,38 @@ class DerivaML(Dataset):
1059
1144
  )
1060
1145
  return self.add_workflow(workflow) if create else None
1061
1146
 
1062
- def _github_url(self) -> tuple[Path, str, bool]:
1147
+ def _github_url(self) -> tuple[str, bool]:
1063
1148
  """Return a GitHUB URL for the latest commit of the script from which this routine is called.
1064
1149
 
1065
1150
  This routine is used to be called from a script or notebook (e.g. python -m file). It assumes that
1066
1151
  the file is in a gitHUB repository and commited. It returns a URL to the last commited version of this
1067
1152
  file in GitHUB.
1068
1153
 
1069
- Returns: A tuple with the filename, gethub_url and a boolean to indicated if uncommited changes
1154
+ Returns: A tuple with the gethub_url and a boolean to indicated if uncommited changes
1070
1155
  have been made to the file.
1071
1156
 
1072
1157
  """
1073
1158
 
1074
- # Get the name of the script that is calling this function.
1075
- if self._notebook:
1076
- # Try to get the __session__ variable from the user namespace.
1077
- filename = Path("").absolute().parent / self._notebook
1078
- else:
1079
- stack = inspect.stack()
1080
- if len(stack) > 1:
1081
- filename = Path(
1082
- stack[2].filename
1083
- ) # Get the caller's filename, which is two up the stack from here.
1084
- else:
1085
- raise DerivaMLException(
1086
- f"Looking for caller failed"
1087
- ) # Stack is too shallow
1088
-
1089
1159
  # Get repo URL from local github repo.
1090
1160
  try:
1091
1161
  result = subprocess.run(
1092
- ["git", "remote", "get-url", "origin"], capture_output=True, text=True
1162
+ ["git", "remote", "get-url", "origin"],
1163
+ capture_output=True,
1164
+ text=True,
1165
+ cwd=self.executable_path.parent,
1093
1166
  )
1094
1167
  github_url = result.stdout.strip().removesuffix(".git")
1095
1168
  except subprocess.CalledProcessError:
1096
1169
  raise DerivaMLException(f"No GIT remote found")
1097
1170
 
1098
1171
  # Find the root directory for the repository
1099
- repo_root = filename
1100
- while repo_root != repo_root.root:
1101
- if (repo_root / ".git").exists():
1102
- break
1103
- else:
1104
- repo_root = repo_root.parent
1172
+ repo_root = self._get_git_root()
1105
1173
 
1106
1174
  # Now check to see if file has been modified since the last commit.
1107
1175
  try:
1108
1176
  result = subprocess.run(
1109
1177
  ["git", "status", "--porcelain"],
1178
+ cwd=self.executable_path.parent,
1110
1179
  capture_output=True,
1111
1180
  text=True,
1112
1181
  check=True,
@@ -1117,9 +1186,17 @@ class DerivaML(Dataset):
1117
1186
  except subprocess.CalledProcessError:
1118
1187
  is_dirty = False # If Git command fails, assume no changes
1119
1188
 
1120
- sha = get_latest_file_commit(filename)
1121
- url = f"{github_url}/blob/{sha}/{filename.relative_to(repo_root)}"
1122
- return filename, url, is_dirty
1189
+ """Get SHA-1 hash of latest commit of the file in the repository"""
1190
+ result = subprocess.run(
1191
+ ["git", "log", "-n", "1", "--pretty=format:%H" "--", self.executable_path],
1192
+ cwd=self.executable_path.parent,
1193
+ capture_output=True,
1194
+ text=True,
1195
+ check=True,
1196
+ )
1197
+ sha = result.stdout.strip()
1198
+ url = f"{github_url}/blob/{sha}/{self.executable_path.relative_to(repo_root)}"
1199
+ return url, is_dirty
1123
1200
 
1124
1201
  # @validate_call
1125
1202
  def create_execution(self, configuration: ExecutionConfiguration) -> "Execution":
@@ -1149,6 +1226,7 @@ class DerivaML(Dataset):
1149
1226
 
1150
1227
  # @validate_call
1151
1228
  def restore_execution(self, execution_rid: Optional[RID] = None) -> "Execution":
1229
+ """Return an Execution object for a previously started execution with the specified RID."""
1152
1230
  from .execution import Execution
1153
1231
 
1154
1232
  # Find path to execution
deriva_ml/execution.py CHANGED
@@ -253,17 +253,9 @@ class Execution:
253
253
 
254
254
  def _create_notebook_checkpoint(self):
255
255
  """Trigger a checkpoint creation using Jupyter's API."""
256
- notebook_name = self._ml_object._notebook
257
-
258
- # Look for the server running this notebook.
259
- root = Path("").absolute().parent.as_posix()
260
- servers = list(list_running_servers())
261
- # Jupyterhub seems to handle root_dir differently then server case.
262
- server = (
263
- servers
264
- if len(servers) == 1
265
- else [s for s in servers if s["root_dir"] == root]
266
- )[0]
256
+
257
+ server, session = self._ml_object._get_notebook_session()
258
+ notebook_name = session["notebook"]["path"]
267
259
  notebook_url = f"{server['url']}api/contents/{notebook_name}"
268
260
 
269
261
  # Get notebook content
@@ -275,7 +267,7 @@ class Execution:
275
267
  # Execution metadata cannot be in a directory, so map path into filename.
276
268
  checkpoint_path = (
277
269
  self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
278
- / f"{notebook_name.as_posix().replace('/','_')}.checkpoint"
270
+ / f"{notebook_name.replace('/','_')}.checkpoint"
279
271
  )
280
272
  with open(checkpoint_path, "w", encoding="utf-8") as f:
281
273
  json.dump(notebook_content, f)
@@ -295,7 +287,7 @@ class Execution:
295
287
  minutes, seconds = divmod(remainder, 60)
296
288
  duration = f"{round(hours, 0)}H {round(minutes, 0)}min {round(seconds, 4)}sec"
297
289
 
298
- if self._ml_object._notebook:
290
+ if self._ml_object._is_notebook:
299
291
  self._create_notebook_checkpoint()
300
292
 
301
293
  self.update_status(Status.completed, "Algorithm execution ended.")
{deriva_ml-1.8.5.dist-info → deriva_ml-1.8.11.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deriva-ml
3
- Version: 1.8.5
3
+ Version: 1.8.11
4
4
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
5
5
  Author-email: ISRD <isrd-dev@isi.edu>
6
6
  Requires-Python: >=3.10
@@ -11,7 +11,8 @@ Requires-Dist: pandas
11
11
  Requires-Dist: regex~=2024.7.24
12
12
  Requires-Dist: pydantic>=2.10.6
13
13
  Requires-Dist: semver>3.0.0
14
- Requires-Dist: setuptools-git-versioning<3,>=2.0
14
+ Requires-Dist: setuptools>=64
15
+ Requires-Dist: setuptools-scm<=6.0
15
16
  Requires-Dist: nbstripout
16
17
  Dynamic: license-file
17
18
 
{deriva_ml-1.8.5.dist-info → deriva_ml-1.8.11.dist-info}/RECORD CHANGED
@@ -1,14 +1,13 @@
1
- deriva_ml/VERSION.py,sha256=Dtbi4ISKI_kkTsaWuM-q8NfE3DySQu91TTP04Yhd8d8,22
2
- deriva_ml/__init__.py,sha256=DyHiqklSer7q7oPGAemkzg5Qcq2swMZf9ALwJhGf6Jo,905
1
+ deriva_ml/__init__.py,sha256=r1Z9N5vtZkAET7emqhpAx2bf_xJUp5wHOc4_DIplsG8,1082
3
2
  deriva_ml/database_model.py,sha256=uhoyVyd8MQmY8J9ovCH8fjxhZDxxXNkdJyYdeyEGPXA,13898
4
3
  deriva_ml/dataset.py,sha256=xC6QPUp4MZcJiEnOEU3NnzoLBL9RcJWtPTyzIQP0Ivw,60666
5
4
  deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
6
5
  deriva_ml/dataset_bag.py,sha256=e6IHv3saZUnZRfl0EjfnlV2NnmPeOagYYv3PuZqS1l0,11501
7
6
  deriva_ml/demo_catalog.py,sha256=xQPhFlflqwJskNQrQ-jdBSnGzBm2-aONBgcRxfsdNKM,11045
8
7
  deriva_ml/deriva_definitions.py,sha256=pZLPoUxiuJ-uGglmQ6sF9oVXsSUuOnPEqywoec78XNM,8893
9
- deriva_ml/deriva_ml_base.py,sha256=e2UtT3TlDpFQrG6z0DaB2iV22wmi4TLP7qXF3hvb8to,42868
8
+ deriva_ml/deriva_ml_base.py,sha256=KbmJ0-mGuZn7-wuzQemzof8o8mA_3-UVlQCrnDsKod0,45741
10
9
  deriva_ml/deriva_model.py,sha256=LV3FjIhIlz13ckZSmu0aOJhT9EVE0-M9oVMudfkxb0g,12004
11
- deriva_ml/execution.py,sha256=VlapQGPDQI2MOmYnA5-hpf-XM6Fu4hPLpFjNN5q9Udo,29889
10
+ deriva_ml/execution.py,sha256=uDblqngcldgR7X4W1PfMV4iPWkxwQYSr9CBmXNlIv1E,29572
12
11
  deriva_ml/execution_configuration.py,sha256=bjnZwXN6M7YPy5dFQwoGEBU8YjhQRSe1FW0rL0V9TaM,3422
13
12
  deriva_ml/execution_environment.py,sha256=bCRKrCELDbGQDo7_FKfw7e8iMzVjSRZK3baKkqH5-_0,3264
14
13
  deriva_ml/feature.py,sha256=7e8WYPCfJSrGxJh9oUTduYSnB5ekybRhXa_0HIigS_w,5459
@@ -26,9 +25,9 @@ deriva_ml/schema_setup/annotations.py,sha256=Uogm9YkRtoKSdgfQlICqRywbCATppwBO-Xr
26
25
  deriva_ml/schema_setup/create_schema.py,sha256=jwziMWJPbjRgjiRBT-KtidnXI8YNEFO74A9fwfptjHY,10626
27
26
  deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
28
27
  deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
29
- deriva_ml-1.8.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
30
- deriva_ml-1.8.5.dist-info/METADATA,sha256=uuAq67MUZyY2LP8NbZ8RMJ5q-aX3pJV5ioYZqqLbuFA,653
31
- deriva_ml-1.8.5.dist-info/WHEEL,sha256=tTnHoFhvKQHCh4jz3yCn0WPTYIy7wXx3CJtJ7SJGV7c,91
32
- deriva_ml-1.8.5.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
33
- deriva_ml-1.8.5.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
34
- deriva_ml-1.8.5.dist-info/RECORD,,
28
+ deriva_ml-1.8.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
29
+ deriva_ml-1.8.11.dist-info/METADATA,sha256=RyttxTkz_MCnWX1hZK2g7ffPyd54txE6AAZ2GMSpQ54,670
30
+ deriva_ml-1.8.11.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
31
+ deriva_ml-1.8.11.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
32
+ deriva_ml-1.8.11.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
33
+ deriva_ml-1.8.11.dist-info/RECORD,,
{deriva_ml-1.8.5.dist-info → deriva_ml-1.8.11.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (77.0.1)
2
+ Generator: setuptools (77.0.3)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
deriva_ml/VERSION.py DELETED
@@ -1 +0,0 @@
1
- __version__ = "1.8.5"