deriva-ml 1.14.0__py3-none-any.whl → 1.14.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +25 -30
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1489 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +4 -0
- deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
- deriva_ml/{dataset.py → dataset/dataset.py} +405 -428
- deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
- deriva_ml/{history.py → dataset/history.py} +51 -33
- deriva_ml/{upload.py → dataset/upload.py} +48 -70
- deriva_ml/demo_catalog.py +233 -183
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/{execution.py → execution/execution.py} +365 -252
- deriva_ml/execution/execution_configuration.py +163 -0
- deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
- deriva_ml/feature.py +83 -46
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
- deriva_ml/{database_model.py → model/database.py} +52 -74
- deriva_ml/model/sql_mapper.py +44 -0
- deriva_ml/run_notebook.py +19 -11
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/{schema_setup → schema}/annotations.py +31 -22
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
- deriva_ml-1.14.26.dist-info/RECORD +40 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
- deriva_ml/deriva_definitions.py +0 -391
- deriva_ml/deriva_ml_base.py +0 -1046
- deriva_ml/execution_environment.py +0 -139
- deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/test-files/execution-parameters.json +0 -1
- deriva_ml/test-files/notebook-parameters.json +0 -5
- deriva_ml/test_functions.py +0 -141
- deriva_ml/test_notebook.ipynb +0 -197
- deriva_ml-1.14.0.dist-info/RECORD +0 -31
- /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
- /deriva_ml/{schema_setup → schema}/policy.json +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
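
Most of the change between these two versions is a reorganization of the flat modules into core, dataset, execution, model, and schema subpackages, as the renames above show. The sketch below illustrates imports against the new layout, using only module paths that appear in the renames and in the hunks that follow; it is not a complete map of the public API.

    # Import paths taken from the 1.14.26 sources shown below; a sketch, not an API reference.
    from deriva_ml.core.definitions import RID, VocabularyTerm
    from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm
    from deriva_ml.feature import Feature
    from deriva_ml.model.sql_mapper import SQLMapper
    from deriva_ml.dataset.dataset_bag import DatasetBag  # path inferred from the dataset_bag.py rename above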
deriva_ml/{dataset_bag.py → dataset/dataset_bag.py}

@@ -2,20 +2,32 @@
 The module implements the sqllite interface to a set of directories representing a dataset bag.
 """
 
-from
-
+from __future__ import annotations
+
+import sqlite3
 
+# Standard library imports
 from collections import defaultdict
 from copy import copy
-from typing import Any, Generator,
+from typing import TYPE_CHECKING, Any, Generator, Iterable, cast
+
+import deriva.core.datapath as datapath
 
+# Third-party imports
 import pandas as pd
-
-
-from .
+
+# Deriva imports
+from deriva.core.ermrest_model import Column, Table
+from pydantic import ConfigDict, validate_call
+
+# Local imports
+from deriva_ml.core.definitions import RID, VocabularyTerm
+from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm
+from deriva_ml.feature import Feature
+from deriva_ml.model.sql_mapper import SQLMapper
 
 if TYPE_CHECKING:
-    from .
+    from deriva_ml.model.database import DatabaseModel
 
 try:
     from icecream import ic
@@ -24,41 +36,41 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
 
 
 class DatasetBag:
-    """
-
+    """
+    DatasetBag is a class that manages a materialized bag. It is created from a locally materialized
+    BDBag for a dataset_table, which is created either by DerivaML.create_execution, or directly by
+    calling DerivaML.download_dataset.
 
-    A general a bag may contain multiple datasets, if the dataset is nested. The DatasetBag is used to
-    one of the datasets in the bag.
+    A general a bag may contain multiple datasets, if the dataset is nested. The DatasetBag is used to
+    represent only one of the datasets in the bag.
 
     All the metadata associated with the dataset is stored in a SQLLite database that can be queried using SQL.
 
-    Attributes
+    Attributes:
         dataset_rid (RID): RID for the specified dataset
         version: The version of the dataset
         model (DatabaseModel): The Database model that has all the catalog metadata associated with this dataset.
        database:
-        dbase (Connection): connection to the sqlite database holding table values
+        dbase (sqlite3.Connection): connection to the sqlite database holding table values
        domain_schema (str): Name of the domain schema
    """
 
-
-    def __init__(
-        self, database_model: "DatabaseModel", dataset_rid: Optional[RID]
-    ) -> None:
+    def __init__(self, database_model: DatabaseModel, dataset_rid: RID | None = None) -> None:
        """
        Initialize a DatasetBag instance.
 
        Args:
            database_model: Database version of the bag.
+            dataset_rid: Optional RID for the dataset.
        """
-
        self.model = database_model
-        self.database = self.model.dbase
+        self.database = cast(sqlite3.Connection, self.model.dbase)
 
        self.dataset_rid = dataset_rid or self.model.dataset_rid
-        self.
-
-
+        if not self.dataset_rid:
+            raise DerivaMLException("No dataset RID provided")
+
+        self.model.rid_lookup(self.dataset_rid)  # Check to make sure that this dataset is in the bag.
 
        self.version = self.model.dataset_version(self.dataset_rid)
        self._dataset_table = self.model.dataset_table
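
With the new signature above, a DatasetBag is built from a DatabaseModel and an optional dataset RID, and __init__ now raises DerivaMLException when no RID can be resolved. A minimal sketch, assuming a DatabaseModel has already been obtained from a locally materialized bag (how that object is produced is outside this hunk):

    # Sketch only: `database_model` is assumed to come from a materialized BDBag
    # (see the class docstring above for how bags are downloaded).
    from deriva_ml.dataset.dataset_bag import DatasetBag

    def open_bag(database_model, dataset_rid=None) -> DatasetBag:
        # dataset_rid defaults to the bag's own dataset; __init__ raises
        # DerivaMLException if no RID can be resolved.
        bag = DatasetBag(database_model, dataset_rid)
        print(bag.dataset_rid, bag.version)
        return bag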
@@ -75,20 +87,24 @@ class DatasetBag:
         return self.model.list_tables()
 
     def _dataset_table_view(self, table: str) -> str:
+        """Return a SQL command that will return all of the elements in the specified table that are associated with
+        dataset_rid"""
+
         table_name = self.model.normalize_table_name(table)
+
+        # Get the names of the columns in the table.
         with self.database as dbase:
             select_args = ",".join(
-                [
-                    f'"{table_name}"."{c[1]}"'
-                    for c in dbase.execute(
-                        f'PRAGMA table_info("{table_name}")'
-                    ).fetchall()
-                ]
+                [f'"{table_name}"."{c[1]}"' for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()]
             )
+
+            # Get the list of datasets in the bag including the dataset itself.
             datasets = ",".join(
-                [f'"{self.dataset_rid}"']
-                + [f'"{ds.dataset_rid}"' for ds in self.list_dataset_children(recurse=True)]
+                [f'"{self.dataset_rid}"'] + [f'"{ds.dataset_rid}"' for ds in self.list_dataset_children(recurse=True)]
             )
+
+            # Find the paths that terminate in the table we are looking for
+            # Assemble the ON clause by looking at each table pair, and looking up the FK columns that connect them.
             paths = [
                 (
                     [f'"{self.model.normalize_table_name(t.name)}"' for t in p],
@@ -99,21 +115,23 @@ class DatasetBag:
             ]
 
             sql = []
-            dataset_table_name = (
-                f'"{self.model.normalize_table_name(self._dataset_table.name)}"'
-            )
+            dataset_table_name = f'"{self.model.normalize_table_name(self._dataset_table.name)}"'
 
             def column_name(col: Column) -> str:
                 return f'"{self.model.normalize_table_name(col.table.name)}"."{col.name}"'
 
             for ts, on in paths:
                 tables = " JOIN ".join(ts)
-                on_expression = " and ".join(
-                    [f"{column_name(left)}={column_name(right)}" for left, right in on]
-                )
+                on_expression = " and ".join([f"{column_name(left)}={column_name(right)}" for left, right in on])
                 sql.append(
-                    f"SELECT {select_args} FROM {tables}
+                    f"SELECT {select_args} FROM {tables} "
+                    f"{'ON ' + on_expression if on_expression else ''} "
+                    f"WHERE {dataset_table_name}.RID IN ({datasets})"
                 )
+            if table_name == self.model.normalize_table_name(self._dataset_table.name):
+                sql.append(
+                    f"SELECT {select_args} FROM {dataset_table_name} WHERE {dataset_table_name}.RID IN ({datasets})"
+                )
             sql = " UNION ".join(sql) if len(sql) > 1 else sql[0]
             return sql
 
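
For orientation, the statement assembled above is a UNION of one SELECT per join path that ends at the requested table, each filtered to the dataset RIDs present in the bag. An illustrative sketch of the shape only, with made-up schema, table, and RID names:

    # Hypothetical shape; "demo:Image", "deriva-ml:Dataset_Image" and the RIDs are
    # invented for illustration, not captured from the package.
    example_sql = (
        'SELECT "demo:Image"."RID","demo:Image"."URL" '
        'FROM "deriva-ml:Dataset" JOIN "deriva-ml:Dataset_Image" JOIN "demo:Image" '
        'ON "deriva-ml:Dataset"."RID"="deriva-ml:Dataset_Image"."Dataset" '
        'and "deriva-ml:Dataset_Image"."Image"="demo:Image"."RID" '
        'WHERE "deriva-ml:Dataset".RID IN ("1-A123","1-A456")'
    )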
@@ -157,120 +175,105 @@ class DatasetBag:
         Returns:
             A generator producing dictionaries containing the contents of the specified table as name/value pairs.
         """
+
         table_name = self.model.normalize_table_name(table)
-
-
-
-                for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()
-            ]
+        schema, table = table_name.split(":")
+        with self.database as _dbase:
+            mapper = SQLMapper(self.model, table)
         result = self.database.execute(self._dataset_table_view(table))
         while row := result.fetchone():
-            yield
+            yield mapper.transform_tuple(row)
 
     @validate_call
-    def list_dataset_members(self, recurse: bool = False) -> dict[str, dict[str,
-        """Return a list of entities associated with a specific
+    def list_dataset_members(self, recurse: bool = False) -> dict[str, list[dict[str, Any]]]:
+        """Return a list of entities associated with a specific dataset.
 
         Args:
-            recurse:
+            recurse: Whether to include nested datasets.
 
         Returns:
-            Dictionary of entities associated with
-            were taken.
+            Dictionary of entities associated with the dataset.
         """
 
         # Look at each of the element types that might be in the _dataset_table and get the list of rid for them from
         # the appropriate association table.
         members = defaultdict(list)
         for assoc_table in self._dataset_table.find_associations():
-
-
-
+            member_fkey = assoc_table.other_fkeys.pop()
+            if member_fkey.pk_table.name == "Dataset" and member_fkey.foreign_key_columns[0].name != "Nested_Dataset":
+                # Sometimes find_assoc gets confused on Dataset_Dataset.
+                member_fkey = assoc_table.self_fkey
+
+            target_table = member_fkey.pk_table
             member_table = assoc_table.table
 
-            if (
-                target_table.
-                and target_table != self._dataset_table
+            if target_table.schema.name != self.model.domain_schema and not (
+                target_table == self._dataset_table or target_table.name == "File"
             ):
                 # Look at domain tables and nested datasets.
                 continue
-            if target_table == self._dataset_table:
-                # find_assoc gives us the keys in the wrong position, so swap.
-                self_fkey, other_fkey = other_fkey, self_fkey
             sql_target = self.model.normalize_table_name(target_table.name)
             sql_member = self.model.normalize_table_name(member_table.name)
 
             # Get the names of the columns that we are going to need for linking
-            member_link = tuple(
-                c.name for c in next(iter(other_fkey.column_map.items()))
-            )
-
+            member_link = tuple(c.name for c in next(iter(member_fkey.column_map.items())))
             with self.database as db:
-                col_names = [
-                    c[1]
-                    for c in db.execute(f'PRAGMA table_info("{sql_target}")').fetchall()
-                ]
+                col_names = [c[1] for c in db.execute(f'PRAGMA table_info("{sql_target}")').fetchall()]
                 select_cols = ",".join([f'"{sql_target}".{c}' for c in col_names])
                 sql_cmd = (
                     f'SELECT {select_cols} FROM "{sql_member}" '
                     f'JOIN "{sql_target}" ON "{sql_member}".{member_link[0]} = "{sql_target}".{member_link[1]} '
                     f'WHERE "{self.dataset_rid}" = "{sql_member}".Dataset;'
                 )
-
-
-                ]
-                members[target_table.name].extend(target_entities)
-
-                target_entities = []  # path.entities().fetch()
+                mapper = SQLMapper(self.model, sql_target)
+                target_entities = [mapper.transform_tuple(e) for e in db.execute(sql_cmd).fetchall()]
             members[target_table.name].extend(target_entities)
-            if recurse and target_table.name == self._dataset_table:
+            if recurse and (target_table.name == self._dataset_table.name):
                 # Get the members for all the nested datasets and add to the member list.
                 nested_datasets = [d["RID"] for d in target_entities]
                 for ds in nested_datasets:
-
-
-                    ).items():
+                    nested_dataset = self.model.get_dataset(ds)
+                    for k, v in nested_dataset.list_dataset_members(recurse=recurse).items():
                         members[k].extend(v)
         return dict(members)
 
     def find_features(self, table: str | Table) -> Iterable[Feature]:
-        """
+        """Find features for a table.
+
         Args:
             table: The table to find features for.
-            table: Table | str:
 
         Returns:
-            An iterable of
+            An iterable of Feature instances.
         """
         return self.model.find_features(table)
 
-
-
-        self, table: Table | str, feature_name: str
-    ) -> datapath._ResultSet:
-        """Return a datapath ResultSet containing all values of a feature associated with a table.
+    def list_feature_values(self, table: Table | str, feature_name: str) -> datapath._ResultSet:
+        """Return feature values for a table.
 
         Args:
-            table:
-
-            feature_name: str:
+            table: The table to get feature values for.
+            feature_name: Name of the feature.
 
         Returns:
-
+            Feature values.
         """
         feature = self.model.lookup_feature(table, feature_name)
         feature_table = self.model.normalize_table_name(feature.feature_table.name)
+
         with self.database as db:
+            col_names = [c[1] for c in db.execute(f'PRAGMA table_info("{feature_table}")').fetchall()]
             sql_cmd = f'SELECT * FROM "{feature_table}"'
-            return db.execute(sql_cmd).fetchall()
+            return cast(datapath._ResultSet, [dict(zip(col_names, r)) for r in db.execute(sql_cmd).fetchall()])
 
-
-
-        """Given a _dataset_table RID, return a list of RIDs of any nested datasets.
+    def list_dataset_children(self, recurse: bool = False) -> list[DatasetBag]:
+        """Get nested datasets.
 
-
-
+        Args:
+            recurse: Whether to include children of children.
 
+        Returns:
+            List of child dataset bags.
         """
         ds_table = self.model.normalize_table_name("Dataset")
         nds_table = self.model.normalize_table_name("Dataset_Dataset")
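
Together these accessors form a small read-only query API over the bag's SQLite contents. A hedged usage sketch, with the "Image" table name invented for illustration and the method signatures as shown above:

    # Sketch: `bag` is a DatasetBag; "Image" is a hypothetical domain table name.
    def summarize(bag) -> None:
        members = bag.list_dataset_members(recurse=True)   # dict[str, list[dict[str, Any]]]
        for table_name, rows in members.items():
            print(table_name, len(rows))

        for row in bag.get_table_as_dict("Image"):          # generator of column/value dicts
            print(row["RID"])

        for child in bag.list_dataset_children():
            print("nested dataset:", child.dataset_rid, child.version)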
@@ -283,9 +286,7 @@ class DatasetBag:
                 f'"{nds_table}".Nested_Dataset == "{ds_table}".RID '
                 f'where "{nds_table}".Dataset == "{self.dataset_rid}"'
             )
-            nested = [
-                DatasetBag(self.model, r[0]) for r in db.execute(sql_cmd).fetchall()
-            ]
+            nested = [DatasetBag(self.model, r[0]) for r in db.execute(sql_cmd).fetchall()]
 
         result = copy(nested)
         if recurse:
@@ -293,10 +294,49 @@
             result.extend(child.list_dataset_children(recurse))
         return result
 
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def lookup_term(self, table: str | Table, term_name: str) -> VocabularyTerm:
+        """Finds a term in a vocabulary table.
+
+        Searches for a term in the specified vocabulary table, matching either the primary name
+        or any of its synonyms.
+
+        Args:
+            table: Vocabulary table to search in (name or Table object).
+            term_name: Name or synonym of the term to find.
+
+        Returns:
+            VocabularyTerm: The matching vocabulary term.
+
+        Raises:
+            DerivaMLVocabularyException: If the table is not a vocabulary table, or term is not found.
+
+        Examples:
+            Look up by primary name:
+            >>> term = ml.lookup_term("tissue_types", "epithelial")
+            >>> print(term.description)
+
+            Look up by synonym:
+            >>> term = ml.lookup_term("tissue_types", "epithelium")
+        """
+        # Get and validate vocabulary table reference
+        vocab_table = self.model.normalize_table_name(table)
+        if not self.model.is_vocabulary(table):
+            raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
+
+        # Search for term by name or synonym
+        for term in self.get_table_as_dict(vocab_table):
+            if term_name == term["Name"] or (term["Synonyms"] and term_name in term["Synonyms"]):
+                term["Synonyms"] = list(term["Synonyms"])
+                return VocabularyTerm.model_validate(term)
+
+        # Term not found
+        raise DerivaMLInvalidTerm(vocab_table, term_name)
+
 
 # Add annotations after definition to deal with forward reference issues in pydantic
 
 DatasetBag.list_dataset_children = validate_call(
-    config=
+    config=ConfigDict(arbitrary_types_allowed=True),
     validate_return=True,
 )(DatasetBag.list_dataset_children)
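
Because lookup_term raises DerivaMLInvalidTerm when a term is missing, callers probing for optional terms may prefer a small wrapper; a sketch reusing the vocabulary names from the docstring example above:

    from deriva_ml.core.exceptions import DerivaMLInvalidTerm

    def term_or_none(bag, vocab_table: str, name: str):
        # Returns the VocabularyTerm for `name`, or None if the table has no such term or synonym.
        try:
            return bag.lookup_term(vocab_table, name)
        except DerivaMLInvalidTerm:
            return None

    # e.g. term_or_none(bag, "tissue_types", "epithelium")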

deriva_ml/{history.py → dataset/history.py}

@@ -1,10 +1,30 @@
+import base64
+import struct
 from datetime import datetime
+
 from dateutil.parser import isoparse
 from deriva.core import urlquote
 
 
 # -- ==============================================================================================
 def get_record_history(server, cid, sname, tname, kvals, kcols=["RID"], snap=None):
+    """Get the history of a record from the catalog.
+
+    Args:
+        server: The server instance.
+        cid: The catalog ID.
+        sname: The schema name.
+        tname: The table name.
+        kvals: The key values to look up.
+        kcols: The key columns. Defaults to ["RID"].
+        snap: Optional snapshot ID.
+
+    Returns:
+        The history data for the record.
+
+    Raises:
+        ValueError: If more than one row is returned.
+    """
     parts = {
         "cid": urlquote(cid),
         "sname": urlquote(sname),
@@ -30,13 +50,13 @@ def get_record_history(server, cid, sname, tname, kvals, kcols=["RID"], snap=None):
     while True:
         url = path % parts
         # sys.stderr.write("%s\n" % url)
-
-        if len(
+        response_data = server.get(url).json()
+        if len(response_data) > 1:
             raise ValueError("got more than one row for %r" % url)
-        if len(
+        if len(response_data) == 0:
             # sys.stderr.write("ERROR: %s: No record found \n" % (url))
             break
-        row =
+        row = response_data[0]
         snap2rows[parts["snap"]] = row
         rows_found.append(row)
         rmt = datetime.fromisoformat(row["RMT"])
@@ -48,8 +68,15 @@
 
 # -- --------------------------------------------------------------------------------------
 def datetime_epoch_us(dt):
-    """
-
+    """Convert datetime to epoch microseconds.
+
+    Args:
+        dt: The datetime object to convert.
+
+    Returns:
+        The epoch time in microseconds.
+    """
+    return int(dt.timestamp() * 1000000)
 
 
 # -- --------------------------------------------------------------------------------------
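
The new datetime_epoch_us body simply scales the POSIX timestamp to microseconds; for example, 2024-01-01T00:00:00Z is 1,704,067,200 seconds after the epoch:

    from datetime import datetime, timezone

    # Same arithmetic as datetime_epoch_us above.
    dt = datetime(2024, 1, 1, tzinfo=timezone.utc)
    assert int(dt.timestamp() * 1000000) == 1_704_067_200_000_000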
@@ -58,34 +85,25 @@ def datetime_epoch_us(dt):
 
 
 def iso_to_snap(iso_datetime):
-
-
+    """Convert ISO datetime string to snapshot format.
+
+    Args:
+        iso_datetime: The ISO datetime string.
+
+    Returns:
+        The snapshot timestamp.
+    """
+    return datetime_epoch_us(isoparse(iso_datetime))
 
 
 # -- --------------------------------------------------------------------------------------
 def urlb32_encode(i):
-    """Encode integer
-
-
-
-
-
-
-
-
-    for d in range(1, 14):
-        if d > 2 and ((d - 1) % 4) == 0:
-            encoded_rev.append("-")
-        code = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"[raw % 32]
-        encoded_rev.append(code)
-        raw = raw // 32
-
-    while encoded_rev and encoded_rev[-1] in {"0", "-"}:
-        del encoded_rev[-1]
-
-    if not encoded_rev:
-        encoded_rev = ["0"]
-
-    encoded = reversed(encoded_rev)
-
-    return "".join(encoded)
+    """Encode an integer to URL-safe base32.
+
+    Args:
+        i: The integer to encode.
+
+    Returns:
+        The URL-safe base32 encoded string.
+    """
+    return base64.urlsafe_b64encode(struct.pack(">Q", i)).decode("ascii").rstrip("=")
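
The rewritten urlb32_encode packs the integer into eight big-endian bytes and URL-safe base64 encodes them with the padding stripped (the function keeps its earlier base32-style name). A quick check of the expression:

    import base64
    import struct

    # Same expression as the new function body; 1 packs to b"\x00" * 7 + b"\x01",
    # which encodes to "AAAAAAAAAAE=" before the padding is removed.
    assert base64.urlsafe_b64encode(struct.pack(">Q", 1)).decode("ascii").rstrip("=") == "AAAAAAAAAAE"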