deriva-ml 1.16.0__py3-none-any.whl → 1.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/test.py ADDED
@@ -0,0 +1,94 @@
+ from typing import Any, Type
+ from deriva_ml import RID, DerivaMLException
+ from sqlalchemy import UniqueConstraint, inspect
+ from collections import defaultdict
+ from graphlib import CycleError, TopologicalSorter
+
+ def _prepare_wide_table(self, dataset, dataset_rid: RID, include_tables: list[str]) -> tuple:
+     """
+     Generate the details of a wide (denormalized) table from the model.
+
+     Args:
+         include_tables (list[str] | None): List of table names to include in the denormalized dataset. If None,
+             all tables from the dataset will be included.
+
+     Returns:
+         tuple: Join tables, join conditions, denormalized columns, dataset RIDs, and dataset element tables.
+     """
+
+     # Skip over tables that we don't want to include in the denormalized dataset.
+     # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the denormalized
+     # table.
+     include_tables = set(include_tables)
+     for t in include_tables:
+         # Check to make sure the table is in the catalog.
+         _ = self.name_to_table(t)
+
+     table_paths = [
+         path
+         for path in self._schema_to_paths()
+         if path[-1].name in include_tables and include_tables.intersection({p.name for p in path})
+     ]
+     paths_by_element = defaultdict(list)
+     for p in table_paths:
+         paths_by_element[p[2].name].append(p)
+
+     # Get the names of all of the tables that can be dataset elements.
+     dataset_element_tables = {e.name for e in self.list_dataset_element_types() if e.schema.name == self.domain_schema}
+
+     skip_columns = {"RCT", "RMT", "RCB", "RMB"}
+     join_conditions = {}
+     join_tables = {}
+     for element_table, paths in paths_by_element.items():
+         graph = {}
+         for path in paths:
+             for left, right in zip(path[0:], path[1:]):
+                 graph.setdefault(left.name, set()).add(right.name)
+
+         # Now let's remove any cycles that we may have in the graph.
+         # We will use a topological sort to find the order in which we need to join the tables.
+         # If we find a cycle, we will remove the table from the graph and splice in an additional ON clause.
+         # We will then repeat the process until there are no cycles.
+         graph_has_cycles = True
+         element_join_tables = []
+         element_join_conditions = {}
+         while graph_has_cycles:
+             try:
+                 ts = TopologicalSorter(graph)
+                 element_join_tables = list(reversed(list(ts.static_order())))
+                 graph_has_cycles = False
+             except CycleError as e:
+                 cycle_nodes = e.args[1]
+                 if len(cycle_nodes) > 3:
+                     raise DerivaMLException(f"Unexpected cycle found when normalizing dataset {cycle_nodes}")
+                 # Remove cycle from graph and splice in additional ON constraint.
+                 graph[cycle_nodes[1]].remove(cycle_nodes[0])
+
+         # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+         if "Dataset_Version" in element_join_tables:
+             element_join_tables.remove("Dataset_Version")
+
+         for path in paths:
+             for left, right in zip(path[0:], path[1:]):
+                 if right.name == "Dataset_Version":
+                     # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+                     continue
+                 if element_join_tables.index(right.name) < element_join_tables.index(left.name):
+                     continue
+                 table_relationship = self._table_relationship(left, right)
+                 element_join_conditions.setdefault(right.name, set()).add((table_relationship[0], table_relationship[1]))
+         join_tables[element_table] = element_join_tables
+         join_conditions[element_table] = element_join_conditions
+     # Get the list of columns that will appear in the final denormalized dataset.
+     denormalized_columns = [
+         (table_name, c.name)
+         for table_name in join_tables
+         if not self.is_association(table_name)  # Don't include association columns in the denormalized view.
+         for c in self.name_to_table(table_name).columns
+         if (not include_tables or table_name in include_tables) and (c.name not in skip_columns)
+     ]
+
+     # List of dataset ids to include in the denormalized view.
+     dataset_rids = [dataset_rid] + dataset.list_dataset_children(recurse=True)
+     return join_tables, join_conditions, denormalized_columns, dataset_rids, dataset_element_tables
+
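The cycle handling in the new _prepare_wide_table leans on graphlib.TopologicalSorter: it repeatedly asks for a static order and, when a CycleError is raised, drops one edge of the reported cycle and tries again. A minimal standalone sketch of that pattern, using made-up table names that are not part of the package:

    from graphlib import CycleError, TopologicalSorter

    # Hypothetical join graph: each key maps to the tables it must be joined after.
    graph = {
        "Image": {"Subject"},
        "Subject": {"Image"},  # deliberate two-node cycle for illustration
        "Diagnosis": {"Subject"},
    }

    order = None
    while order is None:
        try:
            # static_order() raises CycleError while the graph still contains a cycle.
            order = list(reversed(list(TopologicalSorter(graph).static_order())))
        except CycleError as exc:
            # The offending nodes are reported in exc.args[1]; drop one edge of the
            # two-node cycle and retry, mirroring the loop in _prepare_wide_table above.
            cycle = exc.args[1]
            graph[cycle[1]].remove(cycle[0])

    print(order)  # e.g. ['Diagnosis', 'Subject', 'Image']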
{deriva_ml-1.16.0.dist-info → deriva_ml-1.17.0.dist-info}/METADATA RENAMED
@@ -1,9 +1,9 @@
  Metadata-Version: 2.4
  Name: deriva-ml
- Version: 1.16.0
+ Version: 1.17.0
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
  Author-email: ISRD <isrd-dev@isi.edu>
- Requires-Python: >=3.10
+ Requires-Python: >=3.11
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: bump-my-version
@@ -12,16 +12,18 @@ Requires-Dist: deriva~=1.7.10
  Requires-Dist: deepdiff
  Requires-Dist: nbconvert
  Requires-Dist: pandas
- Requires-Dist: regex~=2024.7.24
+ Requires-Dist: pip-system-certs
  Requires-Dist: pydantic>=2.11
- Requires-Dist: semver>3.0.0
- Requires-Dist: setuptools>=64
- Requires-Dist: setuptools-scm>=8.0
- Requires-Dist: nbstripout
  Requires-Dist: papermill
  Requires-Dist: pandas-stubs==2.2.3.250527
  Requires-Dist: pyyaml
+ Requires-Dist: regex~=2024.7.24
+ Requires-Dist: semver>3.0.0
+ Requires-Dist: setuptools>=80
+ Requires-Dist: setuptools-scm>=8.0
+ Requires-Dist: nbstripout
  Requires-Dist: hydra_zen
+ Requires-Dist: SQLAlchemy
  Dynamic: license-file

  # DerivaML
{deriva_ml-1.16.0.dist-info → deriva_ml-1.17.0.dist-info}/RECORD RENAMED
@@ -1,11 +1,13 @@
- deriva_ml/__init__.py,sha256=Yt8q0WbLFt7fbRLZe_f0bJWy1Qo6vidQzlYWQoT8U7o,2097
+ deriva_ml/.DS_Store,sha256=gb-f5IXVed_gS5Be1Z6WxCYjrI_r5SdblvfFpIOY4ro,8196
+ deriva_ml/__init__.py,sha256=YCG7P4PUtO_b-aIIYb4KhKHcfnb8Wz_YeAL-c0HiQlA,1775
  deriva_ml/bump_version.py,sha256=eN2G5G_OeiuFxhOdjjwfxD8Rmv6dFvzIm0y_1x4Mif4,4020
- deriva_ml/demo_catalog.py,sha256=6hlSVGNQ364chisKvSyMy2BBxzhQq1mLPPlW324eca4,14931
+ deriva_ml/demo_catalog.py,sha256=FfXPlDfzy29K9g2Fr_KmYyRhmxP2eSaqm8_Xcji8fUM,15352
  deriva_ml/feature.py,sha256=6-aphkxdKjWa9oPSGFWxHcwAc_8hmWj-7I4M178YG5Y,8470
  deriva_ml/install_kernel.py,sha256=b62XY0SLViYO_Zye5r1Pl9qhYZyu_fk4KAO8NS1pxgM,2165
  deriva_ml/run_notebook.py,sha256=_pds1q3WcfWqhCBqKeznbwSv5n7OND8FkL6JQ2Jkfmc,8093
+ deriva_ml/test.py,sha256=BqmQXR9IyQP9h8pWttk0dzyJod2CwcfYbSUZS-Q5r4k,4460
  deriva_ml/core/__init__.py,sha256=Ko8GsWc7K_eDFW0-GaNS6gOWYP8cWHWir-ChSQaHntE,856
- deriva_ml/core/base.py,sha256=xsz1h5QZVE7PCVZiCt7lRV43Dupq9c7elUsbGk3QHJQ,61919
+ deriva_ml/core/base.py,sha256=KzZW310J0YmvCUhuCWxd42LNCM_JSzR__ObtT7zgcsU,62525
  deriva_ml/core/config.py,sha256=dF4rOLFmbk1DEkQimqbiH4pC519nRZWpwKItARNMiZ4,2244
  deriva_ml/core/constants.py,sha256=6wBJ8qMxe-dbCjRGrjUIX-RK0mTWrLDTeUpaVbLFoM8,888
  deriva_ml/core/definitions.py,sha256=uq_8uYFBVBVHS691Ri2kdQsN37z0GNYTaZskJIb_ocM,1385
@@ -13,21 +15,20 @@ deriva_ml/core/enums.py,sha256=sSN4B4OynbB-AXwxRszoFr-KWIWIAfhVa06EzAEHwVc,7194
  deriva_ml/core/ermrest.py,sha256=N0IJ3TE87jElaBChEIo5AFDTr0SIrb6F90yiimRfPr4,10182
  deriva_ml/core/exceptions.py,sha256=4MZNPOyN-UMaGeY9sqJDVwh_iOmz1ntp4usSyCNqVMg,934
  deriva_ml/core/filespec.py,sha256=BQAAcRXfXq1lDcsKlokLOOXCBtEZpPgXxrFOIZYAgLg,4229
- deriva_ml/dataset/__init__.py,sha256=tV3yK9tb8iB9f5P3ml459bP2uPWJhCJcplhmbGVtoMI,411
- deriva_ml/dataset/aux_classes.py,sha256=K-cVBrZY1j0ZO__FORHRVdVz3O69OgvhO5YkhwJJyxE,7348
- deriva_ml/dataset/dataset.py,sha256=c6hGsIH9UOn8ayDP7EsYzqgKeZm2Kr7naliPLQxGtSg,64473
- deriva_ml/dataset/dataset_bag.py,sha256=peFEMU8PfExbzJ0VJGIL3QDIPz0stmUR7daCXptA3f4,20256
+ deriva_ml/dataset/__init__.py,sha256=wTCQaWGfRYEiUoOOxerKSpkbl1T5YFhoCyemlxGTk8k,283
+ deriva_ml/dataset/aux_classes.py,sha256=ojqe7gyK4KQVz_xfIillXS_HJ1PMsyr47pb2tFOXO_c,7855
+ deriva_ml/dataset/dataset.py,sha256=d860WuCL0-Pz6TyRpGVzhpPWDMco01-I5LT4dZjYxsQ,64728
+ deriva_ml/dataset/dataset_bag.py,sha256=ori3BuYVqfeHkVCjNSKuZh7oMdC6uufsszicpTPODiw,19944
  deriva_ml/dataset/history.py,sha256=FK5AYYz11p4E4FWMVg4r7UPWOD4eobrq3b3xMjWF59g,3197
- deriva_ml/dataset/upload.py,sha256=Q9bNVv6xTK_IpwFOU_ugq33IWRs0AWyFoF8Rzwi6OVs,16430
- deriva_ml/execution/__init__.py,sha256=Zs-ZNmwrJJW6suJilzh3vdcPvzI8HIA0Ym0VUwuiQME,668
+ deriva_ml/dataset/upload.py,sha256=n1aXSbOx1hghCDxuF8yf03jZmOLMueXL-rSnQMrfHq0,16535
+ deriva_ml/execution/__init__.py,sha256=5kKpPwQbxhmRn7Npz7DpavuCxYwCQaDdl4-6z62hbds,705
  deriva_ml/execution/environment.py,sha256=B7nywqxFTRUWgyu8n7rFoKcVC9on422kjeFG2FPQfvg,9302
- deriva_ml/execution/execution.py,sha256=X4HBADT_F5ZuER8qBcnNYqRUuMU3BaEV7rMgXEUrLCg,46096
- deriva_ml/execution/execution_configuration.py,sha256=oWgBueuFO0-PBm9LM08EQeFeY9IXF8tVbd3LyRsTiNw,5437
- deriva_ml/execution/workflow.py,sha256=7CwPrgs3FKQHiEVus0PpK9w5hVKLKZnCrlu_nT8GFe8,13604
+ deriva_ml/execution/execution.py,sha256=hNBfYnqXK4PmNS2wxbJ5oYzjDszjaiGHo8d3uxmIgPk,46210
+ deriva_ml/execution/execution_configuration.py,sha256=RT0x9n0uyJgEsrLCUTu16nAUJN7X-XLDvfezln0PTDQ,5775
+ deriva_ml/execution/workflow.py,sha256=rTlspICp2Q6prUwPCeukjhO64rbcJivcFs4zH60B16U,13906
  deriva_ml/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deriva_ml/model/catalog.py,sha256=TY6QdlhZX7OL5bhWcGkAFpZNaZye5l_rkb1Cih-bTjs,19180
- deriva_ml/model/database.py,sha256=KEPJKIlmIxTiF4Th1NgpuuuMBhbfsgsd_k8UHs-hMg4,14843
- deriva_ml/model/sql_mapper.py,sha256=_0QsJEVSgSPtxrWKSgjfPZCQ1aMVcjR_Tk2OxLhWEvY,1696
+ deriva_ml/model/catalog.py,sha256=O6_Ll4Uxg6DyxoBXT9P9CPTt9jx1guVTeX1L3KW1A5c,19645
+ deriva_ml/model/database.py,sha256=BG5FSisl9tWTBnf5k9dNnijOIDyCUDeRhN_inkmIqTw,31132
  deriva_ml/protocols/dataset.py,sha256=1TyaT--89Elcs-nCvVyJxUj4cDaLztZOuSOzzj1cBMk,699
  deriva_ml/schema/__init__.py,sha256=yV-MfzCF3FA4OOz7mZwMM2q6-x1vgOJ057kUvikFF6E,130
  deriva_ml/schema/annotations.py,sha256=CMcRqYUlyW8iLCYp6sYJsncaRNtp4kFKoxcg-i-t-50,18302
@@ -36,9 +37,9 @@ deriva_ml/schema/create_schema.py,sha256=9qK9_8SRQT-DwcEwTGSkhi3j2NaoH5EVgthvV2k
  deriva_ml/schema/deriva-ml-reference.json,sha256=AEOMIgwKO3dNMMWHb0lxaXyamvfAEbUPh8qw0aAtsUQ,242460
  deriva_ml/schema/policy.json,sha256=5ykB8nnZFl-oCHzlAwppCFKJHWJFIkYognUMVEanfY8,1826
  deriva_ml/schema/table_comments_utils.py,sha256=4flCqnZAaqg_uSZ9I18pNUWAZoLfmMCXbmI5uERY5vM,2007
- deriva_ml-1.16.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- deriva_ml-1.16.0.dist-info/METADATA,sha256=gN7KnQ1MDdqSSaVJOIKY-lBEwEE8s0bRMoVLrZGYgtA,1214
- deriva_ml-1.16.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- deriva_ml-1.16.0.dist-info/entry_points.txt,sha256=XsHSbfp7S1cKMjHoPUdFIaFcp9lHXHS6CV1zb_MEXkg,463
- deriva_ml-1.16.0.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
- deriva_ml-1.16.0.dist-info/RECORD,,
+ deriva_ml-1.17.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ deriva_ml-1.17.0.dist-info/METADATA,sha256=gvz8ApFj8xylH1r4Nr-X_QiHChj6wRNJE7pLzI2sB8E,1272
+ deriva_ml-1.17.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ deriva_ml-1.17.0.dist-info/entry_points.txt,sha256=XsHSbfp7S1cKMjHoPUdFIaFcp9lHXHS6CV1zb_MEXkg,463
+ deriva_ml-1.17.0.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+ deriva_ml-1.17.0.dist-info/RECORD,,
deriva_ml/model/sql_mapper.py DELETED
@@ -1,44 +0,0 @@
- from datetime import datetime, timezone
- from typing import TYPE_CHECKING, Any, Sequence
-
- if TYPE_CHECKING:
-     from deriva_ml.model.database import DatabaseModel
-
- try:
-     from icecream import ic
- except ImportError:  # Graceful fallback if IceCream isn't installed.
-     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
-
-
- class SQLMapper:
-     def __init__(self, database: "DatabaseModel", table: str) -> None:
-         table_name = database.normalize_table_name(table)
-         schema, table = table_name.split(":")
-
-         with database.dbase as dbase:
-             self.col_names = [c[1] for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()]
-
-         self.boolean_columns = [
-             self.col_names.index(c.name)
-             for c in database.model.schemas[schema].tables[table].columns
-             if c.type.typename == "boolean"
-         ]
-         self.time_columns = [
-             self.col_names.index(c.name)
-             for c in database.model.schemas[schema].tables[table].columns
-             if c.type.typename in ["ermrest_rct", "ermrest_rmt"]
-         ]
-
-     def _map_value(self, idx: int, v: Any) -> Any:
-         """
-         Return a new value for column index `idx`, converting boolean and timestamp values.
-         """
-         tf_map = {"t": True, "f": False}
-         if idx in self.boolean_columns:
-             return tf_map.get(v, v)
-         if idx in self.time_columns:
-             return datetime.strptime(v, "%Y-%m-%d %H:%M:%S.%f+00").replace(tzinfo=timezone.utc).isoformat()
-         return v
-
-     def transform_tuple(self, data: Sequence[Any]) -> Any:
-         return dict(zip(self.col_names, tuple(self._map_value(i, v) for i, v in enumerate(data))))
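For reference, the deleted SQLMapper converted raw SQLite result tuples into dicts, mapping SQLite's "t"/"f" boolean encoding to Python booleans and ERMrest timestamp strings to ISO-8601. A tiny sketch of that conversion on hypothetical values (not part of the package's API):

    from datetime import datetime, timezone

    tf_map = {"t": True, "f": False}
    raw_bool = "t"
    raw_time = "2024-05-01 12:34:56.789+00"

    # Booleans: map "t"/"f", pass anything else through unchanged.
    mapped_bool = tf_map.get(raw_bool, raw_bool)

    # Timestamps: parse the SQLite string, pin it to UTC, and emit ISO-8601.
    mapped_time = (
        datetime.strptime(raw_time, "%Y-%m-%d %H:%M:%S.%f+00")
        .replace(tzinfo=timezone.utc)
        .isoformat()
    )

    print(mapped_bool, mapped_time)  # True 2024-05-01T12:34:56.789000+00:00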