deriva-ml 1.16.0__py3-none-any.whl → 1.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/test.py ADDED
@@ -0,0 +1,94 @@
+ from typing import Any, Type
+ from deriva_ml import RID, DerivaMLException
+ from sqlalchemy import UniqueConstraint, inspect
+ from collections import defaultdict
+ from graphlib import CycleError, TopologicalSorter
+
+ def _prepare_wide_table(self, dataset, dataset_rid: RID, include_tables: list[str]) -> tuple:
+     """
+     Generate the details of a wide (denormalized) table from the model.
+
+     Args:
+         include_tables (list[str] | None): List of table names to include in the denormalized dataset. If None,
+             all tables from the dataset will be included.
+
+     Returns:
+         tuple: Join tables, join conditions, denormalized columns, dataset RIDs, and dataset element tables.
+     """
+
+     # Skip over tables that we don't want to include in the denormalized dataset.
+     # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the denormalized
+     # table.
+     include_tables = set(include_tables)
+     for t in include_tables:
+         # Check to make sure the table is in the catalog.
+         _ = self.name_to_table(t)
+
+     table_paths = [
+         path
+         for path in self._schema_to_paths()
+         if path[-1].name in include_tables and include_tables.intersection({p.name for p in path})
+     ]
+     paths_by_element = defaultdict(list)
+     for p in table_paths:
+         paths_by_element[p[2].name].append(p)
+
+     # Get the names of all of the tables that can be dataset elements.
+     dataset_element_tables = {e.name for e in self.list_dataset_element_types() if e.schema.name == self.domain_schema}
+
+     skip_columns = {"RCT", "RMT", "RCB", "RMB"}
+     join_conditions = {}
+     join_tables = {}
+     for element_table, paths in paths_by_element.items():
+         graph = {}
+         for path in paths:
+             for left, right in zip(path[0:], path[1:]):
+                 graph.setdefault(left.name, set()).add(right.name)
+
+         # Now let's remove any cycles that we may have in the graph.
+         # We will use a topological sort to find the order in which we need to join the tables.
+         # If we find a cycle, we will remove the table from the graph and splice in an additional ON clause.
+         # We will then repeat the process until there are no cycles.
+         graph_has_cycles = True
+         element_join_tables = []
+         element_join_conditions = {}
+         while graph_has_cycles:
+             try:
+                 ts = TopologicalSorter(graph)
+                 element_join_tables = list(reversed(list(ts.static_order())))
+                 graph_has_cycles = False
+             except CycleError as e:
+                 cycle_nodes = e.args[1]
+                 if len(cycle_nodes) > 3:
+                     raise DerivaMLException(f"Unexpected cycle found when normalizing dataset {cycle_nodes}")
+                 # Remove cycle from graph and splice in additional ON constraint.
+                 graph[cycle_nodes[1]].remove(cycle_nodes[0])
+
+         # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+         if "Dataset_Version" in element_join_tables:
+             element_join_tables.remove("Dataset_Version")
+
+         for path in paths:
+             for left, right in zip(path[0:], path[1:]):
+                 if right.name == "Dataset_Version":
+                     # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+                     continue
+                 if element_join_tables.index(right.name) < element_join_tables.index(left.name):
+                     continue
+                 table_relationship = self._table_relationship(left, right)
+                 element_join_conditions.setdefault(right.name, set()).add((table_relationship[0], table_relationship[1]))
+         join_tables[element_table] = element_join_tables
+         join_conditions[element_table] = element_join_conditions
+     # Get the list of columns that will appear in the final denormalized dataset.
+     denormalized_columns = [
+         (table_name, c.name)
+         for table_name in join_tables
+         if not self.is_association(table_name)  # Don't include association columns in the denormalized view.
+         for c in self.name_to_table(table_name).columns
+         if (not include_tables or table_name in include_tables) and (c.name not in skip_columns)
+     ]
+
+     # List of dataset ids to include in the denormalized view.
+     dataset_rids = [dataset_rid] + dataset.list_dataset_children(recurse=True)
+     return join_tables, join_conditions, denormalized_columns, dataset_rids, dataset_element_tables
+
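The cycle handling in the new _prepare_wide_table leans on graphlib.TopologicalSorter: it repeatedly asks for a static order and, when a CycleError is raised, drops one edge of the reported cycle and tries again. A minimal standalone sketch of that pattern, using made-up table names that are not part of the package:

    from graphlib import CycleError, TopologicalSorter

    # Hypothetical join graph: each key maps to the tables it must be joined after.
    graph = {
        "Image": {"Subject"},
        "Subject": {"Image"},  # deliberate two-node cycle for illustration
        "Diagnosis": {"Subject"},
    }

    order = None
    while order is None:
        try:
            # static_order() raises CycleError while the graph still contains a cycle.
            order = list(reversed(list(TopologicalSorter(graph).static_order())))
        except CycleError as exc:
            # The offending nodes are reported in exc.args[1]; drop one edge of the
            # two-node cycle and retry, mirroring the loop in _prepare_wide_table above.
            cycle = exc.args[1]
            graph[cycle[1]].remove(cycle[0])

    print(order)  # e.g. ['Diagnosis', 'Subject', 'Image']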
{deriva_ml-1.16.0.dist-info → deriva_ml-1.17.0.dist-info}/METADATA RENAMED
@@ -1,9 +1,9 @@
  Metadata-Version: 2.4
  Name: deriva-ml
- Version: 1.16.0
+ Version: 1.17.0
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
  Author-email: ISRD <isrd-dev@isi.edu>
- Requires-Python: >=3.10
+ Requires-Python: >=3.11
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: bump-my-version
@@ -12,16 +12,18 @@ Requires-Dist: deriva~=1.7.10
  Requires-Dist: deepdiff
  Requires-Dist: nbconvert
  Requires-Dist: pandas
- Requires-Dist: regex~=2024.7.24
+ Requires-Dist: pip-system-certs
  Requires-Dist: pydantic>=2.11
- Requires-Dist: semver>3.0.0
- Requires-Dist: setuptools>=64
- Requires-Dist: setuptools-scm>=8.0
- Requires-Dist: nbstripout
  Requires-Dist: papermill
  Requires-Dist: pandas-stubs==2.2.3.250527
  Requires-Dist: pyyaml
+ Requires-Dist: regex~=2024.7.24
+ Requires-Dist: semver>3.0.0
+ Requires-Dist: setuptools>=80
+ Requires-Dist: setuptools-scm>=8.0
+ Requires-Dist: nbstripout
  Requires-Dist: hydra_zen
+ Requires-Dist: SQLAlchemy
  Dynamic: license-file

  # DerivaML
{deriva_ml-1.16.0.dist-info → deriva_ml-1.17.0.dist-info}/RECORD RENAMED
@@ -1,11 +1,13 @@
- deriva_ml/__init__.py,sha256=Yt8q0WbLFt7fbRLZe_f0bJWy1Qo6vidQzlYWQoT8U7o,2097
+ deriva_ml/.DS_Store,sha256=gb-f5IXVed_gS5Be1Z6WxCYjrI_r5SdblvfFpIOY4ro,8196
+ deriva_ml/__init__.py,sha256=YCG7P4PUtO_b-aIIYb4KhKHcfnb8Wz_YeAL-c0HiQlA,1775
  deriva_ml/bump_version.py,sha256=eN2G5G_OeiuFxhOdjjwfxD8Rmv6dFvzIm0y_1x4Mif4,4020
- deriva_ml/demo_catalog.py,sha256=6hlSVGNQ364chisKvSyMy2BBxzhQq1mLPPlW324eca4,14931
+ deriva_ml/demo_catalog.py,sha256=FfXPlDfzy29K9g2Fr_KmYyRhmxP2eSaqm8_Xcji8fUM,15352
  deriva_ml/feature.py,sha256=6-aphkxdKjWa9oPSGFWxHcwAc_8hmWj-7I4M178YG5Y,8470
  deriva_ml/install_kernel.py,sha256=b62XY0SLViYO_Zye5r1Pl9qhYZyu_fk4KAO8NS1pxgM,2165
  deriva_ml/run_notebook.py,sha256=_pds1q3WcfWqhCBqKeznbwSv5n7OND8FkL6JQ2Jkfmc,8093
+ deriva_ml/test.py,sha256=BqmQXR9IyQP9h8pWttk0dzyJod2CwcfYbSUZS-Q5r4k,4460
  deriva_ml/core/__init__.py,sha256=Ko8GsWc7K_eDFW0-GaNS6gOWYP8cWHWir-ChSQaHntE,856
- deriva_ml/core/base.py,sha256=xsz1h5QZVE7PCVZiCt7lRV43Dupq9c7elUsbGk3QHJQ,61919
+ deriva_ml/core/base.py,sha256=KzZW310J0YmvCUhuCWxd42LNCM_JSzR__ObtT7zgcsU,62525
  deriva_ml/core/config.py,sha256=dF4rOLFmbk1DEkQimqbiH4pC519nRZWpwKItARNMiZ4,2244
  deriva_ml/core/constants.py,sha256=6wBJ8qMxe-dbCjRGrjUIX-RK0mTWrLDTeUpaVbLFoM8,888
  deriva_ml/core/definitions.py,sha256=uq_8uYFBVBVHS691Ri2kdQsN37z0GNYTaZskJIb_ocM,1385
@@ -13,21 +15,20 @@ deriva_ml/core/enums.py,sha256=sSN4B4OynbB-AXwxRszoFr-KWIWIAfhVa06EzAEHwVc,7194
  deriva_ml/core/ermrest.py,sha256=N0IJ3TE87jElaBChEIo5AFDTr0SIrb6F90yiimRfPr4,10182
  deriva_ml/core/exceptions.py,sha256=4MZNPOyN-UMaGeY9sqJDVwh_iOmz1ntp4usSyCNqVMg,934
  deriva_ml/core/filespec.py,sha256=BQAAcRXfXq1lDcsKlokLOOXCBtEZpPgXxrFOIZYAgLg,4229
- deriva_ml/dataset/__init__.py,sha256=tV3yK9tb8iB9f5P3ml459bP2uPWJhCJcplhmbGVtoMI,411
- deriva_ml/dataset/aux_classes.py,sha256=K-cVBrZY1j0ZO__FORHRVdVz3O69OgvhO5YkhwJJyxE,7348
- deriva_ml/dataset/dataset.py,sha256=c6hGsIH9UOn8ayDP7EsYzqgKeZm2Kr7naliPLQxGtSg,64473
- deriva_ml/dataset/dataset_bag.py,sha256=peFEMU8PfExbzJ0VJGIL3QDIPz0stmUR7daCXptA3f4,20256
+ deriva_ml/dataset/__init__.py,sha256=wTCQaWGfRYEiUoOOxerKSpkbl1T5YFhoCyemlxGTk8k,283
+ deriva_ml/dataset/aux_classes.py,sha256=ojqe7gyK4KQVz_xfIillXS_HJ1PMsyr47pb2tFOXO_c,7855
+ deriva_ml/dataset/dataset.py,sha256=d860WuCL0-Pz6TyRpGVzhpPWDMco01-I5LT4dZjYxsQ,64728
+ deriva_ml/dataset/dataset_bag.py,sha256=ori3BuYVqfeHkVCjNSKuZh7oMdC6uufsszicpTPODiw,19944
  deriva_ml/dataset/history.py,sha256=FK5AYYz11p4E4FWMVg4r7UPWOD4eobrq3b3xMjWF59g,3197
- deriva_ml/dataset/upload.py,sha256=Q9bNVv6xTK_IpwFOU_ugq33IWRs0AWyFoF8Rzwi6OVs,16430
- deriva_ml/execution/__init__.py,sha256=Zs-ZNmwrJJW6suJilzh3vdcPvzI8HIA0Ym0VUwuiQME,668
+ deriva_ml/dataset/upload.py,sha256=n1aXSbOx1hghCDxuF8yf03jZmOLMueXL-rSnQMrfHq0,16535
+ deriva_ml/execution/__init__.py,sha256=5kKpPwQbxhmRn7Npz7DpavuCxYwCQaDdl4-6z62hbds,705
  deriva_ml/execution/environment.py,sha256=B7nywqxFTRUWgyu8n7rFoKcVC9on422kjeFG2FPQfvg,9302
- deriva_ml/execution/execution.py,sha256=X4HBADT_F5ZuER8qBcnNYqRUuMU3BaEV7rMgXEUrLCg,46096
- deriva_ml/execution/execution_configuration.py,sha256=oWgBueuFO0-PBm9LM08EQeFeY9IXF8tVbd3LyRsTiNw,5437
- deriva_ml/execution/workflow.py,sha256=7CwPrgs3FKQHiEVus0PpK9w5hVKLKZnCrlu_nT8GFe8,13604
+ deriva_ml/execution/execution.py,sha256=hNBfYnqXK4PmNS2wxbJ5oYzjDszjaiGHo8d3uxmIgPk,46210
+ deriva_ml/execution/execution_configuration.py,sha256=RT0x9n0uyJgEsrLCUTu16nAUJN7X-XLDvfezln0PTDQ,5775
+ deriva_ml/execution/workflow.py,sha256=rTlspICp2Q6prUwPCeukjhO64rbcJivcFs4zH60B16U,13906
  deriva_ml/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deriva_ml/model/catalog.py,sha256=TY6QdlhZX7OL5bhWcGkAFpZNaZye5l_rkb1Cih-bTjs,19180
- deriva_ml/model/database.py,sha256=KEPJKIlmIxTiF4Th1NgpuuuMBhbfsgsd_k8UHs-hMg4,14843
- deriva_ml/model/sql_mapper.py,sha256=_0QsJEVSgSPtxrWKSgjfPZCQ1aMVcjR_Tk2OxLhWEvY,1696
+ deriva_ml/model/catalog.py,sha256=O6_Ll4Uxg6DyxoBXT9P9CPTt9jx1guVTeX1L3KW1A5c,19645
+ deriva_ml/model/database.py,sha256=BG5FSisl9tWTBnf5k9dNnijOIDyCUDeRhN_inkmIqTw,31132
  deriva_ml/protocols/dataset.py,sha256=1TyaT--89Elcs-nCvVyJxUj4cDaLztZOuSOzzj1cBMk,699
  deriva_ml/schema/__init__.py,sha256=yV-MfzCF3FA4OOz7mZwMM2q6-x1vgOJ057kUvikFF6E,130
  deriva_ml/schema/annotations.py,sha256=CMcRqYUlyW8iLCYp6sYJsncaRNtp4kFKoxcg-i-t-50,18302
@@ -36,9 +37,9 @@ deriva_ml/schema/create_schema.py,sha256=9qK9_8SRQT-DwcEwTGSkhi3j2NaoH5EVgthvV2k
  deriva_ml/schema/deriva-ml-reference.json,sha256=AEOMIgwKO3dNMMWHb0lxaXyamvfAEbUPh8qw0aAtsUQ,242460
  deriva_ml/schema/policy.json,sha256=5ykB8nnZFl-oCHzlAwppCFKJHWJFIkYognUMVEanfY8,1826
  deriva_ml/schema/table_comments_utils.py,sha256=4flCqnZAaqg_uSZ9I18pNUWAZoLfmMCXbmI5uERY5vM,2007
- deriva_ml-1.16.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- deriva_ml-1.16.0.dist-info/METADATA,sha256=gN7KnQ1MDdqSSaVJOIKY-lBEwEE8s0bRMoVLrZGYgtA,1214
- deriva_ml-1.16.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- deriva_ml-1.16.0.dist-info/entry_points.txt,sha256=XsHSbfp7S1cKMjHoPUdFIaFcp9lHXHS6CV1zb_MEXkg,463
- deriva_ml-1.16.0.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
- deriva_ml-1.16.0.dist-info/RECORD,,
+ deriva_ml-1.17.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ deriva_ml-1.17.0.dist-info/METADATA,sha256=gvz8ApFj8xylH1r4Nr-X_QiHChj6wRNJE7pLzI2sB8E,1272
+ deriva_ml-1.17.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ deriva_ml-1.17.0.dist-info/entry_points.txt,sha256=XsHSbfp7S1cKMjHoPUdFIaFcp9lHXHS6CV1zb_MEXkg,463
+ deriva_ml-1.17.0.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+ deriva_ml-1.17.0.dist-info/RECORD,,
deriva_ml/model/sql_mapper.py DELETED
@@ -1,44 +0,0 @@
- from datetime import datetime, timezone
- from typing import TYPE_CHECKING, Any, Sequence
-
- if TYPE_CHECKING:
-     from deriva_ml.model.database import DatabaseModel
-
- try:
-     from icecream import ic
- except ImportError:  # Graceful fallback if IceCream isn't installed.
-     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
-
-
- class SQLMapper:
-     def __init__(self, database: "DatabaseModel", table: str) -> None:
-         table_name = database.normalize_table_name(table)
-         schema, table = table_name.split(":")
-
-         with database.dbase as dbase:
-             self.col_names = [c[1] for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()]
-
-         self.boolean_columns = [
-             self.col_names.index(c.name)
-             for c in database.model.schemas[schema].tables[table].columns
-             if c.type.typename == "boolean"
-         ]
-         self.time_columns = [
-             self.col_names.index(c.name)
-             for c in database.model.schemas[schema].tables[table].columns
-             if c.type.typename in ["ermrest_rct", "ermrest_rmt"]
-         ]
-
-     def _map_value(self, idx: int, v: Any) -> Any:
-         """
-         Return a new value for column index `idx`, converting boolean and timestamp values.
-         """
-         tf_map = {"t": True, "f": False}
-         if idx in self.boolean_columns:
-             return tf_map.get(v, v)
-         if idx in self.time_columns:
-             return datetime.strptime(v, "%Y-%m-%d %H:%M:%S.%f+00").replace(tzinfo=timezone.utc).isoformat()
-         return v
-
-     def transform_tuple(self, data: Sequence[Any]) -> Any:
-         return dict(zip(self.col_names, tuple(self._map_value(i, v) for i, v in enumerate(data))))
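For reference, the deleted SQLMapper converted raw SQLite result tuples into dicts, mapping SQLite's "t"/"f" boolean encoding to Python booleans and ERMrest timestamp strings to ISO-8601. A tiny sketch of that conversion on hypothetical values (not part of the package's API):

    from datetime import datetime, timezone

    tf_map = {"t": True, "f": False}
    raw_bool = "t"
    raw_time = "2024-05-01 12:34:56.789+00"

    # Booleans: map "t"/"f", pass anything else through unchanged.
    mapped_bool = tf_map.get(raw_bool, raw_bool)

    # Timestamps: parse the SQLite string, pin it to UTC, and emit ISO-8601.
    mapped_time = (
        datetime.strptime(raw_time, "%Y-%m-%d %H:%M:%S.%f+00")
        .replace(tzinfo=timezone.utc)
        .isoformat()
    )

    print(mapped_bool, mapped_time)  # True 2024-05-01T12:34:56.789000+00:00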