PyPI - dsi-workflow - Versions diffs - 1.2__tar.gz → 1.2.2__tar.gz - Mend

dsi-workflow 1.2__tar.gz → 1.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

{dsi_workflow-1.2 → dsi_workflow-1.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dsi-workflow
-Version: 1.2
+Version: 1.2.2
 Summary: A Data Science Infrastructure Project
 Author-email: Jesus Pulido <pulido@lanl.gov>, James Ahrens <ahrens@lanl.gov>, Divya Banesh <dbanesh@lanl.gov>, Hugh Greenberg <hng@lanl.gov>, Pascal Grosset <pascalgrosset@lanl.gov>, Vedant Iyer <iyer@lanl.gov>, Benjamin Sims <bsims@lanl.gov>, Terece Turton <tlturton@lanl.gov>
 License-Expression: BSD-3-Clause

dsi_workflow-1.2.2/dsi/_version.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "1.2.2"

{dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/backend.py RENAMED Viewed

@@ -5,6 +5,7 @@ class Backend(ABC):
     def __init__(self, data_source, **kwargs) -> None:
         pass
+    # Can raise NotImplementedError for a read-only backend
     @abstractmethod
     def ingest_artifacts(self, artifacts, **kwargs) -> None:
         pass
@@ -13,6 +14,14 @@ class Backend(ABC):
     def query_artifacts(self, query, **kwargs):
         pass
+    @abstractmethod
+    def get_table(self, table_name, **kwargs):
+        pass
+    @abstractmethod
+    def get_table_names(self,query):
+        pass
     @abstractmethod
     def notebook(self, **kwargs):
         pass
@@ -21,6 +30,10 @@ class Backend(ABC):
     def process_artifacts(self, **kwargs):
         pass
+    @abstractmethod
+    def get_schema(self):
+        pass
     @abstractmethod
     def find(self, query_object, **kwargs):
         pass
@@ -45,6 +58,10 @@ class Backend(ABC):
     def list(self, **kwargs):
         pass
+    @abstractmethod
+    def num_tables(self):
+        pass
     @abstractmethod
     def display(self, table_name, **kwargs):
         pass

{dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/duckdb.py RENAMED Viewed

@@ -42,6 +42,7 @@ class DuckDB(Filesystem):
     DuckDB Filesystem Backend to which a user can ingest/process data, generate a Jupyter notebook, and find occurrences of a search term
     """
     runTable = False
+    read_only = False
     def __init__(self, filename, **kwargs):
         """
@@ -65,7 +66,7 @@ class DuckDB(Filesystem):
         `input_list` : list
             A list of values to analyze for type compatibility.
-        `return`: str
+        Return: str
             A string representing the inferred DuckDB data type for the input list.
         """
         DUCKDB_BIGINT_MIN = -9223372036854775808
@@ -281,7 +282,7 @@ class DuckDB(Filesystem):
             else:
                 table_order = list(reversed(ordered_tables)) # ingest primary key tables first then children
-        if self.runTable:
+        if self.runTable and artifacts:
             runTable_create = "CREATE TABLE IF NOT EXISTS runTable (run_id INTEGER PRIMARY KEY, run_timestamp TEXT UNIQUE);"
             self.cur.execute(runTable_create)
@@ -328,27 +329,29 @@ class DuckDB(Filesystem):
             self.ingest_table_helper(types, foreign_query)
-            col_names = ', '.join(types.properties.keys())
-            placeholders = ', '.join('?' * len(types.properties))
-            str_query = "INSERT INTO "
-            if self.runTable:
-                run_id = self.cur.execute("SELECT run_id FROM runTable ORDER BY run_id DESC LIMIT 1;").fetchone()[0]
-                str_query += "{} (run_id, {}) VALUES ({}, {});".format(str(types.name), col_names, run_id, placeholders)
-            else:
-                str_query += "{} ({}) VALUES ({});".format(str(types.name), col_names, placeholders)
-            if isVerbose:
-                print(str_query)
-            rows = zip(*types.properties.values())
-            try:
-                self.cur.executemany(str_query,rows)
-            except duckdb.Error as e:
-                self.cur.execute("ROLLBACK")
-                self.cur.execute("CHECKPOINT")
-                raise duckdb.Error(e)
+            # TODO: move this check to schema reader by allowing users to just create table without data
+            if not all(v == [""] for v in tableData.values()): # if table is just one row of empty strings, don't insert
+                col_names = ', '.join(types.properties.keys())
+                placeholders = ', '.join('?' * len(types.properties))
+                str_query = "INSERT INTO "
+                if self.runTable:
+                    run_id = self.cur.execute("SELECT run_id FROM runTable ORDER BY run_id DESC LIMIT 1;").fetchone()[0]
+                    str_query += "{} (run_id, {}) VALUES ({}, {});".format(str(types.name), col_names, run_id, placeholders)
+                else:
+                    str_query += "{} ({}) VALUES ({});".format(str(types.name), col_names, placeholders)
+                if isVerbose:
+                    print(str_query)
-            self.types = types #This will only copy the last table from artifacts (collections input)
+                rows = zip(*types.properties.values())
+                try:
+                    self.cur.executemany(str_query,rows)
+                except duckdb.Error as e:
+                    self.cur.execute("ROLLBACK")
+                    self.cur.execute("CHECKPOINT")
+                    raise duckdb.Error(e)
+            self.types = types # This will only copy the last table from artifacts (collections input)
         if "dsi_units" in artifacts.keys():
             create_query = "CREATE TABLE IF NOT EXISTS dsi_units (table_name TEXT, column_name TEXT, unit TEXT)"
@@ -378,9 +381,13 @@ class DuckDB(Filesystem):
             raise duckdb.Error(e)
-    def query_artifacts(self, query, isVerbose=False, dict_return = False):
+    def query_artifacts(self, query, isVerbose=False, dict_return = False, **kwargs):
         """
-        Executes a SQL query on the DuckDB backend and returns the result in the specified format dependent on `dict_return`
+        Executes a SQL query on the DuckDB backend.
+        Supports:
+        - SELECT / PRAGMA: returns DataFrame or OrderedDict depending on dict_return
+        - UPDATE / ALTER: executes command and returns None
         `query` : str
             Must be a SELECT, PRAGMA, UPDATE, or ALTER SQL query. Aggregate functions like COUNT are allowed.
@@ -393,12 +400,14 @@ class DuckDB(Filesystem):
             If True, returns the result as an OrderedDict.
             If False, returns the result as a pandas DataFrame.
-        `return` : pandas.DataFrame or OrderedDict
+        Return : pandas.DataFrame or OrderedDict or None
+            - If `query` includes UPDATE or ALTER: returns nothing
             - If `dict_return` is False: returns a DataFrame
             - If `dict_return` is True: returns an OrderedDict
         """
         data = None
-        if query[:6].lower() == "select" or query[:6].lower() == "pragma":
+        command = query.strip().split(None, 1)[0].lower()
+        if command in {"select", "pragma"}:
             try:
                 data = self.cur.execute(query).fetch_df()
                 if isVerbose:
@@ -412,19 +421,23 @@ class DuckDB(Filesystem):
                         return OrderedDict()
                     return pd.DataFrame()
                 raise
-        elif "filesystem" in query.lower() and "drop" in query.lower(): #remove filesystem passthrough in future
+        elif command in {"update", "alter"}:
+            query_params = kwargs.pop("params", ())
             try:
-                self.con.execute(query)
-                self.con.commit()
-            except Exception as e:
-                message = str(e)
-                if "Table" in message and "does not exist" in message:
-                    table_name = message[message.find("Table"):message.find("Did you mean")-2]
-                    print(f"WARNING: {table_name} in this database")
-                    return
+                self.cur.execute("BEGIN TRANSACTION")
+                self.cur.execute(query, query_params)
+                self.cur.execute("COMMIT")
+                self.cur.execute("FORCE CHECKPOINT")
+                return None
+            except duckdb.Error:
+                try:
+                    self.cur.execute("ROLLBACK")
+                    self.cur.execute("FORCE CHECKPOINT")
+                except duckdb.Error:
+                    pass
                 raise
         else:
-            raise RuntimeError("Can only run SELECT or PRAGMA queries on the data")
+            raise RuntimeError("Can only run SELECT, PRAGMA, UPDATE, or ALTER queries on the data")
         if dict_return:
             tables = self.get_table_names(query)
@@ -448,7 +461,7 @@ class DuckDB(Filesystem):
             If True, returns the result as an OrderedDict.
             If False, returns the result as a pandas DataFrame.
-        `return` : pandas.DataFrame or OrderedDict
+        Return : pandas.DataFrame or OrderedDict
             - If `dict_return` is False: returns a DataFrame
             - If `dict_return` is True: returns an OrderedDict
         """
@@ -461,7 +474,7 @@ class DuckDB(Filesystem):
         `query` : str
             A SQL query string, typically passed into `query_artifacts()`.
-        `return`: list of str
+        Return: list of str
             List of table names referenced in the query.
         """
         all_names = re.findall(r'FROM\s+["\']?([\w\-]+)["\']?|JOIN\s+["\']?([\w\-]+)["\']?', query, re.IGNORECASE)
@@ -472,7 +485,7 @@ class DuckDB(Filesystem):
         """
         Returns the structural schema of this database in the form of CREATE TABLE statements.
-        `return`: str
+        Return: str
             Each table's CREATE TABLE statement is concatenated into one large string.
         """
         schema_stmts = self.query_artifacts(query="SELECT sql FROM duckdb_tables where sql NOT NULL ")
@@ -493,7 +506,7 @@ class DuckDB(Filesystem):
         `only_units_relations` : bool, default=False
             **USERS SHOULD IGNORE THIS FLAG.** Used internally by duckdb.py.
-        `return` : OrderedDict
+        Return : OrderedDict
             A nested OrderedDict containing all data from the DuckDB database.
         """
         artifact = OrderedDict()
@@ -551,7 +564,7 @@ class DuckDB(Filesystem):
         `query_object` : int, float, or str
             The value to search for across all tables in the backend.
-        `return` : list
+        Return : list
             A list of ValueObjects representing matches.
         - Note: ValueObjects may vary in structure depending on whether the match occurred at the table, column, or cell level.
@@ -579,7 +592,7 @@ class DuckDB(Filesystem):
         `query_object` : str
             The string to search for in table names.
-        `return` : list of ValueObjects
+        Return : list of ValueObjects
             One ValueObject per matching table.
         ValueObject Structure:
@@ -625,7 +638,7 @@ class DuckDB(Filesystem):
             If True, `value` in the returned ValueObject will be the [min, max] of the matching numerical column.
             If False, `value` in the returned ValueObject will be the full list of column data.
-        `return` : List of ValueObjects if there is a match.
+        Return : List of ValueObjects if there is a match.
         ValueObject Structure:
             - t_name:   table name (str)
@@ -683,10 +696,10 @@ class DuckDB(Filesystem):
             The value to search for at the cell level, across all tables in the backend.
         `row`: bool, optional, default=False
-            If True, `value` in the returned ValueObject will be the entire row where a cell matched.
-            If False, `value` in the returned ValueObject will only be the matching cell value.
+            If True, certain fields in ValueObject will contain entire row's metadata/data
+            If False, certain fields in ValueObject will only contain the matching cell's metadata/data.
-        `return` : List of ValueObjects if there is a match.
+        Return : List of ValueObjects if there is a match.
         ValueObject Structure:
             - t_name:   table name (str)
@@ -768,7 +781,7 @@ class DuckDB(Filesystem):
         `relation` : str
             The operator and value to apply to the column. Ex: >4, <4, =4, >=4, <=4, ==4, !=4, (4,5), ~4, ~~4
-        `return` : list of ValueObjects
+        Return : list of ValueObjects
             One ValueObject per matching row in that first table.
         ValueObject Structure:
@@ -918,9 +931,10 @@ class DuckDB(Filesystem):
         Returns numerical metadata from tables in the first activated backend.
         `table_name` : str, optional
-            If specified, only the numerical metadata for that table will be returned as a Pandas DataFrame.
+            If specified, only the numerical metadata for that table is returned as a Pandas DataFrame.
-            If None (default), metadata for all available tables is returned as a list of Pandas DataFrames.
+            If None (default), names of all tables and metadata for each table is returned as a list.
+            [table_name_list, table1_df, table2_df, table3df ...]
         """
         if table_name is None:
             tableList = self.cur.execute("""
@@ -1088,7 +1102,7 @@ class DuckDB(Filesystem):
         `relation_dict` : OrderedDict
             An OrderedDict describing table relationships. Structured as the `dsi_relations` object with primary and foreign keys.
-        `return`: tuple of (has_cycle, ordered_tables)
+        Return: tuple of (has_cycle, ordered_tables)
             - has_cycle (bool): True if a circular dependency is detected.
             - ordered_tables (list or None): Ordered list of tables if no cycle is found; None if a circular dependency exists.
         """
@@ -1147,6 +1161,6 @@ class DuckDB(Filesystem):
         """
         Closes the DuckDB database's connection.
-        `return`: None
+        Return: None
         """
         self.con.close()

{dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/filesystem.py RENAMED Viewed

@@ -6,6 +6,7 @@ class Filesystem(Backend, ABC):
     def __init__(self, filename, **kwargs) -> None:
         pass
+    # Can raise NotImplementedError for a read-only backend
     @abstractmethod
     def ingest_artifacts(self, artifacts, **kwargs) -> None:
         pass
@@ -18,6 +19,10 @@ class Filesystem(Backend, ABC):
     def get_table(self, table_name, **kwargs):
         pass
+    @abstractmethod
+    def get_table_names(self,query):
+        pass
     @abstractmethod
     def notebook(self, **kwargs):
         pass
@@ -25,6 +30,10 @@ class Filesystem(Backend, ABC):
     @abstractmethod
     def process_artifacts(self, **kwargs):
         pass
+    @abstractmethod
+    def get_schema(self):
+        pass
     @abstractmethod
     def find(self, query_object, **kwargs):
@@ -62,6 +71,7 @@ class Filesystem(Backend, ABC):
     def summary(self, table_name, **kwargs):
         pass
+    # Can raise NotImplementedError for a read-only backend
     @abstractmethod
     def overwrite_table(self, table_name, collection, **kwargs):
         pass

dsi_workflow-1.2.2/dsi/backends/gufi.py ADDED Viewed

@@ -0,0 +1,160 @@
+import sqlite3
+# Holds table name and data properties
+from dsi.backends.filesystem import Filesystem
+class DataType:
+    # Simple holder for one table: its name, its data properties, and its units.
+    # Table name; "DEFAULT" is the placeholder value until one is assigned.
+    name = "DEFAULT"
+    # NOTE(review): `properties` and `units` are class-level dicts, so they are shared by every
+    # instance that does not assign its own dict - confirm callers always rebind them.
+    properties = {}
+    units = {}
+class Gufi(Filesystem):
+    '''
+    GUFI Datastore
+    Read-only backend: `query_artifacts` joins GUFI index metadata (exposed through the `gufi_vt`
+    SQLite extension) with a DSI database that is ATTACHed to an in-memory SQLite connection.
+    Every other backend operation raises NotImplementedError.
+    '''
+    gufi_prefix = ""
+    gufi_index = ""
+    dsi_table_name = ""
+    dsi_columns = ""
+    gufi_columns = ""
+    collection_name = ""
+    dsi_db = None
+    isVerbose = False
+    def __init__(self, gufi_prefix, gufi_index, dsi_table_name, dsi_columns, gufi_columns,
+                 collection_name, dsi_db, verbose=False):
+        '''
+        `gufi_prefix`: the directory where GUFI is installed
+        `gufi_index`: the directory where GUFI's indexes are
+        `dsi_table_name`: the DSI table name that has the UUID for each file as a column
+        `dsi_columns`: the DSI table columns that should be included in the join with GUFI
+        `gufi_columns`: the GUFI columns that should be included in the join with DSI
+        `collection_name`: the name that identifies the collection that the DSI database belongs to
+        `dsi_db`: the path to the dsi db
+        `verbose`: print debugging statements or not
+        '''
+        self.gufi_prefix = gufi_prefix
+        self.gufi_index = gufi_index
+        self.dsi_table_name = dsi_table_name
+        self.dsi_columns = dsi_columns
+        self.gufi_columns = gufi_columns
+        self.collection_name = collection_name
+        self.dsi_db = dsi_db
+        self.isVerbose = verbose
+    # Query GUFI and DSI db
+    def query_artifacts(self, query, **kwargs):
+        '''
+        Retrieves GUFI's metadata joined with a dsi database
+        `query`: an sql query to run against the joined data. If None or empty, a default join of
+            the GUFI view with the DSI table on `uuid` is run instead.
+        `**kwargs`: accepted for compatibility with the abstract backend signature; ignored.
+        Return: list of result rows, or None if the query failed (the error is printed, not raised)
+        '''
+        try:
+            resout = self._run_gufi_query(query)
+        except Exception as e:
+            # Deliberately best-effort: report the failure (with its cause) instead of raising.
+            print(f"Error running GUFI query: {e}")
+            return None
+        if self.isVerbose:
+            print(resout)
+        return resout
+    def ingest_artifacts(self, query, **kwargs):
+        raise NotImplementedError("Cannot ingest data with the GUFI backend")
+    # Runs the gufi query command
+    def _run_gufi_query(self, sqlstring):
+        '''
+        Runs the sql query through the gufi_vt SQLite extension
+        `sqlstring`: the query to run; if None or empty, the default GUFI/DSI join on `uuid` is used
+        Return: list of rows fetched from the query
+        '''
+        # "fullpath" is not a stored GUFI column; when requested first it is computed with rpath().
+        if self.gufi_columns and self.gufi_columns[0] == "fullpath":
+            gufi_columns = ["rpath(sname, sroll, name) AS fullpath"] + list(self.gufi_columns[1:])
+        else:
+            gufi_columns = self.gufi_columns
+        gufi_column_names = ",".join(gufi_columns)
+        dsi_column_names = ",".join(self.dsi_columns)
+        # Fully qualified DSI table inside the attached database (previously hard-coded).
+        dsi_table = f"{self.collection_name}.{self.dsi_table_name}"
+        con = sqlite3.connect(":memory:")
+        try:
+            # `with con` only commits/rolls back; the explicit close() below releases the connection.
+            with con:
+                # Extension loading is enabled only for the single load_extension call.
+                con.enable_load_extension(True)
+                con.load_extension(self.gufi_prefix + "/lib/gufi_vt")
+                con.enable_load_extension(False)
+                con.execute(f"""
+                CREATE VIRTUAL TABLE uview USING gufi_vt(
+                    threads=64,
+                    E="SELECT {gufi_column_names}, dsi_uuid(xattr_name, xattr_value) AS uuid FROM vrxpentries WHERE uuid IS NOT NULL;",
+                    index='{self.gufi_index}',
+                    plugin='gufi_plugin_operations:{self.gufi_prefix}/lib/libdsi_querying.so'
+                );
+                """)
+                # Bind the file name so paths containing quotes cannot break the statement.
+                con.execute(f"ATTACH ? AS {self.collection_name};", (str(self.dsi_db),))
+                if not sqlstring:
+                    query = f"""
+                        SELECT uview.*, {dsi_column_names} FROM uview JOIN {dsi_table} ON uview.uuid == {dsi_table}.uuid;
+                    """
+                else:
+                    query = sqlstring
+                if self.isVerbose:
+                    print("query: ", query)
+                return con.execute(query).fetchall()
+        finally:
+            con.close()
+    # The stubs below accept any arguments so that calls made through the abstract backend
+    # interface raise the intended NotImplementedError rather than a TypeError.
+    def close(self, *args, **kwargs):
+        raise NotImplementedError("No connection to close for the GUFI backend")
+    def display(self, *args, **kwargs):
+        raise NotImplementedError("Cannot display data with the GUFI backend")
+    def find(self, *args, **kwargs):
+        raise NotImplementedError("Cannot find data with the GUFI backend")
+    def find_cell(self, *args, **kwargs):
+        raise NotImplementedError("Cannot find cell data with the GUFI backend")
+    def find_column(self, *args, **kwargs):
+        raise NotImplementedError("Cannot find column data with the GUFI backend")
+    def find_relation(self, *args, **kwargs):
+        raise NotImplementedError("Cannot find data on a relation with the GUFI backend")
+    def find_table(self, *args, **kwargs):
+        raise NotImplementedError("Cannot find table data with the GUFI backend")
+    def get_schema(self):
+        # No schema is available from this backend; returns None.
+        pass
+    def get_table(self, *args, **kwargs):
+        raise NotImplementedError("Cannot get table data with the GUFI backend")
+    def get_table_names(self, *args, **kwargs):
+        raise NotImplementedError("Cannot get table names with the GUFI backend")
+    def list(self, *args, **kwargs):
+        raise NotImplementedError("Cannot list tables with the GUFI backend")
+    def notebook(self, *args, **kwargs):
+        raise NotImplementedError("Cannot create notebook with the GUFI backend")
+    def num_tables(self, *args, **kwargs):
+        raise NotImplementedError("Cannot count tables with the GUFI backend")
+    def overwrite_table(self, *args, **kwargs):
+        raise NotImplementedError("Cannot overwrite table with the GUFI backend")
+    def process_artifacts(self, *args, **kwargs):
+        raise NotImplementedError("Cannot process artifacts with the GUFI backend")
+    def summary(self, *args, **kwargs):
+        raise NotImplementedError("Cannot summarize data with the GUFI backend")

dsi_workflow-1.2.2/dsi/backends/hpss.py ADDED Viewed

@@ -0,0 +1,168 @@
+import re
+import subprocess
+import os
+from dsi.backends.backend import Backend
+from collections import OrderedDict
+# HPSS backend class
+class HPSS(Backend):
+    """
+    Backend that stores files on HPSS by running the `hsi` command-line client.
+    Only `ingest_artifacts` and the put/get/hash helpers do real work; the remaining Backend
+    methods are placeholders that return None.
+    """
+    read_only = False
+    def __init__(self, hpss_files):
+        """
+        Initializes an HPSS backend
+        `hpss_files`: dict mapping each HPSS file path to the local path of the file stored there
+        For every entry `hsi hashlist` is run once; `hpss_hash` is None when no hash line is found.
+        """
+        self.hpss_info = OrderedDict()
+        for hpss_file, local_path in hpss_files.items():
+            stdout, stderr, _ = self.run_hsi("hashlist", [hpss_file])
+            self.hpss_info[hpss_file] = {
+                'local_path': local_path,
+                'hpss_hash': self.parse_hpss_hash(stdout, stderr),
+            }
+    def create_hpss_hash(self, hpss_file) -> "str | None":
+        """
+        Creates an HPSS hash
+        `hpss_file`: HPSS path passed to `hsi hashcreate`
+        Return: the parsed hash, or None if hsi failed (stderr is printed) or no hash line was found
+        """
+        stdout, stderr, returncode = self.run_hsi("hashcreate", [hpss_file])
+        if returncode != 0:
+            print(stderr)
+            return None
+        return self.parse_hpss_hash(stdout, stderr)
+    def put(self, local_file, hpss_dest) -> bool:
+        """
+        Puts a local file on HPSS
+        `local_file`: path of the local file; hsi is run from its directory with the bare file name
+        `hpss_dest`: NOTE(review): currently unused - the file is stored under its base name;
+            confirm whether it should be passed to hsi as the destination
+        Return: True if hsi exited with status 0 (a hash is then created), otherwise False
+        """
+        new_dir, file_to_put = os.path.split(local_file)
+        cwd = os.getcwd()
+        if new_dir:
+            os.chdir(new_dir)
+        try:
+            _, _, returncode = self.run_hsi("put", [file_to_put])
+        finally:
+            # Always restore the caller's working directory, even if hsi raised.
+            if new_dir:
+                os.chdir(cwd)
+        if returncode != 0:
+            return False
+        self.create_hpss_hash(file_to_put)
+        return True
+    def get(self, hpss_file, tmp_dir) -> bool:
+        """
+        Gets an HPSS file and puts it in the tmp_dir
+        `hpss_file`: HPSS path to fetch (a list of hsi arguments is also accepted)
+        `tmp_dir`: local directory hsi is run from
+        Return: True if hsi exited with status 0, otherwise False
+        """
+        cwd = os.getcwd()
+        try:
+            os.chdir(tmp_dir)
+        except (OSError, TypeError, ValueError):
+            print("Error changing to temp dir: %s" % tmp_dir)
+            return False
+        try:
+            _, _, returncode = self.run_hsi("get", hpss_file)
+        finally:
+            try:
+                os.chdir(cwd)
+            except OSError:
+                print("Error changing to dir: %s" % cwd)
+        return returncode == 0
+    def parse_hpss_hash(self, stdout, stderr) -> "str | None":
+        """
+        Parses the result of an HPSS hash command
+        Scans stdout followed by stderr for the first line that contains " md5" and has at least
+        three whitespace-separated fields.
+        Return: the first field of that line, or None if there is no such line
+        """
+        output = stdout + stderr
+        for line in output.splitlines():
+            if " md5" not in line:
+                continue
+            matches = re.search(r'(\S+)\s+(\S+)\s+(\S+).*', line.strip())
+            if matches:
+                return matches.group(1)
+        return None
+    def run_hsi(self, subcmd, arg_list):
+        """
+        Runs hsi with the supplied subcmd and arguments
+        `subcmd`: hsi sub-command, e.g. "put", "get", "hashlist"
+        `arg_list`: list of arguments; a single string is treated as one argument
+        Return: tuple (stdout, stderr, returncode); ("", "", -1) if the hsi executable is not found
+        """
+        if isinstance(arg_list, str):
+            # Extending a list with a bare string would split it into single characters.
+            arg_list = [arg_list]
+        command = ["hsi", subcmd, *arg_list]
+        try:
+            completed = subprocess.run(command, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding='latin-1')
+        except FileNotFoundError as e:
+            print("Error running hsi: %s" % e)
+            return "", "", -1
+        return completed.stdout, completed.stderr, completed.returncode
+    def ingest_artifacts(self, collection, isVerbose=False):
+        """
+        Uploads every local file registered in `hpss_info` to HPSS.
+        `collection` and `isVerbose` are accepted for interface compatibility and are not used.
+        """
+        for hpss_file, info in self.hpss_info.items():
+            self.put(info['local_path'], hpss_file)
+    # The methods below are placeholders that satisfy the abstract Backend interface.
+    def query_artifacts(self, query, **kwargs):
+        pass
+    def get_table(self, table_name, **kwargs):
+        pass
+    def get_table_names(self, query):
+        pass
+    def notebook(self, **kwargs):
+        pass
+    def process_artifacts(self, **kwargs):
+        pass
+    def get_schema(self):
+        pass
+    def find(self, query_object, **kwargs):
+        pass
+    def find_table(self, query_object, **kwargs):
+        pass
+    def find_column(self, query_object, **kwargs):
+        pass
+    def find_cell(self, query_object, **kwargs):
+        pass
+    def find_relation(self, column_name, relation, **kwargs):
+        pass
+    def list(self, **kwargs):
+        pass
+    def num_tables(self):
+        pass
+    def display(self, table_name, **kwargs):
+        pass
+    def summary(self, table_name, **kwargs):
+        pass
+    def close(self):
+        pass

dsi-workflow 1.2__tar.gz → 1.2.2__tar.gz

dsi-workflow 1.2__tar.gz → 1.2.2__tar.gz