dsi-workflow 1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dsi/_version.py ADDED
@@ -0,0 +1 @@
1
# Single source of truth for the package version string.
__version__ = "1.0"
@@ -0,0 +1,55 @@
1
+ from abc import ABCMeta, abstractmethod
2
+
3
+
4
class Backend(metaclass=ABCMeta):
    """Abstract interface every DSI backend must implement."""

    @abstractmethod
    def __init__(self, filename) -> None:
        """Bind the backend to *filename* (meaning is backend-specific)."""
        pass

    @property
    @abstractmethod
    def git_commit_sha(self):
        """Git commit SHA identifying the revision this backend tracks."""
        pass

    @abstractmethod
    def put_artifacts(self, artifacts, kwargs) -> None:
        """Store *artifacts* in the backend."""
        # NOTE(review): `kwargs` is a plain positional parameter, not
        # `**kwargs` — confirm callers really pass a dict positionally
        # before changing this signature.
        pass

    @abstractmethod
    def get_artifacts(self, query):
        """Retrieve artifacts matching *query*."""
        pass

    @abstractmethod
    def inspect_artifacts(self):
        """Produce a human-readable view of the stored artifacts."""
        pass
25
+
26
+
27
class Filesystem(Backend):
    """Base class for file-backed DSI backends (gufi, sqlite, parquet)."""

    # Class attribute satisfies the abstract `git_commit_sha` property by
    # shadowing it; pins the code revision this release corresponds to.
    git_commit_sha = '5d79e08d4a6c1570ceb47cdd61d2259505c05de9'
    # Declare named types — SQL type-name strings used when creating columns.
    DOUBLE = "DOUBLE"
    STRING = "VARCHAR"
    FLOAT = "FLOAT"
    INT = "INT"

    # Declare store types — identifiers for the concrete backend flavors.
    GUFI_STORE = "gufi"
    SQLITE_STORE = "sqlite"
    PARQUET_STORE = "parquet"

    # Declare comparison types — operator tokens for building query filters.
    GT = ">"
    LT = "<"
    EQ = "="

    def __init__(self, filename) -> None:
        # Base class keeps no state; subclasses store filename themselves.
        pass

    def put_artifacts(self, artifacts, kwargs) -> None:
        # No-op default; concrete stores override.
        pass

    def get_artifacts(self, query):
        # No-op default; concrete stores override.
        pass

    def inspect_artifacts(self):
        # No-op default; concrete stores override.
        pass
dsi/backends/gufi.py ADDED
@@ -0,0 +1,92 @@
1
+ import subprocess
2
+
3
+
4
+ # Holds table name and data properties
5
+ from dsi.backends.filesystem import Filesystem
6
+
7
+
8
class DataType:
    """Holds a table name and its data properties and units."""

    # Class-level defaults kept so existing attribute access on the class
    # itself (DataType.name, DataType.properties, ...) keeps working.
    name = "DEFAULT"
    properties = {}
    units = {}

    def __init__(self):
        # Fix: give every instance its own dicts. With only class-level
        # `properties = {}` / `units = {}`, all instances shared one dict,
        # so mutating one DataType silently leaked into every other.
        self.properties = {}
        self.units = {}
12
+
13
+
14
class Gufi(Filesystem):
    '''
    GUFI Datastore.

    prefix: prefix to GUFI commands
    index: directory with GUFI indexes
    dbfile: sqlite db file from DSI
    table: table name from the DSI db we want to join on
    column: column name from the DSI db to join on
    '''
    # Fix: the original had a second bare triple-quoted string at class
    # level duplicating this parameter list; it was a no-op expression
    # statement, so it is merged into the class docstring above.

    # Class-level defaults; each is overwritten per-instance in __init__.
    prefix = ""
    index = ""
    dbfile = ""
    table = ""
    column = ""
    isVerbose = False

    def __init__(self, prefix, index, dbfile, table, column, verbose=False):
        '''
        prefix: prefix to GUFI commands
        index: directory with GUFI indexes
        dbfile: sqlite db file from DSI
        table: table name from the DSI db we want to join on
        column: column name from the DSI db to join on
        verbose: print debugging statements or not
        '''
        super().__init__(dbfile)
        # prefix is the prefix to the GUFI installation
        self.prefix = prefix
        # index is the directory where the GUFI indexes are stored
        self.index = index
        # dbfile is the dsi database file that we wish to use
        self.dbfile = dbfile
        # table is the dsi database table name that we wish to use
        self.table = table
        # column is the dsi column name for a file's inode
        self.column = column

        self.isVerbose = verbose

    # Query GUFI and DSI db
    def get_artifacts(self, query):
        '''
        Retrieves GUFI's metadata joined with a dsi database
        query: an sql query into the dsi_entries table
        '''
        resout = self._run_gufi_query(query)
        if self.isVerbose:
            print(resout)

        return resout

    def put_artifacts(self, query):
        # Intentionally a no-op: GUFI indexes are read-only from DSI's side.
        pass

    # Runs the gufi query command
    def _run_gufi_query(self, sqlstring):
        '''
        Calls the gufi_query command to run the sql query
        sqlstring: the query into the dsi_entries table
        '''
        # Fix: removed the dead `_ = self.dbfile + " " + ...` assignment —
        # the space-joined -Q string was built and then discarded.
        # NOTE(review): -Q receives the db file, table, column and the
        # alias "inode" as separate argv entries below; confirm against
        # gufi_query's expected -Q argument form.

        # Run the GUFI query command: -d "|" sets the output delimiter,
        # -E runs `sqlstring` against entries under self.index.
        result = subprocess.run([self.prefix + "/gufi_query", "-d", "|", "-Q", self.dbfile,
                                 self.table, self.column, "inode", "-E", sqlstring, self.index],
                                capture_output=True, text=True)
        return result.stdout

    def close(self):
        # No persistent resources (subprocesses exit on their own).
        pass
@@ -0,0 +1,101 @@
1
+ import pyarrow as pa
2
+ from pyarrow import parquet as pq
3
+ import subprocess
4
+
5
+ from dsi.backends.filesystem import Filesystem
6
+
7
+
8
class Parquet(Filesystem):
    """
    Support for a Parquet back-end.

    Parquet is a convenient format when metadata are larger than SQLite supports.
    """

    def __init__(self, filename, **kwargs):
        """
        filename: path of the Parquet file backing this store.
        compression (optional kwarg): codec forwarded to pyarrow's
        write_table; None lets pyarrow choose its default.
        """
        super().__init__(filename=filename)
        self.filename = filename
        # Fix: dict.get replaces the try/except KeyError dance with
        # identical semantics (missing key -> None).
        self.compression = kwargs.get('compression')

    def get_artifacts(self):
        """Get Parquet data from filename."""
        table = pq.read_table(self.filename)
        resout = table.to_pydict()
        return resout

    def put_artifacts(self, collection):
        """Put artifacts into file at filename path."""
        table = pa.table(collection)
        pq.write_table(table, self.filename, compression=self.compression)

    @staticmethod
    def get_cmd_output(cmd: list) -> str:
        """
        Runs a given command and returns the stdout if successful.

        If stderr is not empty, an exception is raised with the stderr text.

        NOTE(review): shell=True hands the first list element to the shell;
        never pass untrusted input through this helper.
        """
        proc = subprocess.run(cmd, capture_output=True, shell=True)
        if proc.stderr != b"":
            raise Exception(proc.stderr)
        return proc.stdout.strip().decode("utf-8")

    def inspect_artifacts(self, collection, interactive=False):
        """Populate a Jupyter notebook with tools required to look at Parquet data."""
        # Fix: in the original this string sat AFTER the imports, making it
        # a no-op expression rather than the method docstring.
        # Imported lazily so the backend works without Jupyter installed
        # unless inspection is actually requested.
        import nbconvert as nbc
        import nbformat as nbf

        nb = nbf.v4.new_notebook()
        text = """\
        # This notebook was auto-generated by a DSI Backend for Parquet.
        # Execute the Jupyter notebook cells below and interact with "df"
        # to explore your data.
        """
        code1 = """\
        import pandas as pd
        df = pd.read_parquet('{}')
        df.head()
        """.format(self.filename)

        code2 = """\
        df.info()
        """

        code3 = """\
        df.describe()
        """

        nb['cells'] = [nbf.v4.new_markdown_cell(text),
                       nbf.v4.new_code_cell(code1),
                       nbf.v4.new_code_cell(code2),
                       nbf.v4.new_code_cell(code3)]

        fname = 'dsi_parquet_backend_output.ipynb'

        print('Writing Jupyter notebook...')
        with open(fname, 'w') as fh:
            nbf.write(nb, fh)

        # open the jupyter notebook for static page generation
        with open(fname, 'r', encoding='utf-8') as fh:
            nb_content = nbf.read(fh, as_version=4)
        # Init executor for notebook
        run_nb = nbc.preprocessors.ExecutePreprocessor(timeout=-1)  # No timeout
        # Execute the notebook
        run_nb.preprocess(nb_content, {'metadata': {'path': '.'}})

        if interactive:
            print('Opening Jupyter notebook...')
            self.get_cmd_output(cmd=['jupyter-lab ./dsi_parquet_backend_output.ipynb'])
        else:
            # Init HTML exporter and render the executed notebook statically.
            html_exporter = nbc.HTMLExporter()
            html_content, _ = html_exporter.from_notebook_node(nb_content)
            # Save HTML file
            html_filename = 'dsi_parquet_backend_output.html'
            with open(html_filename, 'w', encoding='utf-8') as fh:
                fh.write(html_content)
@@ -0,0 +1,43 @@
1
+ from typing import List
2
+ from typing import Optional
3
+ from sqlalchemy import ForeignKey
4
+ from sqlalchemy import String
5
+ from sqlalchemy.orm import DeclarativeBase
6
+ from sqlalchemy.orm import Mapped
7
+ from sqlalchemy.orm import mapped_column
8
+ from sqlalchemy.orm import relationship
9
+ from sqlalchemy import create_engine
10
+ from sqlalchemy.orm import Session
11
+ import csv
12
+ import json
13
+ import re
14
+ import yaml
15
+ import toml
16
+
17
+ from dsi.backends.filesystem import Filesystem
18
+
19
class SqlAlchemy(Filesystem):
    """DSI backend that stores and queries artifacts via SQLAlchemy ORM."""

    # Class-level defaults; overwritten per-instance in __init__.
    filename = "sqlite:///fs.db"
    engine = None

    def __init__(self, filename, base, echo=True):
        """
        filename: SQLAlchemy connection URL (e.g. "sqlite:///fs.db").
        base: DeclarativeBase subclass whose mapped tables are created here.
        echo: forward generated SQL to stdout. Generalized from the
              previously hard-coded True; the default preserves the old
              behavior, pass False to silence logging.
        """
        self.filename = filename
        self.engine = create_engine(filename, echo=echo)
        base.metadata.create_all(self.engine)

    def put_artifacts(self, artifact_list):
        """Persist every ORM object in artifact_list in one transaction."""
        with Session(self.engine) as session:
            session.add_all(artifact_list)
            session.commit()

    def query(self, stmt):
        """Execute stmt and return the matched ORM objects as a list."""
        with Session(self.engine) as session:
            # Materialize inside the session scope (same as the original
            # append loop, minus the manual accumulator).
            return list(session.scalars(stmt))

    def close(self):
        """Dispose of the engine's connection pool, if one was created."""
        if self.engine:
            self.engine.dispose()