dsi-workflow 1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsi/_version.py +1 -0
- dsi/backends/filesystem.py +55 -0
- dsi/backends/gufi.py +92 -0
- dsi/backends/parquet.py +101 -0
- dsi/backends/sqlalchemy.py +43 -0
- dsi/backends/sqlite.py +749 -0
- dsi/core.py +450 -0
- dsi/plugins/collection_reader.py +65 -0
- dsi/plugins/env.py +224 -0
- dsi/plugins/file_reader.py +518 -0
- dsi/plugins/file_writer.py +348 -0
- dsi/plugins/metadata.py +106 -0
- dsi/plugins/plugin.py +27 -0
- dsi/plugins/plugin_models.py +38 -0
- dsi/tests/test_core.py +24 -0
- dsi_workflow-1.0.dist-info/METADATA +98 -0
- dsi_workflow-1.0.dist-info/RECORD +19 -0
- dsi_workflow-1.0.dist-info/WHEEL +5 -0
- dsi_workflow-1.0.dist-info/top_level.txt +1 -0
dsi/_version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string; matches the published wheel (dsi_workflow-1.0).
__version__ = "1.0"
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from abc import ABCMeta, abstractmethod
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Backend(metaclass=ABCMeta):
    """Abstract interface that every DSI back-end must implement.

    Concrete stores (filesystem, GUFI, Parquet, SQLite, ...) subclass this
    and provide real implementations for each abstract member.
    """

    @abstractmethod
    def __init__(self, filename) -> None:
        """Bind the back-end to ``filename`` (file path or connection string)."""
        ...

    @property
    @abstractmethod
    def git_commit_sha(self):
        """Source-revision identifier of the concrete back-end implementation."""
        ...

    @abstractmethod
    def put_artifacts(self, artifacts, kwargs) -> None:
        """Store ``artifacts`` in the back-end."""
        ...

    @abstractmethod
    def get_artifacts(self, query):
        """Return artifacts selected by ``query``."""
        ...

    @abstractmethod
    def inspect_artifacts(self):
        """Produce an inspection view of the stored artifacts."""
        ...
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Filesystem(Backend):
    """Base filesystem back-end: shared constants plus no-op stub methods.

    Subclasses (Gufi, Parquet, SqlAlchemy, ...) override the stubs with real
    storage behavior; this base mainly hosts the shared type/store/comparison
    constants used across back-ends.
    """

    git_commit_sha = '5d79e08d4a6c1570ceb47cdd61d2259505c05de9'

    # Named column types.
    DOUBLE = "DOUBLE"
    STRING = "VARCHAR"
    FLOAT = "FLOAT"
    INT = "INT"

    # Known store kinds.
    GUFI_STORE = "gufi"
    SQLITE_STORE = "sqlite"
    PARQUET_STORE = "parquet"

    # Comparison operators for query building.
    GT = ">"
    LT = "<"
    EQ = "="

    def __init__(self, filename) -> None:
        """No-op constructor; subclasses hold their own state."""
        ...

    def put_artifacts(self, artifacts, kwargs) -> None:
        """No-op; subclasses implement real storage."""
        ...

    def get_artifacts(self, query):
        """No-op; subclasses implement real retrieval."""
        ...

    def inspect_artifacts(self):
        """No-op; subclasses implement real inspection."""
        ...
|
dsi/backends/gufi.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# Holds table name and data properties
|
|
5
|
+
from dsi.backends.filesystem import Filesystem
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DataType:
    """Holds a table name and its data properties/units.

    name: table name
    properties: mapping of property name -> value/type
    units: mapping of property name -> unit string
    """

    # Class-level defaults kept for backward compatibility with any code
    # reading these attributes off the class itself.
    name = "DEFAULT"
    properties = {}
    units = {}

    def __init__(self):
        # Give each instance its own dicts: the original class-level dicts
        # were shared mutable state, so mutating one DataType's properties
        # silently mutated every other instance's as well.
        self.name = "DEFAULT"
        self.properties = {}
        self.units = {}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Gufi(Filesystem):
    '''
    GUFI Datastore back-end.

    Joins GUFI filesystem-index metadata with a DSI sqlite database.

    prefix: prefix to GUFI commands
    index: directory with GUFI indexes
    dbfile: sqlite db file from DSI
    table: table name from the DSI db we want to join on
    column: column name from the DSI db to join on
    '''

    prefix = ""
    index = ""
    dbfile = ""
    table = ""
    column = ""
    isVerbose = False

    def __init__(self, prefix, index, dbfile, table, column, verbose=False):
        '''
        prefix: prefix to GUFI commands
        index: directory with GUFI indexes
        dbfile: sqlite db file from DSI
        table: table name from the DSI db we want to join on
        column: column name from the DSI db to join on
        verbose: print debugging statements or not
        '''
        super().__init__(dbfile)
        # prefix is the prefix to the GUFI installation
        self.prefix = prefix
        # index is the directory where the GUFI indexes are stored
        self.index = index
        # dbfile is the dsi database file that we wish to use
        self.dbfile = dbfile
        # table is the dsi database table name that we wish to use
        self.table = table
        # column is the dsi column name for a file's inode
        self.column = column
        self.isVerbose = verbose

    def get_artifacts(self, query):
        '''
        Retrieves GUFI's metadata joined with a dsi database.

        query: an sql query into the dsi_entries table
        Returns the raw stdout text produced by gufi_query.
        '''
        resout = self._run_gufi_query(query)
        if self.isVerbose:
            print(resout)
        return resout

    def put_artifacts(self, query):
        # Not implemented: this back-end only reads GUFI indexes.
        pass

    def _run_gufi_query(self, sqlstring):
        '''
        Calls the gufi_query command to run the sql query.

        sqlstring: the query into the dsi_entries table
        Returns gufi_query's stdout as text.
        '''
        # -d sets the output delimiter; -Q names the DSI sqlite db file,
        # table, and inode column to attach; -E carries the user query.
        # NOTE(review): the argument order (dbfile, table, column, "inode")
        # is taken from the original call — confirm against gufi_query docs.
        # (The original also built this as an unused throwaway string, which
        # has been removed.)
        result = subprocess.run(
            [self.prefix + "/gufi_query", "-d", "|", "-Q", self.dbfile,
             self.table, self.column, "inode", "-E", sqlstring, self.index],
            capture_output=True, text=True)
        return result.stdout

    def close(self):
        # Nothing to release: every query runs in its own subprocess.
        pass
|
dsi/backends/parquet.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import pyarrow as pa
|
|
2
|
+
from pyarrow import parquet as pq
|
|
3
|
+
import subprocess
|
|
4
|
+
|
|
5
|
+
from dsi.backends.filesystem import Filesystem
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Parquet(Filesystem):
    """
    Support for a Parquet back-end.

    Parquet is a convenient format when metadata are larger than SQLite supports.
    """

    def __init__(self, filename, **kwargs):
        """
        filename: path of the Parquet file to read/write
        kwargs: optional 'compression' codec name passed to pyarrow on write
        """
        super().__init__(filename=filename)
        self.filename = filename
        # dict.get replaces the original try/except KeyError; same result:
        # None when no compression was requested.
        self.compression = kwargs.get('compression')

    def get_artifacts(self):
        """Get Parquet data from filename as a plain Python dict of columns."""
        table = pq.read_table(self.filename)
        return table.to_pydict()

    def put_artifacts(self, collection):
        """Put artifacts (a column mapping) into the file at filename path."""
        table = pa.table(collection)
        pq.write_table(table, self.filename, compression=self.compression)

    @staticmethod
    def get_cmd_output(cmd: list) -> str:
        """
        Runs a given command and returns the stdout if successful.

        If stderr is not empty, an exception is raised with the stderr text.
        """
        # NOTE(review): shell=True with a list only works because callers pass
        # a single-element list whose first item is the full shell string.
        # Flagging rather than changing: do not feed untrusted input here.
        proc = subprocess.run(cmd, capture_output=True, shell=True)
        if proc.stderr != b"":
            raise Exception(proc.stderr)
        return proc.stdout.strip().decode("utf-8")

    def inspect_artifacts(self, collection, interactive=False):
        """Populate a Jupyter notebook with tools required to look at Parquet data.

        The notebook is written, executed, and either opened in jupyter-lab
        (interactive=True) or exported to a static HTML page.  (The original
        placed this description as a no-op string after the imports; it is
        now a real docstring.)
        """
        # Imported lazily so the back-end works without notebook tooling
        # unless inspection is requested.
        import nbconvert as nbc
        import nbformat as nbf

        nb = nbf.v4.new_notebook()
        text = """\
# This notebook was auto-generated by a DSI Backend for Parquet.
# Execute the Jupyter notebook cells below and interact with "df"
# to explore your data.
"""
        code1 = """\
import pandas as pd
df = pd.read_parquet('{}')
df.head()
""".format(self.filename)

        code2 = """\
df.info()
"""

        code3 = """\
df.describe()
"""

        nb['cells'] = [nbf.v4.new_markdown_cell(text),
                       nbf.v4.new_code_cell(code1),
                       nbf.v4.new_code_cell(code2),
                       nbf.v4.new_code_cell(code3)]

        fname = 'dsi_parquet_backend_output.ipynb'

        print('Writing Jupyter notebook...')
        with open(fname, 'w') as fh:
            nbf.write(nb, fh)

        # Re-open the notebook for execution / static page generation.
        with open(fname, 'r', encoding='utf-8') as fh:
            nb_content = nbf.read(fh, as_version=4)
        # No timeout: executing the data-loading cells may be slow.
        run_nb = nbc.preprocessors.ExecutePreprocessor(timeout=-1)
        run_nb.preprocess(nb_content, {'metadata': {'path': '.'}})

        if interactive:
            print('Opening Jupyter notebook...')
            self.get_cmd_output(cmd=['jupyter-lab ./dsi_parquet_backend_output.ipynb'])
        else:
            # Export the executed notebook as a static HTML page.
            html_exporter = nbc.HTMLExporter()
            html_content, _ = html_exporter.from_notebook_node(nb_content)
            html_filename = 'dsi_parquet_backend_output.html'
            with open(html_filename, 'w', encoding='utf-8') as fh:
                fh.write(html_content)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from sqlalchemy import ForeignKey
|
|
4
|
+
from sqlalchemy import String
|
|
5
|
+
from sqlalchemy.orm import DeclarativeBase
|
|
6
|
+
from sqlalchemy.orm import Mapped
|
|
7
|
+
from sqlalchemy.orm import mapped_column
|
|
8
|
+
from sqlalchemy.orm import relationship
|
|
9
|
+
from sqlalchemy import create_engine
|
|
10
|
+
from sqlalchemy.orm import Session
|
|
11
|
+
import csv
|
|
12
|
+
import json
|
|
13
|
+
import re
|
|
14
|
+
import yaml
|
|
15
|
+
import toml
|
|
16
|
+
|
|
17
|
+
from dsi.backends.filesystem import Filesystem
|
|
18
|
+
|
|
19
|
+
class SqlAlchemy(Filesystem):
    """DSI back-end that stores artifacts through SQLAlchemy ORM sessions."""

    # Default connection URL; replaced per-instance in __init__.
    filename = "sqlite:///fs.db"
    engine = None

    def __init__(self, filename, base):
        """
        filename: SQLAlchemy connection URL (e.g. "sqlite:///fs.db")
        base: DeclarativeBase subclass whose mapped tables are created
              on the engine at construction time
        """
        # Call the parent constructor for consistency with the other
        # back-ends (it is currently a no-op, but the original skipped it).
        super().__init__(filename)
        self.filename = filename
        # echo=True preserves the original behavior of logging all SQL.
        self.engine = create_engine(filename, echo=True)
        base.metadata.create_all(self.engine)

    def put_artifacts(self, artifact_list):
        """Persist a list of mapped ORM objects in a single transaction."""
        with Session(self.engine) as session:
            session.add_all(artifact_list)
            session.commit()

    def query(self, stmt):
        """Execute a select statement and return all matching ORM objects."""
        # list(...) replaces the original manual append loop; same result.
        with Session(self.engine) as session:
            return list(session.scalars(stmt))

    def close(self):
        """Dispose of the engine's connection pool, if one was created."""
        if self.engine:
            self.engine.dispose()
|