dsi-workflow 1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsi_workflow-1.0/PKG-INFO +98 -0
- dsi_workflow-1.0/README.rst +82 -0
- dsi_workflow-1.0/dsi/_version.py +1 -0
- dsi_workflow-1.0/dsi/backends/filesystem.py +55 -0
- dsi_workflow-1.0/dsi/backends/gufi.py +92 -0
- dsi_workflow-1.0/dsi/backends/parquet.py +101 -0
- dsi_workflow-1.0/dsi/backends/sqlalchemy.py +43 -0
- dsi_workflow-1.0/dsi/backends/sqlite.py +749 -0
- dsi_workflow-1.0/dsi/core.py +450 -0
- dsi_workflow-1.0/dsi/plugins/collection_reader.py +65 -0
- dsi_workflow-1.0/dsi/plugins/env.py +224 -0
- dsi_workflow-1.0/dsi/plugins/file_reader.py +518 -0
- dsi_workflow-1.0/dsi/plugins/file_writer.py +348 -0
- dsi_workflow-1.0/dsi/plugins/metadata.py +106 -0
- dsi_workflow-1.0/dsi/plugins/plugin.py +27 -0
- dsi_workflow-1.0/dsi/plugins/plugin_models.py +38 -0
- dsi_workflow-1.0/dsi/tests/test_core.py +24 -0
- dsi_workflow-1.0/dsi_workflow.egg-info/PKG-INFO +98 -0
- dsi_workflow-1.0/dsi_workflow.egg-info/SOURCES.txt +24 -0
- dsi_workflow-1.0/dsi_workflow.egg-info/dependency_links.txt +1 -0
- dsi_workflow-1.0/dsi_workflow.egg-info/requires.txt +7 -0
- dsi_workflow-1.0/dsi_workflow.egg-info/top_level.txt +1 -0
- dsi_workflow-1.0/pyproject.toml +28 -0
- dsi_workflow-1.0/requirements.txt +7 -0
- dsi_workflow-1.0/setup.cfg +4 -0
- dsi_workflow-1.0/setup.py +20 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: dsi-workflow
|
|
3
|
+
Version: 1.0
|
|
4
|
+
Summary: A Data Science Infrastructure Metadatabase
|
|
5
|
+
Author-email: Jesus Pulido <pulido@lanl.gov>, James Ahrens <ahrens@lanl.gov>, Divya Banesh <dbanesh@lanl.gov>, Hugh Greenberg <hng@lanl.gov>, Ben Sims <bsims@lanl.gov>, Christine Sweeney <cahrens@lanl.gov>, Terry Turton <tlturton@lanl.gov>, Quincy Wofford <qwofford@lanl.gov>
|
|
6
|
+
License: BSD-3-Clause
|
|
7
|
+
Classifier: Programming Language :: Python
|
|
8
|
+
Description-Content-Type: text/x-rst
|
|
9
|
+
Requires-Dist: pandas>=2.0.2
|
|
10
|
+
Requires-Dist: pyarrow>=12.0.1
|
|
11
|
+
Requires-Dist: pydantic>=2.1.1
|
|
12
|
+
Requires-Dist: nbconvert>=7.13.0
|
|
13
|
+
Requires-Dist: gitpython>=3.0.0
|
|
14
|
+
Requires-Dist: matplotlib>=3.6.0
|
|
15
|
+
Requires-Dist: pyyaml>=6.0
|
|
16
|
+
|
|
17
|
+
=============
|
|
18
|
+
DSI
|
|
19
|
+
=============
|
|
20
|
+
|
|
21
|
+
The goal of the Data Science Infrastructure Project (DSI) is to provide a flexible, AI-ready metadata query capability which returns data subject to strict, POSIX-enforced file security. The data lifecycle for AI/ML requires seamless transitions from data-intensive/AI/ML research activity to long-term archiving and shared data repositories. DSI enables flexible, data-intensive scientific workflows that meet researcher needs.
|
|
22
|
+
|
|
23
|
+
DSI is implemented in three parts:
|
|
24
|
+
|
|
25
|
+
* Plugins (Readers and Writers)
|
|
26
|
+
* Backends
|
|
27
|
+
* Core middleware
|
|
28
|
+
|
|
29
|
+
Plugins curate metadata for query and data return. Plugins can have read or write functionality acting as Readers and Writers for DSI. Plugins acting as readers harvest data from files and streams. Plugins acting as writers execute containerized or baremetal applications to supplement queryable metadata and data. Plugins may be user contributed and a default set of plugins is available with usage examples in our `Core documentation <https://lanl.github.io/dsi/core.html>`_.
|
|
30
|
+
|
|
31
|
+
Backends are interfaces for the Core middleware. Backends consist mostly of back-end/storage functionalities and are the interface between the Core Middleware and a data store. Backends may also have some front-end functionality interfacing between a DSI user and the Core middleware. Backends may be user contributed and a default set of backends are available with usage examples in our `Core documentation <https://lanl.github.io/dsi/core.html>`_.
|
|
32
|
+
|
|
33
|
+
DSI Core middleware provides the user/machine interface. The Core middleware defines a Terminal object. An instantiated Core Terminal can load zero or more plugins and backends. A Terminal object can be used in scripting workflows and program loops.
|
|
34
|
+
|
|
35
|
+
=====================
|
|
36
|
+
DSI Core Requirements
|
|
37
|
+
=====================
|
|
38
|
+
* python3 (3.11 tested)
|
|
39
|
+
* Linux OS (RHEL- and Debian-based distributions tested)
|
|
40
|
+
* Git
|
|
41
|
+
* Plugins and Backends introduce further requirements
|
|
42
|
+
|
|
43
|
+
===============
|
|
44
|
+
Getting Started
|
|
45
|
+
===============
|
|
46
|
+
|
|
47
|
+
DSI does not yet have a versioned release and should be considered pre-alpha. Project contributors are encouraged to prototype solutions which do not contain sensitive data at this time. Consequently a PyPI release is planned but incomplete. It is possible to install DSI locally instead.
|
|
48
|
+
|
|
49
|
+
We recommend Miniconda3 for managing virtual environments for DSI::
|
|
50
|
+
|
|
51
|
+
. ~/miniconda3/bin/activate
|
|
52
|
+
conda create -n dsi python=3.11
|
|
53
|
+
conda activate dsi
|
|
54
|
+
|
|
55
|
+
Python virtual environments can also be used for DSI::
|
|
56
|
+
|
|
57
|
+
python3 -m venv dsienv
|
|
58
|
+
source dsienv/bin/activate
|
|
59
|
+
pip install --upgrade pip
|
|
60
|
+
|
|
61
|
+
After activating your environment::
|
|
62
|
+
|
|
63
|
+
git clone https://github.com/lanl/dsi.git
|
|
64
|
+
cd dsi/
|
|
65
|
+
python3 -m pip install .
|
|
66
|
+
|
|
67
|
+
=====================
|
|
68
|
+
Copyright and License
|
|
69
|
+
=====================
|
|
70
|
+
|
|
71
|
+
This program is open source under the BSD-3 License.
|
|
72
|
+
|
|
73
|
+
© 2023. Triad National Security, LLC. All rights reserved.
|
|
74
|
+
|
|
75
|
+
Redistribution and use in source and binary forms, with or without modification, are permitted
|
|
76
|
+
provided that the following conditions are met:
|
|
77
|
+
|
|
78
|
+
1.Redistributions of source code must retain the above copyright notice, this list of conditions and
|
|
79
|
+
the following disclaimer.
|
|
80
|
+
|
|
81
|
+
2.Redistributions in binary form must reproduce the above copyright notice, this list of conditions
|
|
82
|
+
and the following disclaimer in the documentation and/or other materials provided with the
|
|
83
|
+
distribution.
|
|
84
|
+
|
|
85
|
+
3.Neither the name of the copyright holder nor the names of its contributors may be used to endorse
|
|
86
|
+
or promote products derived from this software without specific prior written permission.
|
|
87
|
+
|
|
88
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
|
89
|
+
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
90
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
91
|
+
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
|
92
|
+
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
93
|
+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
94
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
|
95
|
+
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
96
|
+
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
|
97
|
+
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
|
98
|
+
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
=============
|
|
2
|
+
DSI
|
|
3
|
+
=============
|
|
4
|
+
|
|
5
|
+
The goal of the Data Science Infrastructure Project (DSI) is to provide a flexible, AI-ready metadata query capability which returns data subject to strict, POSIX-enforced file security. The data lifecycle for AI/ML requires seamless transitions from data-intensive/AI/ML research activity to long-term archiving and shared data repositories. DSI enables flexible, data-intensive scientific workflows that meet researcher needs.
|
|
6
|
+
|
|
7
|
+
DSI is implemented in three parts:
|
|
8
|
+
|
|
9
|
+
* Plugins (Readers and Writers)
|
|
10
|
+
* Backends
|
|
11
|
+
* Core middleware
|
|
12
|
+
|
|
13
|
+
Plugins curate metadata for query and data return. Plugins can have read or write functionality acting as Readers and Writers for DSI. Plugins acting as readers harvest data from files and streams. Plugins acting as writers execute containerized or baremetal applications to supplement queryable metadata and data. Plugins may be user contributed and a default set of plugins is available with usage examples in our `Core documentation <https://lanl.github.io/dsi/core.html>`_.
|
|
14
|
+
|
|
15
|
+
Backends are interfaces for the Core middleware. Backends consist mostly of back-end/storage functionalities and are the interface between the Core Middleware and a data store. Backends may also have some front-end functionality interfacing between a DSI user and the Core middleware. Backends may be user contributed and a default set of backends are available with usage examples in our `Core documentation <https://lanl.github.io/dsi/core.html>`_.
|
|
16
|
+
|
|
17
|
+
DSI Core middleware provides the user/machine interface. The Core middleware defines a Terminal object. An instantiated Core Terminal can load zero or more plugins and backends. A Terminal object can be used in scripting workflows and program loops.
|
|
18
|
+
|
|
19
|
+
=====================
|
|
20
|
+
DSI Core Requirements
|
|
21
|
+
=====================
|
|
22
|
+
* python3 (3.11 tested)
|
|
23
|
+
* Linux OS (RHEL- and Debian-based distributions tested)
|
|
24
|
+
* Git
|
|
25
|
+
* Plugins and Backends introduce further requirements
|
|
26
|
+
|
|
27
|
+
===============
|
|
28
|
+
Getting Started
|
|
29
|
+
===============
|
|
30
|
+
|
|
31
|
+
DSI does not yet have a versioned release and should be considered pre-alpha. Project contributors are encouraged to prototype solutions which do not contain sensitive data at this time. Consequently a PyPI release is planned but incomplete. It is possible to install DSI locally instead.
|
|
32
|
+
|
|
33
|
+
We recommend Miniconda3 for managing virtual environments for DSI::
|
|
34
|
+
|
|
35
|
+
. ~/miniconda3/bin/activate
|
|
36
|
+
conda create -n dsi python=3.11
|
|
37
|
+
conda activate dsi
|
|
38
|
+
|
|
39
|
+
Python virtual environments can also be used for DSI::
|
|
40
|
+
|
|
41
|
+
python3 -m venv dsienv
|
|
42
|
+
source dsienv/bin/activate
|
|
43
|
+
pip install --upgrade pip
|
|
44
|
+
|
|
45
|
+
After activating your environment::
|
|
46
|
+
|
|
47
|
+
git clone https://github.com/lanl/dsi.git
|
|
48
|
+
cd dsi/
|
|
49
|
+
python3 -m pip install .
|
|
50
|
+
|
|
51
|
+
=====================
|
|
52
|
+
Copyright and License
|
|
53
|
+
=====================
|
|
54
|
+
|
|
55
|
+
This program is open source under the BSD-3 License.
|
|
56
|
+
|
|
57
|
+
© 2023. Triad National Security, LLC. All rights reserved.
|
|
58
|
+
|
|
59
|
+
Redistribution and use in source and binary forms, with or without modification, are permitted
|
|
60
|
+
provided that the following conditions are met:
|
|
61
|
+
|
|
62
|
+
1.Redistributions of source code must retain the above copyright notice, this list of conditions and
|
|
63
|
+
the following disclaimer.
|
|
64
|
+
|
|
65
|
+
2.Redistributions in binary form must reproduce the above copyright notice, this list of conditions
|
|
66
|
+
and the following disclaimer in the documentation and/or other materials provided with the
|
|
67
|
+
distribution.
|
|
68
|
+
|
|
69
|
+
3.Neither the name of the copyright holder nor the names of its contributors may be used to endorse
|
|
70
|
+
or promote products derived from this software without specific prior written permission.
|
|
71
|
+
|
|
72
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
|
73
|
+
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
74
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
75
|
+
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
|
76
|
+
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
77
|
+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
78
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
|
79
|
+
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
80
|
+
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
|
81
|
+
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
|
82
|
+
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Single authoritative version string for the dsi package.
__version__ = "1.0"
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from abc import ABCMeta, abstractmethod
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Backend(metaclass=ABCMeta):
    """Abstract contract every DSI backend must satisfy.

    A backend is constructed around a filename and exposes hooks for
    storing, retrieving, and inspecting artifacts.
    """

    @abstractmethod
    def __init__(self, filename) -> None:
        pass

    @property
    @abstractmethod
    def git_commit_sha(self):
        # Revision identifier a concrete backend reports.
        pass

    @abstractmethod
    def put_artifacts(self, artifacts, kwargs) -> None:
        pass

    @abstractmethod
    def get_artifacts(self, query):
        pass

    @abstractmethod
    def inspect_artifacts(self):
        pass


class Filesystem(Backend):
    """Concrete no-op backend holding constants shared by the
    filesystem-backed stores (gufi, sqlite, parquet)."""

    git_commit_sha = '5d79e08d4a6c1570ceb47cdd61d2259505c05de9'

    # Named column types.
    DOUBLE = "DOUBLE"
    STRING = "VARCHAR"
    FLOAT = "FLOAT"
    INT = "INT"

    # Store flavors.
    GUFI_STORE = "gufi"
    SQLITE_STORE = "sqlite"
    PARQUET_STORE = "parquet"

    # Comparison operators.
    GT = ">"
    LT = "<"
    EQ = "="

    def __init__(self, filename) -> None:
        pass

    def put_artifacts(self, artifacts, kwargs) -> None:
        pass

    def get_artifacts(self, query):
        pass

    def inspect_artifacts(self):
        pass
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# Holds table name and data properties
|
|
5
|
+
from dsi.backends.filesystem import Filesystem
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DataType:
    """Record describing a table: its name plus per-column properties
    and units.

    NOTE(review): these are class-level attributes shared by every
    instance that does not shadow them; mutating ``properties`` or
    ``units`` in place affects all such instances -- confirm callers
    intend that.
    """

    name = "DEFAULT"
    properties = {}
    units = {}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Gufi(Filesystem):
    '''
    GUFI Datastore.

    Joins metadata held in GUFI indexes with a DSI sqlite database by
    shelling out to the ``gufi_query`` command.

    prefix: prefix to GUFI commands
    index: directory with GUFI indexes
    dbfile: sqlite db file from DSI
    table: table name from the DSI db we want to join on
    column: column name from the DSI db to join on
    isVerbose: print debugging statements or not
    '''

    prefix = ""
    index = ""
    dbfile = ""
    table = ""
    column = ""
    isVerbose = False

    def __init__(self, prefix, index, dbfile, table, column, verbose=False):
        '''
        prefix: prefix to GUFI commands
        index: directory with GUFI indexes
        dbfile: sqlite db file from DSI
        table: table name from the DSI db we want to join on
        column: column name from the DSI db to join on
        verbose: print debugging statements or not
        '''
        super().__init__(dbfile)
        # prefix is the prefix to the GUFI installation
        self.prefix = prefix
        # index is the directory where the GUFI indexes are stored
        self.index = index
        # dbfile is the dsi database file that we wish to use
        self.dbfile = dbfile
        # table is the dsi database table name that we wish to use
        self.table = table
        # column is the dsi column name for a file's inode
        self.column = column
        self.isVerbose = verbose

    # Query GUFI and DSI db
    def get_artifacts(self, query):
        '''
        Retrieves GUFI's metadata joined with a dsi database.

        query: an sql query into the dsi_entries table
        returns: the raw stdout text produced by gufi_query
        '''
        resout = self._run_gufi_query(query)
        if self.isVerbose:
            print(resout)
        return resout

    def put_artifacts(self, query):
        '''Writing artifacts into GUFI is not supported; no-op.'''
        pass

    # Runs the gufi query command
    def _run_gufi_query(self, sqlstring):
        '''
        Calls the gufi_query command to run the sql query.

        sqlstring: the query into the dsi_entries table
        returns: captured stdout of the gufi_query process
        '''
        # -d sets the output delimiter; the -Q arguments specify the dsi
        # db file, the dsi db table name, the dsi db inode column name,
        # and "inode"; -E carries the user query.  (A previous revision
        # built the -Q arguments as one discarded string -- the components
        # are passed individually here, as before.)
        result = subprocess.run([self.prefix + "/gufi_query", "-d", "|", "-Q", self.dbfile,
                                 self.table, self.column, "inode", "-E", sqlstring, self.index],
                                capture_output=True, text=True)
        return result.stdout

    def close(self):
        '''Nothing to release: gufi_query runs as a one-shot subprocess.'''
        pass
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import pyarrow as pa
|
|
2
|
+
from pyarrow import parquet as pq
|
|
3
|
+
import subprocess
|
|
4
|
+
|
|
5
|
+
from dsi.backends.filesystem import Filesystem
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Parquet(Filesystem):
    """
    Support for a Parquet back-end.

    Parquet is a convenient format when metadata are larger than SQLite supports.
    """

    def __init__(self, filename, **kwargs):
        """
        filename: path of the Parquet file this backend reads/writes.
        kwargs: may contain 'compression', a codec name forwarded to
                pyarrow; absent means uncompressed (None).
        """
        super().__init__(filename=filename)
        self.filename = filename
        # dict.get replaces the original try/except KeyError; same result.
        self.compression = kwargs.get('compression')

    def get_artifacts(self):
        """Get Parquet data from filename as a column-name -> values dict."""
        table = pq.read_table(self.filename)
        resout = table.to_pydict()
        return resout

    def put_artifacts(self, collection):
        """Put artifacts (a column-name -> values mapping) into file at filename path."""
        table = pa.table(collection)
        pq.write_table(table, self.filename, compression=self.compression)

    @staticmethod
    def get_cmd_output(cmd: list) -> str:
        """
        Runs a given command and returns the stdout if successful.

        If stderr is not empty, an exception is raised with the stderr text.

        NOTE(review): shell=True with a list executes only the first list
        element as a shell command line; callers here pass a single
        pre-joined string, so it works, but that string is interpreted by
        the shell -- never feed it untrusted input.
        """
        proc = subprocess.run(cmd, capture_output=True, shell=True)
        if proc.stderr != b"":
            raise Exception(proc.stderr)
        return proc.stdout.strip().decode("utf-8")

    def inspect_artifacts(self, collection, interactive=False):
        """Populate a Jupyter notebook with tools required to look at Parquet data.

        collection: unused by this method; kept for interface compatibility.
        interactive: if True, open the executed notebook in jupyter-lab;
                     otherwise render it to a static HTML page.
        """
        # Imported lazily so the backend is usable without notebook tooling
        # until this method is actually called.
        import nbconvert as nbc
        import nbformat as nbf

        nb = nbf.v4.new_notebook()
        text = """\
# This notebook was auto-generated by a DSI Backend for Parquet.
# Execute the Jupyter notebook cells below and interact with "df"
# to explore your data.
"""
        code1 = """\
import pandas as pd
df = pd.read_parquet('{}')
df.head()
""".format(self.filename)

        code2 = """\
df.info()
"""

        code3 = """\
df.describe()
"""

        nb['cells'] = [nbf.v4.new_markdown_cell(text),
                       nbf.v4.new_code_cell(code1),
                       nbf.v4.new_code_cell(code2),
                       nbf.v4.new_code_cell(code3)]

        fname = 'dsi_parquet_backend_output.ipynb'

        print('Writing Jupyter notebook...')
        with open(fname, 'w') as fh:
            nbf.write(nb, fh)

        # open the jupyter notebook for static page generation
        with open(fname, 'r', encoding='utf-8') as fh:
            nb_content = nbf.read(fh, as_version=4)
        # Init executor for notebook
        run_nb = nbc.preprocessors.ExecutePreprocessor(timeout=-1)  # No timeout
        # Execute the notebook so the exported page contains outputs
        run_nb.preprocess(nb_content, {'metadata': {'path': '.'}})

        if interactive:
            print('Opening Jupyter notebook...')
            self.get_cmd_output(cmd=['jupyter-lab ./dsi_parquet_backend_output.ipynb'])
        else:
            # Init HTML exporter
            html_exporter = nbc.HTMLExporter()
            html_content, _ = html_exporter.from_notebook_node(nb_content)
            # Save HTML file
            html_filename = 'dsi_parquet_backend_output.html'
            with open(html_filename, 'w', encoding='utf-8') as fh:
                fh.write(html_content)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from sqlalchemy import ForeignKey
|
|
4
|
+
from sqlalchemy import String
|
|
5
|
+
from sqlalchemy.orm import DeclarativeBase
|
|
6
|
+
from sqlalchemy.orm import Mapped
|
|
7
|
+
from sqlalchemy.orm import mapped_column
|
|
8
|
+
from sqlalchemy.orm import relationship
|
|
9
|
+
from sqlalchemy import create_engine
|
|
10
|
+
from sqlalchemy.orm import Session
|
|
11
|
+
import csv
|
|
12
|
+
import json
|
|
13
|
+
import re
|
|
14
|
+
import yaml
|
|
15
|
+
import toml
|
|
16
|
+
|
|
17
|
+
from dsi.backends.filesystem import Filesystem
|
|
18
|
+
|
|
19
|
+
class SqlAlchemy(Filesystem):
    """
    SQLAlchemy ORM back-end.

    Wraps a SQLAlchemy engine; the caller supplies a declarative ``base``
    whose tables are created on construction, then reads/writes mapped
    objects through it.
    """

    filename = "sqlite:///fs.db"
    engine = None

    def __init__(self, filename, base, echo=True):
        """
        filename: SQLAlchemy database URL (e.g. "sqlite:///fs.db").
        base: declarative base class; every table registered on its
              metadata is created immediately (no-op for existing tables).
        echo: forwarded to create_engine; True (the previously hard-coded
              value, kept as the default) logs all emitted SQL.
        """
        super().__init__(filename)  # consistent with the Gufi backend
        self.filename = filename
        self.engine = create_engine(filename, echo=echo)
        base.metadata.create_all(self.engine)

    def put_artifacts(self, artifact_list):
        """Persist a list of mapped ORM objects in a single transaction."""
        with Session(self.engine) as session:
            session.add_all(artifact_list)
            session.commit()

    def query(self, stmt):
        """Execute a select statement and return matching ORM objects as a list."""
        with Session(self.engine) as session:
            return list(session.scalars(stmt))

    def close(self):
        """Dispose of the engine's connection pool, if an engine was created."""
        if self.engine:
            self.engine.dispose()
|