dsi-workflow 1.2__tar.gz → 1.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/PKG-INFO +1 -1
- dsi_workflow-1.2.2/dsi/_version.py +1 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/backend.py +17 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/duckdb.py +65 -51
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/filesystem.py +10 -0
- dsi_workflow-1.2.2/dsi/backends/gufi.py +160 -0
- dsi_workflow-1.2.2/dsi/backends/hpss.py +168 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/ndp.py +107 -242
- dsi_workflow-1.2.2/dsi/backends/oceans11.py +1277 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/osti.py +307 -161
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/sqlalchemy.py +7 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/sqlite.py +63 -55
- dsi_workflow-1.2.2/dsi/backends/tests/test_gufi.py +20 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/tests/test_ndp.py +15 -18
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/webserver.py +21 -1
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/cli.py +235 -112
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/core.py +55 -51
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/dsi.py +191 -171
- dsi_workflow-1.2.2/dsi/dsifederated.py +583 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/collection_reader.py +0 -1
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/dashboard.py +2 -2
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/file_reader.py +39 -84
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/ml_emulator.py +1 -3
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/tests/test_file_reader.py +7 -6
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/sync.py +453 -182
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/tests/test_core.py +6 -2
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/tests/test_dsi.py +0 -2
- dsi_workflow-1.2.2/dsi/utils/federated/federate_datasets.py +646 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/federation_utils.py +30 -6
- dsi_workflow-1.2.2/dsi/utils/hpc_kerberos.py +251 -0
- dsi_workflow-1.2.2/dsi/utils/s3_utils.py +248 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi_workflow.egg-info/PKG-INFO +1 -1
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi_workflow.egg-info/SOURCES.txt +3 -2
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/pyproject.toml +1 -4
- dsi_workflow-1.2/dsi/_version.py +0 -1
- dsi_workflow-1.2/dsi/backends/gufi.py +0 -88
- dsi_workflow-1.2/dsi/backends/hpss.py +0 -156
- dsi_workflow-1.2/dsi/backends/tests/test_gufi.py +0 -16
- dsi_workflow-1.2/dsi/dsifederated.py +0 -293
- dsi_workflow-1.2/dsi/utils/federated/federate_datasets.py +0 -380
- dsi_workflow-1.2/dsi/utils/launch_streamlit.bat +0 -9
- dsi_workflow-1.2/dsi/utils/launch_streamlit.sh +0 -27
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/README.rst +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/__init__.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/tests/test_duckdb.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/tests/test_hpss.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/tests/test_sqlalchemy.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/tests/test_sqlite.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/env.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/file_writer.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/metadata.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/plugin.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/plugin_models.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/tests/test_collection_reader.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/tests/test_env.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/tests/test_file_writer.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/__init__.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/dsi_utils.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/federated/__init__.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/git_utils.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/rsync_utils.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/web_utils.py +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi_workflow.egg-info/dependency_links.txt +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi_workflow.egg-info/entry_points.txt +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi_workflow.egg-info/requires.txt +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi_workflow.egg-info/top_level.txt +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/requirements.txt +0 -0
- {dsi_workflow-1.2 → dsi_workflow-1.2.2}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dsi-workflow
|
|
3
|
-
Version: 1.2
|
|
3
|
+
Version: 1.2.2
|
|
4
4
|
Summary: A Data Science Infrastructure Project
|
|
5
5
|
Author-email: Jesus Pulido <pulido@lanl.gov>, James Ahrens <ahrens@lanl.gov>, Divya Banesh <dbanesh@lanl.gov>, Hugh Greenberg <hng@lanl.gov>, Pascal Grosset <pascalgrosset@lanl.gov>, Vedant Iyer <iyer@lanl.gov>, Benjamin Sims <bsims@lanl.gov>, Terece Turton <tlturton@lanl.gov>
|
|
6
6
|
License-Expression: BSD-3-Clause
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.2"
|
|
@@ -5,6 +5,7 @@ class Backend(ABC):
|
|
|
5
5
|
def __init__(self, data_source, **kwargs) -> None:
|
|
6
6
|
pass
|
|
7
7
|
|
|
8
|
+
# Can raise NotImplementedError for a read-only backend
|
|
8
9
|
@abstractmethod
|
|
9
10
|
def ingest_artifacts(self, artifacts, **kwargs) -> None:
|
|
10
11
|
pass
|
|
@@ -13,6 +14,14 @@ class Backend(ABC):
|
|
|
13
14
|
def query_artifacts(self, query, **kwargs):
|
|
14
15
|
pass
|
|
15
16
|
|
|
17
|
+
@abstractmethod
|
|
18
|
+
def get_table(self, table_name, **kwargs):
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
@abstractmethod
|
|
22
|
+
def get_table_names(self,query):
|
|
23
|
+
pass
|
|
24
|
+
|
|
16
25
|
@abstractmethod
|
|
17
26
|
def notebook(self, **kwargs):
|
|
18
27
|
pass
|
|
@@ -21,6 +30,10 @@ class Backend(ABC):
|
|
|
21
30
|
def process_artifacts(self, **kwargs):
|
|
22
31
|
pass
|
|
23
32
|
|
|
33
|
+
@abstractmethod
|
|
34
|
+
def get_schema(self):
|
|
35
|
+
pass
|
|
36
|
+
|
|
24
37
|
@abstractmethod
|
|
25
38
|
def find(self, query_object, **kwargs):
|
|
26
39
|
pass
|
|
@@ -45,6 +58,10 @@ class Backend(ABC):
|
|
|
45
58
|
def list(self, **kwargs):
|
|
46
59
|
pass
|
|
47
60
|
|
|
61
|
+
@abstractmethod
|
|
62
|
+
def num_tables(self):
|
|
63
|
+
pass
|
|
64
|
+
|
|
48
65
|
@abstractmethod
|
|
49
66
|
def display(self, table_name, **kwargs):
|
|
50
67
|
pass
|
|
@@ -42,6 +42,7 @@ class DuckDB(Filesystem):
|
|
|
42
42
|
DuckDB Filesystem Backend to which a user can ingest/process data, generate a Jupyter notebook, and find occurrences of a search term
|
|
43
43
|
"""
|
|
44
44
|
runTable = False
|
|
45
|
+
read_only = False
|
|
45
46
|
|
|
46
47
|
def __init__(self, filename, **kwargs):
|
|
47
48
|
"""
|
|
@@ -65,7 +66,7 @@ class DuckDB(Filesystem):
|
|
|
65
66
|
`input_list` : list
|
|
66
67
|
A list of values to analyze for type compatibility.
|
|
67
68
|
|
|
68
|
-
|
|
69
|
+
Return: str
|
|
69
70
|
A string representing the inferred DuckDB data type for the input list.
|
|
70
71
|
"""
|
|
71
72
|
DUCKDB_BIGINT_MIN = -9223372036854775808
|
|
@@ -281,7 +282,7 @@ class DuckDB(Filesystem):
|
|
|
281
282
|
else:
|
|
282
283
|
table_order = list(reversed(ordered_tables)) # ingest primary key tables first then children
|
|
283
284
|
|
|
284
|
-
if self.runTable:
|
|
285
|
+
if self.runTable and artifacts:
|
|
285
286
|
runTable_create = "CREATE TABLE IF NOT EXISTS runTable (run_id INTEGER PRIMARY KEY, run_timestamp TEXT UNIQUE);"
|
|
286
287
|
self.cur.execute(runTable_create)
|
|
287
288
|
|
|
@@ -328,27 +329,29 @@ class DuckDB(Filesystem):
|
|
|
328
329
|
|
|
329
330
|
self.ingest_table_helper(types, foreign_query)
|
|
330
331
|
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
try:
|
|
345
|
-
self.cur.executemany(str_query,rows)
|
|
346
|
-
except duckdb.Error as e:
|
|
347
|
-
self.cur.execute("ROLLBACK")
|
|
348
|
-
self.cur.execute("CHECKPOINT")
|
|
349
|
-
raise duckdb.Error(e)
|
|
332
|
+
# TODO: move this check to schema reader by allowing users to just create table without data
|
|
333
|
+
if not all(v == [""] for v in tableData.values()): # if table is just one row of empty strings, don't insert
|
|
334
|
+
col_names = ', '.join(types.properties.keys())
|
|
335
|
+
placeholders = ', '.join('?' * len(types.properties))
|
|
336
|
+
|
|
337
|
+
str_query = "INSERT INTO "
|
|
338
|
+
if self.runTable:
|
|
339
|
+
run_id = self.cur.execute("SELECT run_id FROM runTable ORDER BY run_id DESC LIMIT 1;").fetchone()[0]
|
|
340
|
+
str_query += "{} (run_id, {}) VALUES ({}, {});".format(str(types.name), col_names, run_id, placeholders)
|
|
341
|
+
else:
|
|
342
|
+
str_query += "{} ({}) VALUES ({});".format(str(types.name), col_names, placeholders)
|
|
343
|
+
if isVerbose:
|
|
344
|
+
print(str_query)
|
|
350
345
|
|
|
351
|
-
|
|
346
|
+
rows = zip(*types.properties.values())
|
|
347
|
+
try:
|
|
348
|
+
self.cur.executemany(str_query,rows)
|
|
349
|
+
except duckdb.Error as e:
|
|
350
|
+
self.cur.execute("ROLLBACK")
|
|
351
|
+
self.cur.execute("CHECKPOINT")
|
|
352
|
+
raise duckdb.Error(e)
|
|
353
|
+
|
|
354
|
+
self.types = types # This will only copy the last table from artifacts (collections input)
|
|
352
355
|
|
|
353
356
|
if "dsi_units" in artifacts.keys():
|
|
354
357
|
create_query = "CREATE TABLE IF NOT EXISTS dsi_units (table_name TEXT, column_name TEXT, unit TEXT)"
|
|
@@ -378,9 +381,13 @@ class DuckDB(Filesystem):
|
|
|
378
381
|
raise duckdb.Error(e)
|
|
379
382
|
|
|
380
383
|
|
|
381
|
-
def query_artifacts(self, query, isVerbose=False, dict_return = False):
|
|
384
|
+
def query_artifacts(self, query, isVerbose=False, dict_return = False, **kwargs):
|
|
382
385
|
"""
|
|
383
|
-
Executes a SQL query on the DuckDB backend
|
|
386
|
+
Executes a SQL query on the DuckDB backend.
|
|
387
|
+
|
|
388
|
+
Supports:
|
|
389
|
+
- SELECT / PRAGMA: returns DataFrame or OrderedDict depending on dict_return
|
|
390
|
+
- UPDATE / ALTER: executes command and returns None
|
|
384
391
|
|
|
385
392
|
`query` : str
|
|
386
393
|
Must be a SELECT or PRAGMA SQL query. Aggregate functions like COUNT are allowed.
|
|
@@ -393,12 +400,14 @@ class DuckDB(Filesystem):
|
|
|
393
400
|
If True, returns the result as an OrderedDict.
|
|
394
401
|
If False, returns the result as a pandas DataFrame.
|
|
395
402
|
|
|
396
|
-
|
|
403
|
+
Return : pandas.DataFrame or OrderedDict or None
|
|
404
|
+
- If `query` includes UPDATE or ALTER: returns nothing
|
|
397
405
|
- If `dict_return` is False: returns a DataFrame
|
|
398
406
|
- If `dict_return` is True: returns an OrderedDict
|
|
399
407
|
"""
|
|
400
408
|
data = None
|
|
401
|
-
|
|
409
|
+
command = query.strip().split(None, 1)[0].lower()
|
|
410
|
+
if command in {"select", "pragma"}:
|
|
402
411
|
try:
|
|
403
412
|
data = self.cur.execute(query).fetch_df()
|
|
404
413
|
if isVerbose:
|
|
@@ -412,19 +421,23 @@ class DuckDB(Filesystem):
|
|
|
412
421
|
return OrderedDict()
|
|
413
422
|
return pd.DataFrame()
|
|
414
423
|
raise
|
|
415
|
-
elif
|
|
424
|
+
elif command in {"update", "alter"}:
|
|
425
|
+
query_params = kwargs.pop("params", ())
|
|
416
426
|
try:
|
|
417
|
-
self.
|
|
418
|
-
self.
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
427
|
+
self.cur.execute("BEGIN TRANSACTION")
|
|
428
|
+
self.cur.execute(query, query_params)
|
|
429
|
+
self.cur.execute("COMMIT")
|
|
430
|
+
self.cur.execute("FORCE CHECKPOINT")
|
|
431
|
+
return None
|
|
432
|
+
except duckdb.Error:
|
|
433
|
+
try:
|
|
434
|
+
self.cur.execute("ROLLBACK")
|
|
435
|
+
self.cur.execute("FORCE CHECKPOINT")
|
|
436
|
+
except duckdb.Error:
|
|
437
|
+
pass
|
|
425
438
|
raise
|
|
426
439
|
else:
|
|
427
|
-
raise RuntimeError("Can only run SELECT or
|
|
440
|
+
raise RuntimeError("Can only run SELECT, PRAGMA, UPDATE, or ALTER queries on the data")
|
|
428
441
|
|
|
429
442
|
if dict_return:
|
|
430
443
|
tables = self.get_table_names(query)
|
|
@@ -448,7 +461,7 @@ class DuckDB(Filesystem):
|
|
|
448
461
|
If True, returns the result as an OrderedDict.
|
|
449
462
|
If False, returns the result as a pandas DataFrame.
|
|
450
463
|
|
|
451
|
-
|
|
464
|
+
Return : pandas.DataFrame or OrderedDict
|
|
452
465
|
- If `dict_return` is False: returns a DataFrame
|
|
453
466
|
- If `dict_return` is True: returns an OrderedDict
|
|
454
467
|
"""
|
|
@@ -461,7 +474,7 @@ class DuckDB(Filesystem):
|
|
|
461
474
|
`query` : str
|
|
462
475
|
A SQL query string, typically passed into `query_artifacts()`.
|
|
463
476
|
|
|
464
|
-
|
|
477
|
+
Return: list of str
|
|
465
478
|
List of table names referenced in the query.
|
|
466
479
|
"""
|
|
467
480
|
all_names = re.findall(r'FROM\s+["\']?([\w\-]+)["\']?|JOIN\s+["\']?([\w\-]+)["\']?', query, re.IGNORECASE)
|
|
@@ -472,7 +485,7 @@ class DuckDB(Filesystem):
|
|
|
472
485
|
"""
|
|
473
486
|
Returns the structural schema of this database in the form of CREATE TABLE statements.
|
|
474
487
|
|
|
475
|
-
|
|
488
|
+
Return: str
|
|
476
489
|
Each table's CREATE TABLE statement is concatenated into one large string.
|
|
477
490
|
"""
|
|
478
491
|
schema_stmts = self.query_artifacts(query="SELECT sql FROM duckdb_tables where sql NOT NULL ")
|
|
@@ -493,7 +506,7 @@ class DuckDB(Filesystem):
|
|
|
493
506
|
`only_units_relations` : bool, default=False
|
|
494
507
|
**USERS SHOULD IGNORE THIS FLAG.** Used internally by duckdb.py.
|
|
495
508
|
|
|
496
|
-
|
|
509
|
+
Return : OrderedDict
|
|
497
510
|
A nested OrderedDict containing all data from the DuckDB database.
|
|
498
511
|
"""
|
|
499
512
|
artifact = OrderedDict()
|
|
@@ -551,7 +564,7 @@ class DuckDB(Filesystem):
|
|
|
551
564
|
`query_object` : int, float, or str
|
|
552
565
|
The value to search for across all tables in the backend.
|
|
553
566
|
|
|
554
|
-
|
|
567
|
+
Return : list
|
|
555
568
|
A list of ValueObjects representing matches.
|
|
556
569
|
|
|
557
570
|
- Note: ValueObjects may vary in structure depending on whether the match occurred at the table, column, or cell level.
|
|
@@ -579,7 +592,7 @@ class DuckDB(Filesystem):
|
|
|
579
592
|
`query_object` : str
|
|
580
593
|
The string to search for in table names.
|
|
581
594
|
|
|
582
|
-
|
|
595
|
+
Return : list of ValueObjects
|
|
583
596
|
One ValueObject per matching table.
|
|
584
597
|
|
|
585
598
|
ValueObject Structure:
|
|
@@ -625,7 +638,7 @@ class DuckDB(Filesystem):
|
|
|
625
638
|
If True, `value` in the returned ValueObject will be the [min, max] of the matching numerical column.
|
|
626
639
|
If False, `value` in the returned ValueObject will be the full list of column data.
|
|
627
640
|
|
|
628
|
-
|
|
641
|
+
Return : List of ValueObjects if there is a match.
|
|
629
642
|
|
|
630
643
|
ValueObject Structure:
|
|
631
644
|
- t_name: table name (str)
|
|
@@ -683,10 +696,10 @@ class DuckDB(Filesystem):
|
|
|
683
696
|
The value to search for at the cell level, across all tables in the backend.
|
|
684
697
|
|
|
685
698
|
`row`: bool, optional, default=False
|
|
686
|
-
If True,
|
|
687
|
-
If False,
|
|
699
|
+
If True, certain fields in ValueObject will contain entire row's metadata/data
|
|
700
|
+
If False, certain fields in ValueObject will only contain the matching cell's metadata/data.
|
|
688
701
|
|
|
689
|
-
|
|
702
|
+
Return : List of ValueObjects if there is a match.
|
|
690
703
|
|
|
691
704
|
ValueObject Structure:
|
|
692
705
|
- t_name: table name (str)
|
|
@@ -768,7 +781,7 @@ class DuckDB(Filesystem):
|
|
|
768
781
|
`relation` : str
|
|
769
782
|
The operator and value to apply to the column. Ex: >4, <4, =4, >=4, <=4, ==4, !=4, (4,5), ~4, ~~4
|
|
770
783
|
|
|
771
|
-
|
|
784
|
+
Return : list of ValueObjects
|
|
772
785
|
One ValueObject per matching row in that first table.
|
|
773
786
|
|
|
774
787
|
ValueObject Structure:
|
|
@@ -918,9 +931,10 @@ class DuckDB(Filesystem):
|
|
|
918
931
|
Returns numerical metadata from tables in the first activated backend.
|
|
919
932
|
|
|
920
933
|
`table_name` : str, optional
|
|
921
|
-
If specified, only the numerical metadata for that table
|
|
934
|
+
If specified, only the numerical metadata for that table is returned as a Pandas DataFrame.
|
|
922
935
|
|
|
923
|
-
If None (default), metadata for
|
|
936
|
+
If None (default), names of all tables and metadata for each table is returned as a list.
|
|
937
|
+
[table_name_list, table1_df, table2_df, table3df ...]
|
|
924
938
|
"""
|
|
925
939
|
if table_name is None:
|
|
926
940
|
tableList = self.cur.execute("""
|
|
@@ -1088,7 +1102,7 @@ class DuckDB(Filesystem):
|
|
|
1088
1102
|
`relation_dict` : OrderedDict
|
|
1089
1103
|
An OrderedDict describing table relationships. Structured as the `dsi_relations` object with primary and foreign keys.
|
|
1090
1104
|
|
|
1091
|
-
|
|
1105
|
+
Return: tuple of (has_cycle, ordered_tables)
|
|
1092
1106
|
- has_cycle (bool): True if a circular dependency is detected.
|
|
1093
1107
|
- ordered_tables (list or None): Ordered list of tables if no cycle is found; None if a circular dependency exists.
|
|
1094
1108
|
"""
|
|
@@ -1147,6 +1161,6 @@ class DuckDB(Filesystem):
|
|
|
1147
1161
|
"""
|
|
1148
1162
|
Closes the DuckDB database's connection.
|
|
1149
1163
|
|
|
1150
|
-
|
|
1164
|
+
Return: None
|
|
1151
1165
|
"""
|
|
1152
1166
|
self.con.close()
|
|
@@ -6,6 +6,7 @@ class Filesystem(Backend, ABC):
|
|
|
6
6
|
def __init__(self, filename, **kwargs) -> None:
|
|
7
7
|
pass
|
|
8
8
|
|
|
9
|
+
# Can raise NotImplementedError for a read-only backend
|
|
9
10
|
@abstractmethod
|
|
10
11
|
def ingest_artifacts(self, artifacts, **kwargs) -> None:
|
|
11
12
|
pass
|
|
@@ -18,6 +19,10 @@ class Filesystem(Backend, ABC):
|
|
|
18
19
|
def get_table(self, table_name, **kwargs):
|
|
19
20
|
pass
|
|
20
21
|
|
|
22
|
+
@abstractmethod
|
|
23
|
+
def get_table_names(self,query):
|
|
24
|
+
pass
|
|
25
|
+
|
|
21
26
|
@abstractmethod
|
|
22
27
|
def notebook(self, **kwargs):
|
|
23
28
|
pass
|
|
@@ -25,6 +30,10 @@ class Filesystem(Backend, ABC):
|
|
|
25
30
|
@abstractmethod
|
|
26
31
|
def process_artifacts(self, **kwargs):
|
|
27
32
|
pass
|
|
33
|
+
|
|
34
|
+
@abstractmethod
|
|
35
|
+
def get_schema(self):
|
|
36
|
+
pass
|
|
28
37
|
|
|
29
38
|
@abstractmethod
|
|
30
39
|
def find(self, query_object, **kwargs):
|
|
@@ -62,6 +71,7 @@ class Filesystem(Backend, ABC):
|
|
|
62
71
|
def summary(self, table_name, **kwargs):
|
|
63
72
|
pass
|
|
64
73
|
|
|
74
|
+
# Can raise NotImplementedError for a read-only backend
|
|
65
75
|
@abstractmethod
|
|
66
76
|
def overwrite_table(self, table_name, collection, **kwargs):
|
|
67
77
|
pass
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import sqlite3
|
|
2
|
+
|
|
3
|
+
# Holds table name and data properties
|
|
4
|
+
from dsi.backends.filesystem import Filesystem
|
|
5
|
+
|
|
6
|
+
class DataType:
|
|
7
|
+
name = "DEFAULT"
|
|
8
|
+
properties = {}
|
|
9
|
+
units = {}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Gufi(Filesystem):
|
|
13
|
+
'''
|
|
14
|
+
GUFI Datastore
|
|
15
|
+
'''
|
|
16
|
+
gufi_prefix = ""
|
|
17
|
+
gufi_index = ""
|
|
18
|
+
dsi_table_name = ""
|
|
19
|
+
dsi_columns = ""
|
|
20
|
+
gufi_columns = ""
|
|
21
|
+
collection_name = ""
|
|
22
|
+
dsi_db = None
|
|
23
|
+
isVerbose = False
|
|
24
|
+
|
|
25
|
+
def __init__(self, gufi_prefix, gufi_index, dsi_table_name, dsi_columns, gufi_columns,
|
|
26
|
+
collection_name, dsi_db, verbose=False):
|
|
27
|
+
'''
|
|
28
|
+
`gufi_prefix`: the directory where GUFI is installed
|
|
29
|
+
|
|
30
|
+
`gufi_index`: the directory where GUFI's indexes are
|
|
31
|
+
|
|
32
|
+
`dsi_table_name`: the DSI table name that has the UUID for each file as a column
|
|
33
|
+
|
|
34
|
+
`dsi_columns`: the DSI table columns that should be included in the join with GUFI
|
|
35
|
+
|
|
36
|
+
`gufi_columns`: the GUFI columns that should be included in the join with DSI
|
|
37
|
+
|
|
38
|
+
`collection_name`: the name that identifies the collection that the DSI database belongs to
|
|
39
|
+
|
|
40
|
+
`dsi_db`: the path to the dsi db
|
|
41
|
+
|
|
42
|
+
`verbose`: print debugging statements or not
|
|
43
|
+
'''
|
|
44
|
+
|
|
45
|
+
# prefix is the prefix to the GUFI installation
|
|
46
|
+
self.gufi_prefix = gufi_prefix
|
|
47
|
+
self.gufi_index = gufi_index
|
|
48
|
+
self.dsi_table_name = dsi_table_name
|
|
49
|
+
self.dsi_columns = dsi_columns
|
|
50
|
+
self.gufi_columns = gufi_columns
|
|
51
|
+
self.collection_name = collection_name
|
|
52
|
+
self.dsi_db = dsi_db
|
|
53
|
+
self.isVerbose = verbose
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Query GUFI and DSI db ˘
|
|
57
|
+
def query_artifacts(self, query):
|
|
58
|
+
'''
|
|
59
|
+
Retrieves GUFI's metadata joined with a dsi database
|
|
60
|
+
query: an sql query into the dsi_entries table
|
|
61
|
+
'''
|
|
62
|
+
|
|
63
|
+
try:
|
|
64
|
+
resout = self._run_gufi_query(query)
|
|
65
|
+
if self.isVerbose:
|
|
66
|
+
print(resout)
|
|
67
|
+
|
|
68
|
+
return resout
|
|
69
|
+
except Exception:
|
|
70
|
+
print("Error running GUFI query")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def ingest_artifacts(self, query):
|
|
74
|
+
raise NotImplementedError("Cannot ingest data with the GUFI backend")
|
|
75
|
+
|
|
76
|
+
# Runs the gufi query command
|
|
77
|
+
def _run_gufi_query(self, sqlstring):
|
|
78
|
+
'''
|
|
79
|
+
Calls the qufy_query command to run the sql query
|
|
80
|
+
sqlstring: the query into the dsi_entries table
|
|
81
|
+
'''
|
|
82
|
+
|
|
83
|
+
metadata = []
|
|
84
|
+
with sqlite3.connect(":memory:") as con:
|
|
85
|
+
con.enable_load_extension(True)
|
|
86
|
+
|
|
87
|
+
# alternatively you can load the extension using an API call:
|
|
88
|
+
con.load_extension(self.gufi_prefix + "/lib/gufi_vt")
|
|
89
|
+
|
|
90
|
+
# disable extension loading again
|
|
91
|
+
con.enable_load_extension(False)
|
|
92
|
+
|
|
93
|
+
dsi_column_names = ",".join(self.dsi_columns)
|
|
94
|
+
gufi_column_names = ",".join((["rpath(sname, sroll, name) AS fullpath"] + self.gufi_columns[1:]
|
|
95
|
+
if self.gufi_columns[0] == "fullpath" else self.gufi_columns))
|
|
96
|
+
query=f"""
|
|
97
|
+
CREATE VIRTUAL TABLE uview USING gufi_vt(
|
|
98
|
+
threads=64,
|
|
99
|
+
E="SELECT {gufi_column_names}, dsi_uuid(xattr_name, xattr_value) AS uuid FROM vrxpentries WHERE uuid IS NOT NULL;",
|
|
100
|
+
index='{self.gufi_index}',
|
|
101
|
+
plugin='gufi_plugin_operations:{self.gufi_prefix}/lib/libdsi_querying.so'
|
|
102
|
+
);
|
|
103
|
+
"""
|
|
104
|
+
|
|
105
|
+
# example from SQLite wiki
|
|
106
|
+
cur = con.execute(query)
|
|
107
|
+
query=f"""
|
|
108
|
+
ATTACH '{self.dsi_db}' AS
|
|
109
|
+
{self.collection_name};
|
|
110
|
+
"""
|
|
111
|
+
cur = con.execute(query)
|
|
112
|
+
|
|
113
|
+
if sqlstring is None or len(sqlstring) == 0:
|
|
114
|
+
query = f"""
|
|
115
|
+
SELECT uview.*, {dsi_column_names} FROM uview JOIN ATLAS_UUID.zarr_metadata_uuid ON uview.uuid == ATLAS_UUID.zarr_metadata_uuid.uuid;
|
|
116
|
+
"""
|
|
117
|
+
else:
|
|
118
|
+
query = sqlstring
|
|
119
|
+
|
|
120
|
+
print("query: ", query)
|
|
121
|
+
cur.execute(query)
|
|
122
|
+
rows = cur.fetchall()
|
|
123
|
+
for row in rows:
|
|
124
|
+
print(row)
|
|
125
|
+
metadata.append(row)
|
|
126
|
+
|
|
127
|
+
return metadata
|
|
128
|
+
|
|
129
|
+
def close(self):
|
|
130
|
+
raise NotImplementedError("No connection to close for the GUFI backend")
|
|
131
|
+
def display(self):
|
|
132
|
+
raise NotImplementedError("Cannot display data with the GUFI backend")
|
|
133
|
+
def find(self):
|
|
134
|
+
raise NotImplementedError("Cannot find data with the GUFI backend")
|
|
135
|
+
def find_cell(self):
|
|
136
|
+
raise NotImplementedError("Cannot find cell data with the GUFI backend")
|
|
137
|
+
def find_column(self):
|
|
138
|
+
raise NotImplementedError("Cannot find column data with the GUFI backend")
|
|
139
|
+
def find_relation(self):
|
|
140
|
+
raise NotImplementedError("Cannot find data on a relation with the GUFI backend")
|
|
141
|
+
def find_table(self):
|
|
142
|
+
raise NotImplementedError("Cannot find table data with the GUFI backend")
|
|
143
|
+
def get_schema(self):
|
|
144
|
+
pass
|
|
145
|
+
def get_table(self):
|
|
146
|
+
raise NotImplementedError("Cannot get table data with the GUFI backend")
|
|
147
|
+
def get_table_names(self):
|
|
148
|
+
raise NotImplementedError("Cannot get table names with the GUFI backend")
|
|
149
|
+
def list(self):
|
|
150
|
+
raise NotImplementedError("Cannot list tables with the GUFI backend")
|
|
151
|
+
def notebook(self):
|
|
152
|
+
raise NotImplementedError("Cannot create notebook with the GUFI backend")
|
|
153
|
+
def num_tables(self):
|
|
154
|
+
raise NotImplementedError("Cannot count tables with the GUFI backend")
|
|
155
|
+
def overwrite_table(self):
|
|
156
|
+
raise NotImplementedError("Cannot overwrite table with the GUFI backend")
|
|
157
|
+
def process_artifacts(self):
|
|
158
|
+
raise NotImplementedError("Cannot process artifacts with the GUFI backend")
|
|
159
|
+
def summary(self):
|
|
160
|
+
raise NotImplementedError("Cannot summarize data with the GUFI backend")
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import subprocess
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
from dsi.backends.backend import Backend
|
|
6
|
+
from collections import OrderedDict
|
|
7
|
+
|
|
8
|
+
# HPSS backend class
|
|
9
|
+
class HPSS(Backend):
|
|
10
|
+
read_only = False
|
|
11
|
+
|
|
12
|
+
def __init__(self, hpss_files):
|
|
13
|
+
"""
|
|
14
|
+
Initializes an HPSS backend
|
|
15
|
+
|
|
16
|
+
`hpss_files`: list with hpss file paths
|
|
17
|
+
|
|
18
|
+
"""
|
|
19
|
+
self.hpss_info = OrderedDict()
|
|
20
|
+
for hpss_file in hpss_files.keys():
|
|
21
|
+
self.hpss_info[hpss_file] = {
|
|
22
|
+
'local_path': hpss_files[hpss_file],
|
|
23
|
+
'hpss_hash': None,
|
|
24
|
+
}
|
|
25
|
+
stdout, stderr, _ = self.run_hsi("hashlist", [hpss_file])
|
|
26
|
+
hpss_hash = self.parse_hpss_hash(stdout, stderr)
|
|
27
|
+
self.hpss_info[hpss_file]['hpss_hash'] = hpss_hash
|
|
28
|
+
|
|
29
|
+
def create_hpss_hash(self, hpss_file) -> str:
|
|
30
|
+
"""
|
|
31
|
+
Creates and HPSS hash
|
|
32
|
+
"""
|
|
33
|
+
stdout, stderr, returncode = self.run_hsi("hashcreate", [hpss_file])
|
|
34
|
+
if returncode != 0:
|
|
35
|
+
print(stderr)
|
|
36
|
+
return None
|
|
37
|
+
|
|
38
|
+
hash = self.parse_hpss_hash(stdout, stderr)
|
|
39
|
+
return hash
|
|
40
|
+
|
|
41
|
+
def put(self, local_file, hpss_dest) -> bool:
|
|
42
|
+
"""
|
|
43
|
+
Puts a local file on HPSS
|
|
44
|
+
"""
|
|
45
|
+
cwd = os.getcwd()
|
|
46
|
+
new_dir = None
|
|
47
|
+
file_to_put = local_file
|
|
48
|
+
if '/' in local_file:
|
|
49
|
+
new_dir = '/'.join(local_file.split('/')[:-1])
|
|
50
|
+
os.chdir(new_dir)
|
|
51
|
+
file_to_put = local_file.split('/')[-1]
|
|
52
|
+
|
|
53
|
+
stdout, stderr, returncode = self.run_hsi("put", [file_to_put])
|
|
54
|
+
if new_dir is not None:
|
|
55
|
+
os.chdir(cwd)
|
|
56
|
+
|
|
57
|
+
if returncode == 0:
|
|
58
|
+
self.create_hpss_hash(file_to_put)
|
|
59
|
+
return True
|
|
60
|
+
|
|
61
|
+
return False
|
|
62
|
+
|
|
63
|
+
def get(self, hpss_file, tmp_dir) -> bool:
|
|
64
|
+
"""
|
|
65
|
+
Gets an HPSS file and puts it in the tmp_dir
|
|
66
|
+
"""
|
|
67
|
+
cwd = os.getcwd()
|
|
68
|
+
try:
|
|
69
|
+
os.chdir(tmp_dir)
|
|
70
|
+
except Exception:
|
|
71
|
+
print("Error changing to temp dir: %s" % tmp_dir)
|
|
72
|
+
return False
|
|
73
|
+
|
|
74
|
+
stdout, stderr, returncode = self.run_hsi("get", hpss_file)
|
|
75
|
+
try:
|
|
76
|
+
os.chdir(cwd)
|
|
77
|
+
except Exception:
|
|
78
|
+
print("Error changing to dir: %s" % cwd)
|
|
79
|
+
|
|
80
|
+
if returncode == 0:
|
|
81
|
+
return True
|
|
82
|
+
|
|
83
|
+
return False
|
|
84
|
+
|
|
85
|
+
def parse_hpss_hash(self, stdout, stderr) -> str:
|
|
86
|
+
"""
|
|
87
|
+
Parses the result of an HPSS hash command
|
|
88
|
+
"""
|
|
89
|
+
output = stdout + stderr
|
|
90
|
+
hash = None
|
|
91
|
+
for line in output.splitlines():
|
|
92
|
+
if " md5" not in line:
|
|
93
|
+
continue
|
|
94
|
+
|
|
95
|
+
line = line.strip()
|
|
96
|
+
matches = re.search(r'(\S+)\s+(\S+)\s+(\S+).*', line)
|
|
97
|
+
if not matches:
|
|
98
|
+
continue
|
|
99
|
+
|
|
100
|
+
if len(matches.groups()) == 3:
|
|
101
|
+
hash = matches.group(1)
|
|
102
|
+
break
|
|
103
|
+
|
|
104
|
+
return hash
|
|
105
|
+
|
|
106
|
+
def run_hsi(self, subcmd, arg_list):
|
|
107
|
+
"""
|
|
108
|
+
Runs hsi with the supplied subcmd and arguments
|
|
109
|
+
"""
|
|
110
|
+
command = ["hsi", subcmd]
|
|
111
|
+
command += arg_list
|
|
112
|
+
|
|
113
|
+
stdout = ""
|
|
114
|
+
stderr = ""
|
|
115
|
+
returncode = -1
|
|
116
|
+
try:
|
|
117
|
+
process = subprocess.Popen(command, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding='latin-1')
|
|
118
|
+
|
|
119
|
+
stdout, stderr = process.communicate()
|
|
120
|
+
returncode = process.communicate()
|
|
121
|
+
except FileNotFoundError as e:
|
|
122
|
+
print("Error running hsi: %s" % e)
|
|
123
|
+
|
|
124
|
+
return stdout, stderr, returncode
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def ingest_artifacts(self, collection, isVerbose=False):
|
|
128
|
+
for f in self.hpss_info.keys():
|
|
129
|
+
self.put(self.hpss_info[f]['local_path'], f)
|
|
130
|
+
|
|
131
|
+
def query_artifacts(self, query, **kwargs):
|
|
132
|
+
pass
|
|
133
|
+
|
|
134
|
+
def notebook(self, **kwargs):
|
|
135
|
+
pass
|
|
136
|
+
|
|
137
|
+
def process_artifacts(self, **kwargs):
|
|
138
|
+
pass
|
|
139
|
+
|
|
140
|
+
def get_schema(self):
|
|
141
|
+
pass
|
|
142
|
+
|
|
143
|
+
def find(self, query_object, **kwargs):
|
|
144
|
+
pass
|
|
145
|
+
|
|
146
|
+
def find_table(self, query_object, **kwargs):
|
|
147
|
+
pass
|
|
148
|
+
|
|
149
|
+
def find_column(self, query_object, **kwargs):
|
|
150
|
+
pass
|
|
151
|
+
|
|
152
|
+
def find_cell(self, query_object, **kwargs):
|
|
153
|
+
pass
|
|
154
|
+
|
|
155
|
+
def find_relation(self, column_name, relation, **kwargs):
|
|
156
|
+
pass
|
|
157
|
+
|
|
158
|
+
def list(self, **kwargs):
|
|
159
|
+
pass
|
|
160
|
+
|
|
161
|
+
def display(self, table_name, **kwargs):
|
|
162
|
+
pass
|
|
163
|
+
|
|
164
|
+
def summary(self, table_name, **kwargs):
|
|
165
|
+
pass
|
|
166
|
+
|
|
167
|
+
def close(self):
|
|
168
|
+
pass
|