dsi-workflow 1.2__tar.gz → 1.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/PKG-INFO +1 -1
  2. dsi_workflow-1.2.2/dsi/_version.py +1 -0
  3. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/backend.py +17 -0
  4. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/duckdb.py +65 -51
  5. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/filesystem.py +10 -0
  6. dsi_workflow-1.2.2/dsi/backends/gufi.py +160 -0
  7. dsi_workflow-1.2.2/dsi/backends/hpss.py +168 -0
  8. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/ndp.py +107 -242
  9. dsi_workflow-1.2.2/dsi/backends/oceans11.py +1277 -0
  10. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/osti.py +307 -161
  11. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/sqlalchemy.py +7 -0
  12. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/sqlite.py +63 -55
  13. dsi_workflow-1.2.2/dsi/backends/tests/test_gufi.py +20 -0
  14. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/tests/test_ndp.py +15 -18
  15. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/webserver.py +21 -1
  16. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/cli.py +235 -112
  17. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/core.py +55 -51
  18. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/dsi.py +191 -171
  19. dsi_workflow-1.2.2/dsi/dsifederated.py +583 -0
  20. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/collection_reader.py +0 -1
  21. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/dashboard.py +2 -2
  22. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/file_reader.py +39 -84
  23. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/ml_emulator.py +1 -3
  24. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/tests/test_file_reader.py +7 -6
  25. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/sync.py +453 -182
  26. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/tests/test_core.py +6 -2
  27. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/tests/test_dsi.py +0 -2
  28. dsi_workflow-1.2.2/dsi/utils/federated/federate_datasets.py +646 -0
  29. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/federation_utils.py +30 -6
  30. dsi_workflow-1.2.2/dsi/utils/hpc_kerberos.py +251 -0
  31. dsi_workflow-1.2.2/dsi/utils/s3_utils.py +248 -0
  32. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi_workflow.egg-info/PKG-INFO +1 -1
  33. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi_workflow.egg-info/SOURCES.txt +3 -2
  34. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/pyproject.toml +1 -4
  35. dsi_workflow-1.2/dsi/_version.py +0 -1
  36. dsi_workflow-1.2/dsi/backends/gufi.py +0 -88
  37. dsi_workflow-1.2/dsi/backends/hpss.py +0 -156
  38. dsi_workflow-1.2/dsi/backends/tests/test_gufi.py +0 -16
  39. dsi_workflow-1.2/dsi/dsifederated.py +0 -293
  40. dsi_workflow-1.2/dsi/utils/federated/federate_datasets.py +0 -380
  41. dsi_workflow-1.2/dsi/utils/launch_streamlit.bat +0 -9
  42. dsi_workflow-1.2/dsi/utils/launch_streamlit.sh +0 -27
  43. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/README.rst +0 -0
  44. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/__init__.py +0 -0
  45. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/tests/test_duckdb.py +0 -0
  46. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/tests/test_hpss.py +0 -0
  47. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/tests/test_sqlalchemy.py +0 -0
  48. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/backends/tests/test_sqlite.py +0 -0
  49. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/env.py +0 -0
  50. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/file_writer.py +0 -0
  51. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/metadata.py +0 -0
  52. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/plugin.py +0 -0
  53. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/plugin_models.py +0 -0
  54. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/tests/test_collection_reader.py +0 -0
  55. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/tests/test_env.py +0 -0
  56. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/plugins/tests/test_file_writer.py +0 -0
  57. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/__init__.py +0 -0
  58. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/dsi_utils.py +0 -0
  59. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/federated/__init__.py +0 -0
  60. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/git_utils.py +0 -0
  61. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/rsync_utils.py +0 -0
  62. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi/utils/web_utils.py +0 -0
  63. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi_workflow.egg-info/dependency_links.txt +0 -0
  64. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi_workflow.egg-info/entry_points.txt +0 -0
  65. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi_workflow.egg-info/requires.txt +0 -0
  66. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/dsi_workflow.egg-info/top_level.txt +0 -0
  67. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/requirements.txt +0 -0
  68. {dsi_workflow-1.2 → dsi_workflow-1.2.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dsi-workflow
3
- Version: 1.2
3
+ Version: 1.2.2
4
4
  Summary: A Data Science Infrastructure Project
5
5
  Author-email: Jesus Pulido <pulido@lanl.gov>, James Ahrens <ahrens@lanl.gov>, Divya Banesh <dbanesh@lanl.gov>, Hugh Greenberg <hng@lanl.gov>, Pascal Grosset <pascalgrosset@lanl.gov>, Vedant Iyer <iyer@lanl.gov>, Benjamin Sims <bsims@lanl.gov>, Terece Turton <tlturton@lanl.gov>
6
6
  License-Expression: BSD-3-Clause
@@ -0,0 +1 @@
1
+ __version__ = "1.2.2"
@@ -5,6 +5,7 @@ class Backend(ABC):
5
5
  def __init__(self, data_source, **kwargs) -> None:
6
6
  pass
7
7
 
8
+ # Can raise NotImplementedError for a read-only backend
8
9
  @abstractmethod
9
10
  def ingest_artifacts(self, artifacts, **kwargs) -> None:
10
11
  pass
@@ -13,6 +14,14 @@ class Backend(ABC):
13
14
  def query_artifacts(self, query, **kwargs):
14
15
  pass
15
16
 
17
+ @abstractmethod
18
+ def get_table(self, table_name, **kwargs):
19
+ pass
20
+
21
+ @abstractmethod
22
+ def get_table_names(self,query):
23
+ pass
24
+
16
25
  @abstractmethod
17
26
  def notebook(self, **kwargs):
18
27
  pass
@@ -21,6 +30,10 @@ class Backend(ABC):
21
30
  def process_artifacts(self, **kwargs):
22
31
  pass
23
32
 
33
+ @abstractmethod
34
+ def get_schema(self):
35
+ pass
36
+
24
37
  @abstractmethod
25
38
  def find(self, query_object, **kwargs):
26
39
  pass
@@ -45,6 +58,10 @@ class Backend(ABC):
45
58
  def list(self, **kwargs):
46
59
  pass
47
60
 
61
+ @abstractmethod
62
+ def num_tables(self):
63
+ pass
64
+
48
65
  @abstractmethod
49
66
  def display(self, table_name, **kwargs):
50
67
  pass
@@ -42,6 +42,7 @@ class DuckDB(Filesystem):
42
42
  DuckDB Filesystem Backend to which a user can ingest/process data, generate a Jupyter notebook, and find occurrences of a search term
43
43
  """
44
44
  runTable = False
45
+ read_only = False
45
46
 
46
47
  def __init__(self, filename, **kwargs):
47
48
  """
@@ -65,7 +66,7 @@ class DuckDB(Filesystem):
65
66
  `input_list` : list
66
67
  A list of values to analyze for type compatibility.
67
68
 
68
- `return`: str
69
+ Return: str
69
70
  A string representing the inferred DuckDB data type for the input list.
70
71
  """
71
72
  DUCKDB_BIGINT_MIN = -9223372036854775808
@@ -281,7 +282,7 @@ class DuckDB(Filesystem):
281
282
  else:
282
283
  table_order = list(reversed(ordered_tables)) # ingest primary key tables first then children
283
284
 
284
- if self.runTable:
285
+ if self.runTable and artifacts:
285
286
  runTable_create = "CREATE TABLE IF NOT EXISTS runTable (run_id INTEGER PRIMARY KEY, run_timestamp TEXT UNIQUE);"
286
287
  self.cur.execute(runTable_create)
287
288
 
@@ -328,27 +329,29 @@ class DuckDB(Filesystem):
328
329
 
329
330
  self.ingest_table_helper(types, foreign_query)
330
331
 
331
- col_names = ', '.join(types.properties.keys())
332
- placeholders = ', '.join('?' * len(types.properties))
333
-
334
- str_query = "INSERT INTO "
335
- if self.runTable:
336
- run_id = self.cur.execute("SELECT run_id FROM runTable ORDER BY run_id DESC LIMIT 1;").fetchone()[0]
337
- str_query += "{} (run_id, {}) VALUES ({}, {});".format(str(types.name), col_names, run_id, placeholders)
338
- else:
339
- str_query += "{} ({}) VALUES ({});".format(str(types.name), col_names, placeholders)
340
- if isVerbose:
341
- print(str_query)
342
-
343
- rows = zip(*types.properties.values())
344
- try:
345
- self.cur.executemany(str_query,rows)
346
- except duckdb.Error as e:
347
- self.cur.execute("ROLLBACK")
348
- self.cur.execute("CHECKPOINT")
349
- raise duckdb.Error(e)
332
+ # TODO: move this check to schema reader by allowing users to just create table without data
333
+ if not all(v == [""] for v in tableData.values()): # if table is just one row of empty strings, don't insert
334
+ col_names = ', '.join(types.properties.keys())
335
+ placeholders = ', '.join('?' * len(types.properties))
336
+
337
+ str_query = "INSERT INTO "
338
+ if self.runTable:
339
+ run_id = self.cur.execute("SELECT run_id FROM runTable ORDER BY run_id DESC LIMIT 1;").fetchone()[0]
340
+ str_query += "{} (run_id, {}) VALUES ({}, {});".format(str(types.name), col_names, run_id, placeholders)
341
+ else:
342
+ str_query += "{} ({}) VALUES ({});".format(str(types.name), col_names, placeholders)
343
+ if isVerbose:
344
+ print(str_query)
350
345
 
351
- self.types = types #This will only copy the last table from artifacts (collections input)
346
+ rows = zip(*types.properties.values())
347
+ try:
348
+ self.cur.executemany(str_query,rows)
349
+ except duckdb.Error as e:
350
+ self.cur.execute("ROLLBACK")
351
+ self.cur.execute("CHECKPOINT")
352
+ raise duckdb.Error(e)
353
+
354
+ self.types = types # This will only copy the last table from artifacts (collections input)
352
355
 
353
356
  if "dsi_units" in artifacts.keys():
354
357
  create_query = "CREATE TABLE IF NOT EXISTS dsi_units (table_name TEXT, column_name TEXT, unit TEXT)"
@@ -378,9 +381,13 @@ class DuckDB(Filesystem):
378
381
  raise duckdb.Error(e)
379
382
 
380
383
 
381
- def query_artifacts(self, query, isVerbose=False, dict_return = False):
384
+ def query_artifacts(self, query, isVerbose=False, dict_return = False, **kwargs):
382
385
  """
383
- Executes a SQL query on the DuckDB backend and returns the result in the specified format dependent on `dict_return`
386
+ Executes a SQL query on the DuckDB backend.
387
+
388
+ Supports:
389
+ - SELECT / PRAGMA: returns DataFrame or OrderedDict depending on dict_return
390
+ - UPDATE / ALTER: executes command and returns None
384
391
 
385
392
  `query` : str
386
393
  Must be a SELECT or PRAGMA SQL query. Aggregate functions like COUNT are allowed.
@@ -393,12 +400,14 @@ class DuckDB(Filesystem):
393
400
  If True, returns the result as an OrderedDict.
394
401
  If False, returns the result as a pandas DataFrame.
395
402
 
396
- `return` : pandas.DataFrame or OrderedDict
403
+ Return : pandas.DataFrame or OrderedDict or None
404
+ - If `query` includes UPDATE or ALTER: returns nothing
397
405
  - If `dict_return` is False: returns a DataFrame
398
406
  - If `dict_return` is True: returns an OrderedDict
399
407
  """
400
408
  data = None
401
- if query[:6].lower() == "select" or query[:6].lower() == "pragma":
409
+ command = query.strip().split(None, 1)[0].lower()
410
+ if command in {"select", "pragma"}:
402
411
  try:
403
412
  data = self.cur.execute(query).fetch_df()
404
413
  if isVerbose:
@@ -412,19 +421,23 @@ class DuckDB(Filesystem):
412
421
  return OrderedDict()
413
422
  return pd.DataFrame()
414
423
  raise
415
- elif "filesystem" in query.lower() and "drop" in query.lower(): #remove filesystem passthrough in future
424
+ elif command in {"update", "alter"}:
425
+ query_params = kwargs.pop("params", ())
416
426
  try:
417
- self.con.execute(query)
418
- self.con.commit()
419
- except Exception as e:
420
- message = str(e)
421
- if "Table" in message and "does not exist" in message:
422
- table_name = message[message.find("Table"):message.find("Did you mean")-2]
423
- print(f"WARNING: {table_name} in this database")
424
- return
427
+ self.cur.execute("BEGIN TRANSACTION")
428
+ self.cur.execute(query, query_params)
429
+ self.cur.execute("COMMIT")
430
+ self.cur.execute("FORCE CHECKPOINT")
431
+ return None
432
+ except duckdb.Error:
433
+ try:
434
+ self.cur.execute("ROLLBACK")
435
+ self.cur.execute("FORCE CHECKPOINT")
436
+ except duckdb.Error:
437
+ pass
425
438
  raise
426
439
  else:
427
- raise RuntimeError("Can only run SELECT or PRAGMA queries on the data")
440
+ raise RuntimeError("Can only run SELECT, PRAGMA, UPDATE, or ALTER queries on the data")
428
441
 
429
442
  if dict_return:
430
443
  tables = self.get_table_names(query)
@@ -448,7 +461,7 @@ class DuckDB(Filesystem):
448
461
  If True, returns the result as an OrderedDict.
449
462
  If False, returns the result as a pandas DataFrame.
450
463
 
451
- `return` : pandas.DataFrame or OrderedDict
464
+ Return : pandas.DataFrame or OrderedDict
452
465
  - If `dict_return` is False: returns a DataFrame
453
466
  - If `dict_return` is True: returns an OrderedDict
454
467
  """
@@ -461,7 +474,7 @@ class DuckDB(Filesystem):
461
474
  `query` : str
462
475
  A SQL query string, typically passed into `query_artifacts()`.
463
476
 
464
- `return`: list of str
477
+ Return: list of str
465
478
  List of table names referenced in the query.
466
479
  """
467
480
  all_names = re.findall(r'FROM\s+["\']?([\w\-]+)["\']?|JOIN\s+["\']?([\w\-]+)["\']?', query, re.IGNORECASE)
@@ -472,7 +485,7 @@ class DuckDB(Filesystem):
472
485
  """
473
486
  Returns the structural schema of this database in the form of CREATE TABLE statements.
474
487
 
475
- `return`: str
488
+ Return: str
476
489
  Each table's CREATE TABLE statement is concatenated into one large string.
477
490
  """
478
491
  schema_stmts = self.query_artifacts(query="SELECT sql FROM duckdb_tables where sql NOT NULL ")
@@ -493,7 +506,7 @@ class DuckDB(Filesystem):
493
506
  `only_units_relations` : bool, default=False
494
507
  **USERS SHOULD IGNORE THIS FLAG.** Used internally by duckdb.py.
495
508
 
496
- `return` : OrderedDict
509
+ Return : OrderedDict
497
510
  A nested OrderedDict containing all data from the DuckDB database.
498
511
  """
499
512
  artifact = OrderedDict()
@@ -551,7 +564,7 @@ class DuckDB(Filesystem):
551
564
  `query_object` : int, float, or str
552
565
  The value to search for across all tables in the backend.
553
566
 
554
- `return` : list
567
+ Return : list
555
568
  A list of ValueObjects representing matches.
556
569
 
557
570
  - Note: ValueObjects may vary in structure depending on whether the match occurred at the table, column, or cell level.
@@ -579,7 +592,7 @@ class DuckDB(Filesystem):
579
592
  `query_object` : str
580
593
  The string to search for in table names.
581
594
 
582
- `return` : list of ValueObjects
595
+ Return : list of ValueObjects
583
596
  One ValueObject per matching table.
584
597
 
585
598
  ValueObject Structure:
@@ -625,7 +638,7 @@ class DuckDB(Filesystem):
625
638
  If True, `value` in the returned ValueObject will be the [min, max] of the matching numerical column.
626
639
  If False, `value` in the returned ValueObject will be the full list of column data.
627
640
 
628
- `return` : List of ValueObjects if there is a match.
641
+ Return : List of ValueObjects if there is a match.
629
642
 
630
643
  ValueObject Structure:
631
644
  - t_name: table name (str)
@@ -683,10 +696,10 @@ class DuckDB(Filesystem):
683
696
  The value to search for at the cell level, across all tables in the backend.
684
697
 
685
698
  `row`: bool, optional, default=False
686
- If True, `value` in the returned ValueObject will be the entire row where a cell matched.
687
- If False, `value` in the returned ValueObject will only be the matching cell value.
699
+ If True, certain fields in ValueObject will contain entire row's metadata/data
700
+ If False, certain fields in ValueObject will only contain the matching cell's metadata/data.
688
701
 
689
- `return` : List of ValueObjects if there is a match.
702
+ Return : List of ValueObjects if there is a match.
690
703
 
691
704
  ValueObject Structure:
692
705
  - t_name: table name (str)
@@ -768,7 +781,7 @@ class DuckDB(Filesystem):
768
781
  `relation` : str
769
782
  The operator and value to apply to the column. Ex: >4, <4, =4, >=4, <=4, ==4, !=4, (4,5), ~4, ~~4
770
783
 
771
- `return` : list of ValueObjects
784
+ Return : list of ValueObjects
772
785
  One ValueObject per matching row in that first table.
773
786
 
774
787
  ValueObject Structure:
@@ -918,9 +931,10 @@ class DuckDB(Filesystem):
918
931
  Returns numerical metadata from tables in the first activated backend.
919
932
 
920
933
  `table_name` : str, optional
921
- If specified, only the numerical metadata for that table will be returned as a Pandas DataFrame.
934
+ If specified, only the numerical metadata for that table is returned as a Pandas DataFrame.
922
935
 
923
- If None (default), metadata for all available tables is returned as a list of Pandas DataFrames.
936
+ If None (default), names of all tables and metadata for each table is returned as a list.
937
+ [table_name_list, table1_df, table2_df, table3df ...]
924
938
  """
925
939
  if table_name is None:
926
940
  tableList = self.cur.execute("""
@@ -1088,7 +1102,7 @@ class DuckDB(Filesystem):
1088
1102
  `relation_dict` : OrderedDict
1089
1103
  An OrderedDict describing table relationships. Structured as the `dsi_relations` object with primary and foreign keys.
1090
1104
 
1091
- `return`: tuple of (has_cycle, ordered_tables)
1105
+ Return: tuple of (has_cycle, ordered_tables)
1092
1106
  - has_cycle (bool): True if a circular dependency is detected.
1093
1107
  - ordered_tables (list or None): Ordered list of tables if no cycle is found; None if a circular dependency exists.
1094
1108
  """
@@ -1147,6 +1161,6 @@ class DuckDB(Filesystem):
1147
1161
  """
1148
1162
  Closes the DuckDB database's connection.
1149
1163
 
1150
- `return`: None
1164
+ Return: None
1151
1165
  """
1152
1166
  self.con.close()
@@ -6,6 +6,7 @@ class Filesystem(Backend, ABC):
6
6
  def __init__(self, filename, **kwargs) -> None:
7
7
  pass
8
8
 
9
+ # Can raise NotImplementedError for a read-only backend
9
10
  @abstractmethod
10
11
  def ingest_artifacts(self, artifacts, **kwargs) -> None:
11
12
  pass
@@ -18,6 +19,10 @@ class Filesystem(Backend, ABC):
18
19
  def get_table(self, table_name, **kwargs):
19
20
  pass
20
21
 
22
+ @abstractmethod
23
+ def get_table_names(self,query):
24
+ pass
25
+
21
26
  @abstractmethod
22
27
  def notebook(self, **kwargs):
23
28
  pass
@@ -25,6 +30,10 @@ class Filesystem(Backend, ABC):
25
30
  @abstractmethod
26
31
  def process_artifacts(self, **kwargs):
27
32
  pass
33
+
34
+ @abstractmethod
35
+ def get_schema(self):
36
+ pass
28
37
 
29
38
  @abstractmethod
30
39
  def find(self, query_object, **kwargs):
@@ -62,6 +71,7 @@ class Filesystem(Backend, ABC):
62
71
  def summary(self, table_name, **kwargs):
63
72
  pass
64
73
 
74
+ # Can raise NotImplementedError for a read-only backend
65
75
  @abstractmethod
66
76
  def overwrite_table(self, table_name, collection, **kwargs):
67
77
  pass
@@ -0,0 +1,160 @@
1
+ import sqlite3
2
+
3
+ # Holds table name and data properties
4
+ from dsi.backends.filesystem import Filesystem
5
+
6
class DataType:
    '''
    Holds a table name together with its data properties and units.

    `name`: name of the table, "DEFAULT" until set

    `properties`: dict of the table's data

    `units`: dict of the table's units

    The class-level values below are kept as defaults for code that reads them
    straight off the class. Each instance receives its own dicts in __init__.
    '''
    name = "DEFAULT"
    properties = {}
    units = {}

    def __init__(self):
        # A dict defined at class level is one object shared by every instance,
        # so filling `properties` on one DataType would silently show up on all
        # others. Rebinding fresh dicts per instance prevents that leakage.
        self.properties = {}
        self.units = {}
10
+
11
+
12
class Gufi(Filesystem):
    '''
    GUFI Datastore

    Read-only backend: it loads the `gufi_vt` SQLite extension, exposes GUFI's
    index as a virtual table and joins it with a table of an attached DSI database.
    '''
    gufi_prefix = ""
    gufi_index = ""
    dsi_table_name = ""
    dsi_columns = ""
    gufi_columns = ""
    collection_name = ""
    dsi_db = None
    isVerbose = False

    def __init__(self, gufi_prefix, gufi_index, dsi_table_name, dsi_columns, gufi_columns,
                 collection_name, dsi_db, verbose=False):
        '''
        `gufi_prefix`: the directory where GUFI is installed

        `gufi_index`: the directory where GUFI's indexes are

        `dsi_table_name`: the DSI table name that has the UUID for each file as a column

        `dsi_columns`: the DSI table columns that should be included in the join with GUFI

        `gufi_columns`: the GUFI columns that should be included in the join with DSI

        `collection_name`: the name that identifies the collection that the DSI database belongs to

        `dsi_db`: the path to the dsi db

        `verbose`: print debugging statements or not
        '''
        self.gufi_prefix = gufi_prefix
        self.gufi_index = gufi_index
        self.dsi_table_name = dsi_table_name
        self.dsi_columns = dsi_columns
        self.gufi_columns = gufi_columns
        self.collection_name = collection_name
        self.dsi_db = dsi_db
        self.isVerbose = verbose

    def query_artifacts(self, query, **kwargs):
        '''
        Retrieves GUFI's metadata joined with a dsi database

        `query`: an sql query to run against the joined data. If None or empty,
        the default join between the GUFI view and the DSI table is used.

        Extra keyword arguments are accepted and ignored so this method can be
        called the same way as on other backends.

        Return: list of result rows, or None if the query failed (the error is printed)
        '''
        try:
            resout = self._run_gufi_query(query)
        except Exception as e:
            # Best-effort by design: report the failure instead of raising,
            # but keep the cause visible so it can actually be diagnosed.
            print("Error running GUFI query: %s" % e)
            return None

        if self.isVerbose:
            print(resout)
        return resout

    def ingest_artifacts(self, query, **kwargs):
        '''
        Not supported: GUFI is a read-only backend.
        '''
        raise NotImplementedError("Cannot ingest data with the GUFI backend")

    def _run_gufi_query(self, sqlstring):
        '''
        Builds the GUFI virtual table, attaches the DSI database and runs the sql query

        `sqlstring`: the query to run. If None or empty, the GUFI view is joined
        with `collection_name`.`dsi_table_name` on the uuid column.

        Return: list of result rows
        '''
        # "fullpath" is not a stored column; when requested first it is computed with rpath().
        gufi_columns = list(self.gufi_columns)
        if gufi_columns and gufi_columns[0] == "fullpath":
            gufi_columns[0] = "rpath(sname, sroll, name) AS fullpath"
        # Appending the uuid expression to the list keeps the SQL valid even
        # when no GUFI columns were requested.
        gufi_select = ", ".join(gufi_columns + ["dsi_uuid(xattr_name, xattr_value) AS uuid"])
        dsi_select = ", ".join(["uview.*"] + list(self.dsi_columns))
        # The database is attached under collection_name below, so the default
        # join has to address the table through that same name.
        dsi_table = f"{self.collection_name}.{self.dsi_table_name}"

        con = sqlite3.connect(":memory:")
        try:
            # `with con` only commits or rolls back; the connection itself is
            # closed in the finally clause.
            with con:
                # Extension loading is enabled only for the duration of the load.
                con.enable_load_extension(True)
                con.load_extension(self.gufi_prefix + "/lib/gufi_vt")
                con.enable_load_extension(False)

                con.execute(f"""
                    CREATE VIRTUAL TABLE uview USING gufi_vt(
                        threads=64,
                        E="SELECT {gufi_select} FROM vrxpentries WHERE uuid IS NOT NULL;",
                        index='{self.gufi_index}',
                        plugin='gufi_plugin_operations:{self.gufi_prefix}/lib/libdsi_querying.so'
                    );
                """)

                # The file name is bound as a parameter so paths containing
                # quotes cannot break the statement. The schema name cannot be bound.
                con.execute(f"ATTACH DATABASE ? AS {self.collection_name};", (str(self.dsi_db),))

                if sqlstring:
                    query = sqlstring
                else:
                    query = (f"SELECT {dsi_select} FROM uview "
                             f"JOIN {dsi_table} ON uview.uuid == {dsi_table}.uuid;")

                if self.isVerbose:
                    print("query: ", query)

                return list(con.execute(query).fetchall())
        finally:
            con.close()

    # The methods below are required by the Filesystem interface but have no
    # meaning for GUFI. They accept any arguments so that a call made through
    # the generic interface reaches the NotImplementedError instead of failing
    # earlier with a TypeError about the argument count.

    def close(self, *args, **kwargs):
        raise NotImplementedError("No connection to close for the GUFI backend")

    def display(self, *args, **kwargs):
        raise NotImplementedError("Cannot display data with the GUFI backend")

    def find(self, *args, **kwargs):
        raise NotImplementedError("Cannot find data with the GUFI backend")

    def find_cell(self, *args, **kwargs):
        raise NotImplementedError("Cannot find cell data with the GUFI backend")

    def find_column(self, *args, **kwargs):
        raise NotImplementedError("Cannot find column data with the GUFI backend")

    def find_relation(self, *args, **kwargs):
        raise NotImplementedError("Cannot find data on a relation with the GUFI backend")

    def find_table(self, *args, **kwargs):
        raise NotImplementedError("Cannot find table data with the GUFI backend")

    def get_schema(self, *args, **kwargs):
        # Unlike the other stubs this one does not raise; it returns None.
        return None

    def get_table(self, *args, **kwargs):
        raise NotImplementedError("Cannot get table data with the GUFI backend")

    def get_table_names(self, *args, **kwargs):
        raise NotImplementedError("Cannot get table names with the GUFI backend")

    def list(self, *args, **kwargs):
        raise NotImplementedError("Cannot list tables with the GUFI backend")

    def notebook(self, *args, **kwargs):
        raise NotImplementedError("Cannot create notebook with the GUFI backend")

    def num_tables(self, *args, **kwargs):
        raise NotImplementedError("Cannot count tables with the GUFI backend")

    def overwrite_table(self, *args, **kwargs):
        raise NotImplementedError("Cannot overwrite table with the GUFI backend")

    def process_artifacts(self, *args, **kwargs):
        raise NotImplementedError("Cannot process artifacts with the GUFI backend")

    def summary(self, *args, **kwargs):
        raise NotImplementedError("Cannot summarize data with the GUFI backend")
@@ -0,0 +1,168 @@
1
+ import re
2
+ import subprocess
3
+ import os
4
+
5
+ from dsi.backends.backend import Backend
6
+ from collections import OrderedDict
7
+
8
+ # HPSS backend class
9
class HPSS(Backend):
    """
    HPSS backend that transfers files by running the `hsi` command.
    """
    read_only = False

    def __init__(self, hpss_files):
        """
        Initializes an HPSS backend

        `hpss_files`: dict mapping each HPSS file path to its local file path

        For every HPSS path, `hsi hashlist` is run and the parsed hash (None when
        no hash line is found) is stored next to the local path in `self.hpss_info`.
        """
        self.hpss_info = OrderedDict()
        for hpss_file, local_path in hpss_files.items():
            stdout, stderr, _ = self.run_hsi("hashlist", [hpss_file])
            self.hpss_info[hpss_file] = {
                'local_path': local_path,
                'hpss_hash': self.parse_hpss_hash(stdout, stderr),
            }

    def create_hpss_hash(self, hpss_file) -> str:
        """
        Creates an HPSS hash

        `hpss_file`: path passed to `hsi hashcreate`

        Return: the parsed hash, or None if hsi failed (stderr is printed)
        or no hash line was found in its output
        """
        stdout, stderr, returncode = self.run_hsi("hashcreate", [hpss_file])
        if returncode != 0:
            print(stderr)
            return None

        return self.parse_hpss_hash(stdout, stderr)

    def put(self, local_file, hpss_dest) -> bool:
        """
        Puts a local file on HPSS

        `local_file`: path of the local file. hsi is run from the file's directory
        and given only the file name.

        `hpss_dest`: NOTE(review): currently unused, so the file is stored under
        its own name. Confirm the intended hsi destination syntax before wiring it in.

        Return: True if hsi exited with status 0, otherwise False
        """
        local_dir, file_to_put = os.path.split(local_file)

        # hsi runs in the file's directory via `cwd`, so the working directory
        # of this process is never changed and needs no restoring.
        _, _, returncode = self.run_hsi("put", [file_to_put], cwd=local_dir or None)
        if returncode != 0:
            return False

        self.create_hpss_hash(file_to_put)
        return True

    def get(self, hpss_file, tmp_dir) -> bool:
        """
        Gets an HPSS file and puts it in the tmp_dir

        `hpss_file`: path of the file on HPSS

        `tmp_dir`: existing local directory that hsi is run from

        Return: True if hsi exited with status 0, otherwise False
        """
        if not tmp_dir or not os.path.isdir(tmp_dir):
            print("Error changing to temp dir: %s" % tmp_dir)
            return False

        # The path is wrapped in a list: a bare string would be split into
        # single characters when appended to the command.
        _, _, returncode = self.run_hsi("get", [hpss_file], cwd=tmp_dir)
        return returncode == 0

    def parse_hpss_hash(self, stdout, stderr) -> str:
        """
        Parses the result of an HPSS hash command

        `stdout`, `stderr`: text output of hsi; both are searched

        Return: the first whitespace-separated field of the first line that
        contains " md5" and has at least three fields, or None if there is none
        """
        for line in (stdout + stderr).splitlines():
            if " md5" not in line:
                continue

            fields = line.split()
            if len(fields) >= 3:
                return fields[0]

        return None

    def run_hsi(self, subcmd, arg_list, cwd=None):
        """
        Runs hsi with the supplied subcmd and arguments

        `subcmd`: hsi sub-command, e.g. "put", "get", "hashlist"

        `arg_list`: list of arguments for the sub-command. A single string is
        treated as one argument.

        `cwd`: optional directory to run hsi from

        Return: tuple of (stdout, stderr, returncode). If hsi (or `cwd`) cannot
        be found, the error is printed and ("", "", -1) is returned.
        """
        if isinstance(arg_list, str):
            arg_list = [arg_list]
        command = ["hsi", subcmd, *arg_list]

        try:
            completed = subprocess.run(command, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE, text=True, encoding='latin-1', cwd=cwd)
        except FileNotFoundError as e:
            print("Error running hsi: %s" % e)
            return "", "", -1

        # The exit status is the integer returncode. Callers compare it with 0.
        return completed.stdout, completed.stderr, completed.returncode

    def ingest_artifacts(self, collection, isVerbose=False):
        """
        Puts every local file registered at construction on HPSS

        `collection` and `isVerbose` are not used by this backend.
        """
        for hpss_file, info in self.hpss_info.items():
            self.put(info['local_path'], hpss_file)

    # The remaining methods are required by the Backend interface.
    # They perform no work for HPSS and return None.

    def query_artifacts(self, query, **kwargs):
        pass

    def get_table(self, table_name, **kwargs):
        pass

    def get_table_names(self, query):
        pass

    def notebook(self, **kwargs):
        pass

    def process_artifacts(self, **kwargs):
        pass

    def get_schema(self):
        pass

    def find(self, query_object, **kwargs):
        pass

    def find_table(self, query_object, **kwargs):
        pass

    def find_column(self, query_object, **kwargs):
        pass

    def find_cell(self, query_object, **kwargs):
        pass

    def find_relation(self, column_name, relation, **kwargs):
        pass

    def list(self, **kwargs):
        pass

    def num_tables(self):
        pass

    def display(self, table_name, **kwargs):
        pass

    def summary(self, table_name, **kwargs):
        pass

    def close(self):
        pass