deriva-ml 1.6.8__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/VERSION.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.6.8"
1
+ __version__ = "1.8.0"
deriva_ml/__init__.py CHANGED
@@ -2,6 +2,7 @@ __all__ = [
2
2
  "DerivaML",
3
3
  "DerivaMLException",
4
4
  "FileUploadState",
5
+ "FileSpec",
5
6
  "ExecutionConfiguration",
6
7
  "Workflow",
7
8
  "DatasetBag",
@@ -26,6 +27,7 @@ from .deriva_definitions import (
26
27
  BuiltinTypes,
27
28
  UploadState,
28
29
  FileUploadState,
30
+ FileSpec,
29
31
  RID,
30
32
  DerivaMLException,
31
33
  MLVocab,
@@ -1,14 +1,15 @@
1
+ """This module contains the definition of the DatabaseModel class. The role of this class is to provide an interface between the BDBag representation
2
+ of a dataset and a SQLite database in which the contents of the bag are stored.
3
+ """
1
4
  import logging
2
5
  import sqlite3
3
6
 
4
7
  from csv import reader
5
8
  from pathlib import Path
6
- from typing import Any, Generator, Optional
9
+ from typing import Any, Optional
7
10
  from urllib.parse import urlparse
8
11
 
9
- import pandas as pd
10
12
  from deriva.core.ermrest_model import Model
11
- from pydantic import validate_call
12
13
 
13
14
  from .deriva_definitions import ML_SCHEMA, MLVocab, RID, DerivaMLException
14
15
  from .dataset_aux_classes import DatasetVersion, DatasetMinid
@@ -16,7 +17,21 @@ from .deriva_model import DerivaModel
16
17
  from .dataset_bag import DatasetBag
17
18
 
18
19
 
19
- class DatabaseModel(DerivaModel):
20
+ class DatabaseModelMeta(type):
21
+ """Use metaclass to ensure that there is only one instance per path"""
22
+
23
+ _paths_loaded: dict[Path:"DatabaseModel"] = {}
24
+
25
+ def __call__(cls, *args, **kwargs):
26
+ logger = logging.getLogger("deriva_ml")
27
+ bag_path: Path = args[1]
28
+ if bag_path.as_posix() not in cls._paths_loaded:
29
+ logger.info(f"Loading {bag_path}")
30
+ cls._paths_loaded[bag_path] = super().__call__(*args, **kwargs)
31
+ return cls._paths_loaded[bag_path]
32
+
33
+
34
+ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
20
35
  """Read in the contents of a BDBag and create a local SQLite database.
21
36
 
22
37
  As part of its initialization, this routine will create a sqlite database that has the contents of all the tables
@@ -32,6 +47,9 @@ class DatabaseModel(DerivaModel):
32
47
  Because of nested datasets, it's possible that more than one dataset rid is in a bag, or that a dataset rid might
33
48
  appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
34
49
  into DatabaseModels, is kept in the class variable `_rid_map`.
50
+
51
+ Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked, and a new
52
+ SQLite instance is created for every new dataset version present.
35
53
 
36
54
  Attributes:
37
55
  bag_path (Path): path to the local copy of the BDBag
@@ -42,29 +60,9 @@ class DatabaseModel(DerivaModel):
42
60
  dataset_table (Table): the dataset table in the ERMRest model.
43
61
  """
44
62
 
45
- # Keep track of what databases we have loaded.
46
- _paths_loaded: dict[Path:"DatabaseModel"] = {}
47
-
48
63
  # Maintain a global map of RIDS to versions and databases.
49
64
  _rid_map: dict[RID, list[tuple[DatasetVersion, "DatabaseModel"]]] = {}
50
65
 
51
- @classmethod
52
- @validate_call
53
- def register(cls, minid: DatasetMinid, bag_path: Path):
54
- """Register a new minid in the list of local databases if it's new, otherwise, return an existing DatabaseModel.
55
-
56
- Args:
57
- minid: MINID to the databag that is to be loaded.
58
- bag_path: Path to the bag on the local filesystem./
59
-
60
- Returns:
61
- A DatabaseModel instance to the loaded bag.
62
- """
63
- o = cls._paths_loaded.get(bag_path.as_posix())
64
- if o:
65
- return o
66
- return cls(minid, bag_path)
67
-
68
66
  @staticmethod
69
67
  def rid_lookup(dataset_rid: RID) -> list[tuple[DatasetVersion, "DatabaseModel"]]:
70
68
  """Return a list of DatasetVersion/DatabaseModel instances corresponding to the given RID.
@@ -84,13 +82,12 @@ class DatabaseModel(DerivaModel):
84
82
  raise DerivaMLException(f"Dataset {dataset_rid} not found")
85
83
 
86
84
  def __init__(self, minid: DatasetMinid, bag_path: Path):
87
- """Create a new DatabaseModel. This should only be called via the static Register method
85
+ """Create a new DatabaseModel.
88
86
 
89
87
  Args:
90
88
  minid: Minid for the specified bag.
91
89
  bag_path: Path to the local copy of the BDBag.
92
90
  """
93
- DatabaseModel._paths_loaded[bag_path.as_posix()] = self
94
91
 
95
92
  self.bag_path = bag_path
96
93
  self.minid = minid
@@ -342,60 +339,6 @@ class DatabaseModel(DerivaModel):
342
339
  except KeyError:
343
340
  raise DerivaMLException(f'Table name "{table}" does not exist.')
344
341
 
345
- def get_table(self, table: str) -> Generator[tuple, None, None]:
346
- """Retrieve the contents of the specified table. If schema is not provided as part of the table name,
347
- the method will attempt to locate the schema for the table.
348
-
349
- Args:
350
- table: return: A generator that yields tuples of column values.
351
-
352
- Returns:
353
- A generator that yields tuples of column values.
354
-
355
- """
356
- table_name = self.normalize_table_name(table)
357
- result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
358
- while row := result.fetchone():
359
- yield row
360
-
361
- def get_table_as_dataframe(self, table: str) -> pd.DataFrame:
362
- """Retrieve the contents of the specified table as a dataframe.
363
-
364
-
365
- If schema is not provided as part of the table name,
366
- the method will attempt to locate the schema for the table.
367
-
368
- Args:
369
- table: Table to retrieve data from.
370
-
371
- Returns:
372
- A dataframe containing the contents of the specified table.
373
- """
374
- table_name = self.normalize_table_name(table)
375
- return pd.read_sql(f'SELECT * FROM "{table_name}"', con=self.dbase)
376
-
377
- def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
378
- """Retrieve the contents of the specified table as a dictionary.
379
-
380
- Args:
381
- table: Table to retrieve data from. f schema is not provided as part of the table name,
382
- the method will attempt to locate the schema for the table.
383
-
384
- Returns:
385
- A generator producing dictionaries containing the contents of the specified table as name/value pairs.
386
- """
387
- table_name = self.normalize_table_name(table)
388
- with self.dbase:
389
- col_names = [
390
- c[1]
391
- for c in self.dbase.execute(
392
- f'PRAGMA table_info("{table_name}")'
393
- ).fetchall()
394
- ]
395
- result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
396
- while row := result.fetchone():
397
- yield dict(zip(col_names, row))
398
-
399
342
  def delete_database(self):
400
343
  """
401
344