linkml-store 0.0.0__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
This release of linkml-store has been flagged as potentially problematic; details are available on the registry page.
- linkml_store/api/__init__.py +2 -2
- linkml_store/api/client.py +108 -7
- linkml_store/api/collection.py +221 -30
- linkml_store/api/config.py +97 -0
- linkml_store/api/database.py +207 -17
- linkml_store/api/queries.py +12 -1
- linkml_store/api/stores/chromadb/__init__.py +0 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +114 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +47 -14
- linkml_store/api/stores/duckdb/duckdb_database.py +35 -44
- linkml_store/api/stores/hdf5/__init__.py +0 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +86 -40
- linkml_store/api/stores/mongodb/mongodb_database.py +58 -67
- linkml_store/api/stores/solr/solr_collection.py +132 -0
- linkml_store/api/stores/solr/solr_database.py +82 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/cli.py +369 -0
- linkml_store/index/__init__.py +33 -0
- linkml_store/index/implementations/{llm_index.py → llm_indexer.py} +2 -2
- linkml_store/index/implementations/{simple_index.py → simple_indexer.py} +6 -3
- linkml_store/index/{index.py → indexer.py} +7 -4
- linkml_store/utils/format_utils.py +93 -0
- linkml_store/utils/object_utils.py +73 -0
- linkml_store/utils/sql_utils.py +46 -7
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.6.dist-info}/METADATA +17 -6
- linkml_store-0.1.6.dist-info/RECORD +41 -0
- linkml_store-0.1.6.dist-info/entry_points.txt +3 -0
- linkml_store/api/metadata.py +0 -5
- linkml_store-0.0.0.dist-info/RECORD +0 -29
- linkml_store-0.0.0.dist-info/entry_points.txt +0 -3
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.6.dist-info}/LICENSE +0 -0
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.6.dist-info}/WHEEL +0 -0
linkml_store/api/stores/duckdb/duckdb_collection.py:

```diff
@@ -1,22 +1,24 @@
-
+import logging
 from typing import Any, Dict, List, Optional, Union
 
 import sqlalchemy as sqla
 from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
-from sqlalchemy import Column, Table, delete, insert, text
+from sqlalchemy import Column, Table, delete, insert, inspect, text
 from sqlalchemy.sql.ddl import CreateTable
 
 from linkml_store.api import Collection
-from linkml_store.api.collection import OBJECT
+from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
+from linkml_store.api.queries import Query
 from linkml_store.api.stores.duckdb.mappings import TMAP
 from linkml_store.utils.sql_utils import facet_count_sql
 
+logger = logging.getLogger(__name__)
+
 
-@dataclass
 class DuckDBCollection(Collection):
     _table_created: bool = None
 
-    def
+    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         if not isinstance(objs, list):
             objs = [objs]
         if not objs:
@@ -26,6 +28,7 @@ class DuckDBCollection(Collection):
         cd = self.induce_class_definition_from_objects(objs)
         self._create_table(cd)
         table = self._sqla_table(cd)
+        logger.info(f"Inserting into: {self._alias} // T={table.name}")
         engine = self.parent.engine
         col_names = [c.name for c in table.columns]
         objs = [{k: obj.get(k, None) for k in col_names} for obj in objs]
@@ -51,39 +54,58 @@ class DuckDBCollection(Collection):
             conn.commit()
         return len(objs)
 
-    def delete_where(self, where: Optional[Dict[str, Any]] = None, **kwargs) -> int:
+    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
+        logger.info(f"Deleting from {self._target_class_name} where: {where}")
+        if where is None:
+            where = {}
         cd = self.class_definition()
+        if not cd:
+            logger.info(f"No class definition found for {self._target_class_name}, assuming not prepopulated")
+            return 0
         table = self._sqla_table(cd)
         engine = self.parent.engine
+        inspector = inspect(engine)
+        table_exists = table.name in inspector.get_table_names()
+        if not table_exists:
+            logger.info(f"Table {table.name} does not exist, assuming no data")
+            return 0
         with engine.connect() as conn:
             conditions = [table.c[k] == v for k, v in where.items()]
             stmt = delete(table).where(*conditions)
             stmt = stmt.compile(engine)
-            conn.execute(stmt)
+            result = conn.execute(stmt)
+            deleted_rows_count = result.rowcount
+            if deleted_rows_count == 0 and not missing_ok:
+                raise ValueError(f"No rows found for {where}")
             conn.commit()
-
+        return deleted_rows_count
 
-    def query_facets(
+    def query_facets(
+        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+    ) -> Dict[str, Dict[str, int]]:
         results = {}
         cd = self.class_definition()
         with self.parent.engine.connect() as conn:
             if not facet_columns:
                 facet_columns = list(self.class_definition().attributes.keys())
             for col in facet_columns:
+                logger.debug(f"Faceting on {col}")
                 if isinstance(col, tuple):
                     sd = SlotDefinition(name="PLACEHOLDER")
                 else:
                     sd = cd.attributes[col]
                 facet_query = self._create_query(where_clause=where)
                 facet_query_str = facet_count_sql(facet_query, col, multivalued=sd.multivalued)
+                logger.debug(f"Facet query: {facet_query_str}")
                 rows = list(conn.execute(text(facet_query_str)))
                 results[col] = rows
         return results
 
     def _sqla_table(self, cd: ClassDefinition) -> Table:
+        schema_view = self.parent.schema_view
         metadata_obj = sqla.MetaData()
         cols = []
-        for att in cd.
+        for att in schema_view.class_induced_slots(cd.name):
             typ = TMAP.get(att.range, sqla.String)
             if att.inlined:
                 typ = sqla.JSON
@@ -93,17 +115,28 @@ class DuckDBCollection(Collection):
                 typ = sqla.ARRAY(typ, dimensions=1)
             col = Column(att.name, typ)
             cols.append(col)
-        t = Table(self.
+        t = Table(self._alias, metadata_obj, *cols)
         return t
 
     def _create_table(self, cd: ClassDefinition):
-        if self._table_created:
+        if self._table_created or self.metadata.is_prepopulated:
+            logger.info(f"Already have table for: {cd.name}")
             return
+        query = Query(
+            from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self._alias}
+        )
+        qr = self.parent.query(query)
+        if qr.num_rows > 0:
+            logger.info(f"Table already exists for {cd.name}")
+            self._table_created = True
+            self.metadata.is_prepopulated = True
+            return
+        logger.info(f"Creating table for {cd.name}")
        t = self._sqla_table(cd)
        ct = CreateTable(t)
        ddl = str(ct.compile(self.parent.engine))
        with self.parent.engine.connect() as conn:
            conn.execute(text(ddl))
            conn.commit()
-
-
+        self._table_created = True
+        self.metadata.is_prepopulated = True
```
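The substantive change in `DuckDBCollection` is defensive behavior: `delete_where` now returns 0 when the table or class definition is missing (raising only when `missing_ok=False`), and `_create_table` consults `information_schema.tables` so it can skip DDL for prepopulated tables. A minimal standalone sketch of both probes, using plain SQLAlchemy against DuckDB; it assumes the `duckdb-engine` dialect is installed, and the `person` table is illustrative:

```python
# Sketch of the two existence probes the new DuckDB collection code relies on.
# Assumes: pip install duckdb duckdb-engine sqlalchemy
import sqlalchemy
from sqlalchemy import inspect, text

engine = sqlalchemy.create_engine("duckdb:///:memory:")
with engine.connect() as conn:
    conn.execute(text("CREATE TABLE person (id VARCHAR, age INTEGER)"))
    conn.commit()

    # Probe 1: delete_where() asks the SQLAlchemy inspector and returns 0
    # instead of raising if the table was never created.
    print("person" in inspect(conn).get_table_names())  # True

    # Probe 2: _create_table() issues a Query against information_schema.tables
    # and skips the CREATE TABLE when a row comes back.
    rows = conn.execute(
        text(
            "SELECT table_name FROM information_schema.tables "
            "WHERE table_type = 'BASE TABLE' AND table_name = 'person'"
        )
    ).fetchall()
    print(len(rows) > 0)  # True: creation would be skipped
```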
linkml_store/api/stores/duckdb/duckdb_database.py:

```diff
@@ -1,15 +1,14 @@
 import json
 import logging
-from dataclasses import dataclass
 from typing import Optional
 
 import pandas as pd
 import sqlalchemy
 from duckdb import DuckDBPyConnection
 from linkml_runtime import SchemaView
-from linkml_runtime.linkml_model import SlotDefinition
+from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
 from linkml_runtime.utils.schema_builder import SchemaBuilder
-from sqlalchemy import text
+from sqlalchemy import NullPool, text
 
 from linkml_store.api import Database
 from linkml_store.api.queries import Query, QueryResult
@@ -20,54 +19,36 @@ TYPE_MAP = {
     "VARCHAR": "string",
     "BIGINT": "integer",
     "BOOLEAN": "boolean",
+    "DATE": "date",
+    "DOUBLE": "float",
+    "INTEGER": "integer",
 }
 
 
 logger = logging.getLogger(__name__)
 
 
-def run_query(con: DuckDBPyConnection, query: Query, **kwargs):
-    """
-    Run a query and return the result.
-
-    >>> import duckdb
-    >>> con = duckdb.connect("db/mgi.db")
-    >>> query = Query(from_table="gaf_association", limit=5)
-    >>> result = run_query(con, query)
-    >>> print(result.num_rows)
-    532233
-
-    :param con:
-    :param query:
-    :return:
-    """
-    count_query_str = query_to_sql(query, count=True)
-    num_rows = con.execute(count_query_str).fetchall()[0][0]
-    logger.debug(f"num_rows: {num_rows}")
-    query_str = query_to_sql(query, **kwargs)
-    logger.debug(f"query_str: {query_str}")
-    rows = con.execute(query_str).fetchdf()
-    qr = QueryResult(query=query, num_rows=num_rows)
-    qr.set_rows(rows)
-    return qr
-
-
-@dataclass
 class DuckDBDatabase(Database):
     _connection: DuckDBPyConnection = None
     _engine: sqlalchemy.Engine = None
+    collection_class = DuckDBCollection
 
-    def
-    if
-
+    def __init__(self, handle: Optional[str] = None, **kwargs):
+        if handle is None:
+            handle = "duckdb:///:memory:"
+        super().__init__(handle=handle, **kwargs)
 
     @property
     def engine(self) -> sqlalchemy.Engine:
         if not self._engine:
             handle = self.handle
             if not handle.startswith("duckdb://") and not handle.startswith(":"):
-                handle = f"duckdb
-
+                handle = f"duckdb:///{handle}"
+            if ":memory:" not in handle:
+                # TODO: investigate this; duckdb appears to be prematurely caching
+                self._engine = sqlalchemy.create_engine(handle, poolclass=NullPool)
+            else:
+                self._engine = sqlalchemy.create_engine(handle)
         return self._engine
 
     def commit(self, **kwargs):
@@ -80,6 +61,14 @@ class DuckDBDatabase(Database):
     def query(self, query: Query, **kwargs) -> QueryResult:
         json_encoded_cols = []
         if query.from_table:
+            if not query.from_table.startswith("information_schema"):
+                meta_query = Query(
+                    from_table="information_schema.tables", where_clause={"table_name": query.from_table}
+                )
+                qr = self.query(meta_query)
+                if qr.num_rows == 0:
+                    logger.debug(f"Table {query.from_table} not created yet")
+                    return QueryResult(query=query, num_rows=0, rows=[])
         sv = self._schema_view
         if sv:
             cd = None
@@ -127,21 +116,15 @@ class DuckDBDatabase(Database):
             collection = DuckDBCollection(name=table_name, parent=self)
             self._collections[table_name] = collection
 
-    def create_collection(self, name: str, alias: Optional[str] = None, **kwargs) -> DuckDBCollection:
-        collection = DuckDBCollection(name=name, parent=self)
-        if not self._collections:
-            self._collections = {}
-        if not alias:
-            alias = name
-        self._collections[alias] = collection
-        return collection
-
     def induce_schema_view(self) -> SchemaView:
         # TODO: unify schema introspection
+        # TODO: handle case where schema is provided in advance
+        logger.info(f"Inducing schema view for {self.metadata.handle}")
         sb = SchemaBuilder()
         schema = sb.schema
         query = Query(from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE"})
         qr = self.query(query)
+        logger.info(f"Found {qr.num_rows} information_schema.tables // {qr.rows}")
         if qr.num_rows:
             table_names = [row["table_name"] for row in qr.rows]
             for tbl in table_names:
@@ -162,5 +145,13 @@ class DuckDBDatabase(Database):
                     row["column_name"], required=row["is_nullable"] == "NO", multivalued=multivalued, range=rng
                 )
                 sb.schema.classes[tbl_name].attributes[sd.name] = sd
+                logger.info(f"Introspected slot: {tbl_name}.{sd.name}: {sd.range}")
         sb.add_defaults()
+        for cls_name in schema.classes:
+            if cls_name in self.metadata.collections:
+                collection_metadata = self.metadata.collections[cls_name]
+                if collection_metadata.attributes:
+                    del schema.classes[cls_name]
+                    cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
+                    schema.classes[cls.name] = cls
         return SchemaView(schema)
```
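`DuckDBDatabase` loses its module-level `run_query` helper and `@dataclass` decoration, gains an explicit constructor, and changes its connection pooling. A hypothetical usage sketch, assuming the released package is installed; the file name is illustrative:

```python
# Constructor and engine behavior as shown in the diff above; untested sketch.
from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase

mem_db = DuckDBDatabase()               # handle defaults to "duckdb:///:memory:"
file_db = DuckDBDatabase("my_data.db")  # bare paths become "duckdb:///my_data.db"

# File-backed engines are created with poolclass=NullPool (per the TODO in the
# diff, to work around what looks like premature caching); in-memory handles
# keep the default pool, presumably because a NullPool would hand back a
# fresh, empty database on every connection.
print(file_db.engine.url)  # duckdb:///my_data.db
```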
linkml_store/api/stores/hdf5/hdf5_collection.py (new file):

```diff
@@ -0,0 +1,104 @@
+import json
+import logging
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import h5py
+
+from linkml_store.api import Collection
+from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
+from linkml_store.api.queries import Query, QueryResult
+
+logger = logging.getLogger(__name__)
+
+
+class HDF5Collection(Collection):
+
+    @property
+    def hdf5_group(self) -> h5py.Group:
+        return self.parent.file[self.name]
+
+    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+        if not isinstance(objs, list):
+            objs = [objs]
+
+        for obj in objs:
+            if "id" not in obj:
+                raise ValueError("Each object must have an 'id' field.")
+            obj_id = str(obj["id"])
+            for key, value in obj.items():
+                if key == "id":
+                    continue
+                if isinstance(value, (dict, list)):
+                    value = json.dumps(value)
+                self.hdf5_group.create_dataset(f"{obj_id}/{key}", data=value)
+
+    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
+        if not isinstance(objs, list):
+            objs = [objs]
+        count = 0
+        for obj in objs:
+            if "id" not in obj:
+                raise ValueError("Each object must have an 'id' field.")
+            obj_id = str(obj["id"])
+            if obj_id in self.hdf5_group:
+                del self.hdf5_group[obj_id]
+                count += 1
+        return count
+
+    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
+        logger.info(f"Deleting from {self._target_class_name} where: {where}")
+        if where is None:
+            where = {}
+        results = self.query(Query(where_clause=where)).rows
+        count = self.delete(results)
+        return count
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        results = []
+        for obj_id in self.hdf5_group:
+            obj = {"id": obj_id}
+            for key, value in self.hdf5_group[obj_id].items():
+                try:
+                    obj[key] = json.loads(value[()])
+                except json.JSONDecodeError:
+                    obj[key] = value[()]
+            if self._match_where_clause(obj, query.where_clause):
+                results.append(obj)
+
+        count = len(results)
+        if query.limit:
+            results = results[: query.limit]
+        return QueryResult(query=query, num_rows=count, rows=results)
+
+    def query_facets(
+        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+    ) -> Dict[str, List[Tuple[Any, int]]]:
+        results = {}
+        if not facet_columns:
+            facet_columns = list(self.class_definition().attributes.keys())
+
+        for col in facet_columns:
+            logger.debug(f"Faceting on {col}")
+            facet_counts = {}
+            for obj in self.query(Query(where_clause=where)).rows:
+                if col in obj:
+                    value = obj[col]
+                    if isinstance(value, list):
+                        for v in value:
+                            facet_counts[v] = facet_counts.get(v, 0) + 1
+                    else:
+                        facet_counts[value] = facet_counts.get(value, 0) + 1
+            facet_counts = sorted(facet_counts.items(), key=lambda x: x[1], reverse=True)[:facet_limit]
+            results[col] = facet_counts
+
+        return results
+
+    def _match_where_clause(self, obj: Dict[str, Any], where_clause: Optional[Dict[str, Any]]) -> bool:
+        if where_clause is None:
+            return True
+        for key, value in where_clause.items():
+            if key not in obj:
+                return False
+            if obj[key] != value:
+                return False
+        return True
```
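The new HDF5 backend maps a collection to an HDF5 group, each object to a subgroup named by its mandatory `id`, and each field to a dataset, JSON-encoding nested dicts and lists. A self-contained h5py sketch of that layout and the read-back logic; the file name and record are illustrative:

```python
# Round trip mirroring HDF5Collection.insert/query, using h5py directly.
import json
import h5py

with h5py.File("demo.h5", "a") as f:
    group = f.require_group("persons")          # one group per collection
    obj = {"id": "p1", "name": "Alice", "aliases": ["Ali", "A."]}
    obj_id = str(obj["id"])
    for key, value in obj.items():
        if key == "id":
            continue
        if isinstance(value, (dict, list)):
            value = json.dumps(value)           # nested values go in as JSON
        group.create_dataset(f"{obj_id}/{key}", data=value)

    # Read back the way HDF5Collection.query does: try JSON, fall back to raw.
    loaded = {"id": "p1"}
    for key, ds in group["p1"].items():
        raw = ds[()]                            # h5py 3.x returns str data as bytes
        try:
            loaded[key] = json.loads(raw)
        except json.JSONDecodeError:
            loaded[key] = raw
    print(loaded)  # e.g. {'id': 'p1', 'aliases': ['Ali', 'A.'], 'name': b'Alice'}
```

Note the asymmetry the sketch exposes: under h5py 3.x, plain strings come back as bytes unless decoded, while JSON-encoded fields round-trip cleanly.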
linkml_store/api/stores/hdf5/hdf5_database.py (new file):

```diff
@@ -0,0 +1,79 @@
+# hdf5_database.py
+
+import logging
+from typing import Optional
+
+import h5py
+from linkml_runtime import SchemaView
+from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
+from linkml_runtime.utils.schema_builder import SchemaBuilder
+
+from linkml_store.api import Database
+from linkml_store.api.queries import Query, QueryResult
+from linkml_store.api.stores.hdf5.hdf5_collection import HDF5Collection
+
+logger = logging.getLogger(__name__)
+
+
+class HDF5Database(Database):
+    _file: h5py.File = None
+    collection_class = HDF5Collection
+
+    def __init__(self, handle: Optional[str] = None, **kwargs):
+        if handle is None:
+            handle = "linkml_store.h5"
+        super().__init__(handle=handle, **kwargs)
+
+    @property
+    def file(self) -> h5py.File:
+        if self._file is None:
+            self._file = h5py.File(self.handle, "a")
+        return self._file
+
+    def commit(self, **kwargs):
+        self.file.flush()
+
+    def close(self, **kwargs):
+        if self._file:
+            self._file.close()
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        if query.from_table:
+            collection = self.get_collection(query.from_table)
+            return collection.query(query, **kwargs)
+
+    def init_collections(self):
+        if self._collections is None:
+            self._collections = {}
+
+        for collection_name in self.file:
+            if collection_name not in self._collections:
+                collection = HDF5Collection(name=collection_name, parent=self)
+                self._collections[collection_name] = collection
+
+    def induce_schema_view(self) -> SchemaView:
+        logger.info(f"Inducing schema view for {self.handle}")
+        sb = SchemaBuilder()
+        schema = sb.schema
+
+        for collection_name in self.file:
+            sb.add_class(collection_name)
+            hdf5_group = self.file[collection_name]
+            for field in hdf5_group:
+                if field == "_id":
+                    continue
+                sd = SlotDefinition(field)
+                if isinstance(hdf5_group[field][()], list):
+                    sd.multivalued = True
+                sb.schema.classes[collection_name].attributes[sd.name] = sd
+
+        sb.add_defaults()
+        for cls_name in schema.classes:
+            if cls_name in self.metadata.collections:
+                collection_metadata = self.metadata.collections[cls_name]
+                if collection_metadata.attributes:
+                    del schema.classes[cls_name]
+                    cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
+                    schema.classes[cls.name] = cls
+
+        return SchemaView(schema)
```
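`HDF5Database` wires the collection class into the database lifecycle: the file opens lazily in append mode, `commit` maps to a flush, and `induce_schema_view` rebuilds a LinkML schema from groups and their fields. A hypothetical end-to-end sketch, assuming the released package; `all_classes()` comes from linkml-runtime's `SchemaView`:

```python
# Lifecycle sketch for the new HDF5 backend; untested illustration.
from linkml_store.api.stores.hdf5.hdf5_database import HDF5Database

db = HDF5Database("people.h5")   # h5py.File is opened lazily in "a" mode
db.init_collections()            # one HDF5Collection per top-level group
sv = db.induce_schema_view()     # groups/fields introspected into a schema
print(list(sv.all_classes()))
db.commit()                      # flushes the underlying h5py.File
db.close()
```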
linkml_store/api/stores/mongodb/mongodb_collection.py:

```diff
@@ -1,56 +1,102 @@
-
-from typing import Any, Dict, List, Optional, Union
+import logging
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from linkml_runtime.linkml_model import SlotDefinition
+from pymongo.collection import Collection as MongoCollection
 
 from linkml_store.api import Collection
-from linkml_store.api.collection import OBJECT
+from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
+from linkml_store.api.queries import Query, QueryResult
+
+logger = logging.getLogger(__name__)
 
 
-@dataclass
 class MongoDBCollection(Collection):
-    """
-    A wrapper around a MongoDB collection
-    """
 
-
-
-
-
-
-        cd = self.class_definition()
-        if not cd:
-            cd = self.induce_class_definition_from_objects(objs)
-        collection = self.parent.database[self.name]
-        collection.insert_many(objs)
+    @property
+    def mongo_collection(self) -> MongoCollection:
+        if not self.name:
+            raise ValueError("Collection name not set")
+        return self.parent.native_db[self.name]
 
-    def
+    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         if not isinstance(objs, list):
             objs = [objs]
-
-        if not cd:
-            cd = self.induce_class_definition_from_objects(objs)
-        collection = self.parent.database[self.name]
-        deleted_count = 0
-        for obj in objs:
-            result = collection.delete_one(obj)
-            deleted_count += result.deleted_count
-        return deleted_count
+        self.mongo_collection.insert_many(objs)
 
-    def
-
-
-
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        mongo_filter = self._build_mongo_filter(query.where_clause)
+        if query.limit:
+            cursor = self.mongo_collection.find(mongo_filter).limit(query.limit)
+        else:
+            cursor = self.mongo_collection.find(mongo_filter)
+
+        rows = list(cursor)
+        count = self.mongo_collection.count_documents(mongo_filter)
+
+        return QueryResult(query=query, num_rows=count, rows=rows)
+
+    def _build_mongo_filter(self, where_clause: Dict[str, Any]) -> Dict[str, Any]:
+        mongo_filter = {}
+        if where_clause:
+            for field, value in where_clause.items():
+                mongo_filter[field] = value
+        return mongo_filter
 
-    def query_facets(
+    def query_facets(
+        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+    ) -> Dict[str, List[Tuple[Any, int]]]:
         results = {}
-
-        collection = self.parent.database[self.name]
+        cd = self.class_definition()
         if not facet_columns:
             facet_columns = list(self.class_definition().attributes.keys())
+
         for col in facet_columns:
-
-
-
-
-
-
+            logger.debug(f"Faceting on {col}")
+            if isinstance(col, tuple):
+                sd = SlotDefinition(name="PLACEHOLDER")
+            else:
+                sd = cd.attributes[col]
+
+            if sd.multivalued:
+                facet_pipeline = [
+                    {"$match": where} if where else {"$match": {}},
+                    {"$unwind": f"${col}"},
+                    {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
+                    {"$sort": {"count": -1}},
+                    {"$limit": facet_limit},
+                ]
+            else:
+                facet_pipeline = [
+                    {"$match": where} if where else {"$match": {}},
+                    {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
+                    {"$sort": {"count": -1}},
+                    {"$limit": facet_limit},
+                ]
+
+            facet_results = list(self.mongo_collection.aggregate(facet_pipeline))
+            results[col] = [(result["_id"], result["count"]) for result in facet_results]
+
         return results
+
+    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
+        if not isinstance(objs, list):
+            objs = [objs]
+        filter_conditions = []
+        for obj in objs:
+            filter_condition = {}
+            for key, value in obj.items():
+                filter_condition[key] = value
+            filter_conditions.append(filter_condition)
+        result = self.mongo_collection.delete_many({"$or": filter_conditions})
+        return result.deleted_count
+
+    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
+        logger.info(f"Deleting from {self._target_class_name} where: {where}")
+        if where is None:
+            where = {}
+        result = self.mongo_collection.delete_many(where)
+        deleted_rows_count = result.deleted_count
+        if deleted_rows_count == 0 and not missing_ok:
+            raise ValueError(f"No rows found for {where}")
+        return deleted_rows_count
```