linkml-store 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of linkml-store might be problematic; see the registry advisory for more details.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +151 -0
- linkml_store/api/collection.py +327 -0
- linkml_store/api/database.py +215 -0
- linkml_store/api/metadata.py +5 -0
- linkml_store/api/queries.py +56 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/duckdb/__init__.py +0 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +109 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +166 -0
- linkml_store/api/stores/duckdb/mappings.py +7 -0
- linkml_store/api/stores/mongodb/__init__.py +0 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +56 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +112 -0
- linkml_store/constants.py +7 -0
- linkml_store/index/__init__.py +0 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_index.py +44 -0
- linkml_store/index/implementations/simple_index.py +40 -0
- linkml_store/index/index.py +109 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/sql_utils.py +126 -0
- linkml_store-0.0.0.dist-info/LICENSE +22 -0
- linkml_store-0.0.0.dist-info/METADATA +44 -0
- linkml_store-0.0.0.dist-info/RECORD +29 -0
- linkml_store-0.0.0.dist-info/WHEEL +4 -0
- linkml_store-0.0.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
from abc import ABC
from dataclasses import dataclass
from typing import Any, Dict, Optional, Sequence

from linkml_runtime import SchemaView

from linkml_store.api.collection import Collection
from linkml_store.api.metadata import MetaData
from linkml_store.api.queries import Query, QueryResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class Database(ABC):
    """
    A Database provides access to named collections of data.

    Examples
    --------
    >>> from linkml_store.api.client import Client
    >>> client = Client()
    >>> db = client.attach_database("duckdb", alias="test")
    >>> db.handle
    'duckdb:///:memory:'
    >>> collection = db.create_collection("Person")
    >>> len(db.list_collections())
    1
    >>> db.get_collection("Person") == collection
    True
    >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
    >>> collection.add(objs)
    >>> qr = collection.find()
    >>> len(qr.rows)
    2
    >>> qr.rows[0]["id"]
    'P1'
    >>> qr.rows[1]["name"]
    'Alice'
    >>> qr = collection.find({"name": "John"})
    >>> len(qr.rows)
    1
    >>> qr.rows[0]["name"]
    'John'

    """

    # backend-specific connection string / URI (e.g. "duckdb:///:memory:")
    handle: Optional[str] = None
    # if True, the backend may drop and recreate the underlying store
    recreate_if_exists: Optional[bool] = False
    # lazily populated; see schema_view / set_schema_view / induce_schema_view
    _schema_view: Optional[SchemaView] = None
    # lazily populated alias -> Collection map; None means "never initialized"
    _collections: Optional[Dict[str, Collection]] = None

    def store(self, obj: Dict[str, Any], **kwargs):
        """
        Store an object in the database.

        Every key whose value is a non-empty list is treated as the name of a
        collection, and the list elements are added to that collection
        (creating it on demand). All other keys are ignored.

        :param obj: object to store
        :param kwargs: additional arguments
        """
        for k, v in obj.items():
            if not isinstance(v, list):
                continue
            if not v:
                continue
            collection = self.get_collection(k, create_if_not_exists=True)
            collection.add(v)

    def commit(self, **kwargs):
        """
        Commit any pending changes to the database
        """
        raise NotImplementedError()

    def close(self, **kwargs):
        """
        Close the database and all connection objects
        """
        raise NotImplementedError()

    def create_collection(
        self, name: str, alias: Optional[str] = None, metadata: Optional[MetaData] = None, **kwargs
    ) -> Collection:
        """
        Create a new collection

        >>> from linkml_store.api.client import Client
        >>> client = Client()
        >>> db = client.attach_database("duckdb", alias="test")
        >>> collection = db.create_collection("Person")
        >>> collection.name
        'Person'

        :param name: name of the collection
        :param alias: alias for the collection
        :param metadata: metadata for the collection
        :param kwargs: additional arguments
        """
        raise NotImplementedError()

    def list_collections(self) -> Sequence[Collection]:
        """
        List all collections.

        Examples
        --------
        >>> from linkml_store.api.client import Client
        >>> client = Client()
        >>> db = client.attach_database("duckdb", alias="test")
        >>> c1 = db.create_collection("Person")
        >>> c2 = db.create_collection("Product")
        >>> collections = db.list_collections()
        >>> len(collections)
        2
        >>> [c.name for c in collections]
        ['Person', 'Product']

        """
        # initialize only when never initialized: an empty dict is a valid,
        # already-initialized state and must not retrigger initialization
        if self._collections is None:
            self.init_collections()
        return list(self._collections.values())

    def get_collection(self, name: str, create_if_not_exists=True, **kwargs) -> "Collection":
        """
        Get a named collection.

        Examples
        --------
        >>> from linkml_store.api.client import Client
        >>> client = Client()
        >>> db = client.attach_database("duckdb", alias="test")
        >>> collection = db.create_collection("Person")
        >>> db.get_collection("Person") == collection
        True
        >>> db.get_collection("NonExistent", create_if_not_exists=False)
        Traceback (most recent call last):
        ...
        KeyError: 'Collection NonExistent does not exist'

        :param name: name of the collection
        :param create_if_not_exists: create the collection if it does not exist

        """
        # see list_collections: an empty map must not retrigger initialization
        if self._collections is None:
            self.init_collections()
        if name not in self._collections:
            if create_if_not_exists:
                self._collections[name] = self.create_collection(name)
            else:
                raise KeyError(f"Collection {name} does not exist")
        return self._collections[name]

    def init_collections(self):
        """
        Initialize collections.

        Not typically called directly: consider making hidden
        :return:
        """
        raise NotImplementedError

    def query(self, query: Query, **kwargs) -> QueryResult:
        """
        Run a query against the database.

        Examples
        --------
        >>> from linkml_store.api.client import Client
        >>> from linkml_store.api.queries import Query
        >>> client = Client()
        >>> db = client.attach_database("duckdb", alias="test")
        >>> collection = db.create_collection("Person")
        >>> collection.add([{"id": "P1", "name": "John"}, {"id": "P2", "name": "Alice"}])
        >>> query = Query(from_table="Person", where_clause={"name": "John"})
        >>> result = db.query(query)
        >>> len(result.rows)
        1
        >>> result.rows[0]["id"]
        'P1'

        :param query:
        :param kwargs:
        :return:

        """
        raise NotImplementedError

    @property
    def schema_view(self) -> SchemaView:
        """
        Return a schema view for the named collection
        """
        # induced lazily on first access unless one was set explicitly
        if not self._schema_view:
            self._schema_view = self.induce_schema_view()
        return self._schema_view

    def set_schema_view(self, schema_view: SchemaView):
        # an explicitly supplied schema view takes precedence over induction
        self._schema_view = schema_view

    def induce_schema_view(self) -> SchemaView:
        """
        Induce a schema view from a schema definition.

        >>> from linkml_store.api.client import Client
        >>> from linkml_store.api.queries import Query
        >>> client = Client()
        >>> db = client.attach_database("duckdb", alias="test")
        >>> collection = db.create_collection("Person")
        >>> collection.add([{"id": "P1", "name": "John", "age_in_years": 25},
        ... {"id": "P2", "name": "Alice", "age_in_years": 25}])
        >>> schema_view = db.induce_schema_view()
        >>> cd = schema_view.get_class("Person")
        >>> cd.attributes["id"].range
        'string'

        :return: A schema view
        """
        raise NotImplementedError()
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from collections import namedtuple
|
|
2
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
# Named tuple with two fields (min, max), used in Query facet results to
# represent an inclusive range of values.
Between = namedtuple("Between", "min max")

# A single facet value: a scalar or a Between range.
FACET_GROUP_ATOM = Union[str, int, float, Between]
# A facet group key: one atom, or a tuple of atoms (a compound facet).
FACET_GROUP = Union[FACET_GROUP_ATOM, Tuple[FACET_GROUP_ATOM, ...]]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Query(BaseModel):
    """
    A query object.

    - In SQL this would be a SQL query string
    """

    # table (or collection) the query selects from
    # (default added: implicit-Optional fields are deprecated in pydantic
    # and become *required* in pydantic v2; siblings already use `= None`)
    from_table: Optional[str] = None
    # columns to project; None means all columns
    select_cols: Optional[List[str]] = None
    # raw SQL string(s) or a column -> value equality map
    where_clause: Optional[Union[str, List[str], Dict[str, str]]] = None
    sort_by: Optional[List[str]] = None
    limit: Optional[int] = None
    offset: Optional[int] = None
    # when True, facet_slots must also be provided
    include_facet_counts: bool = False
    facet_slots: Optional[List[str]] = None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class QueryResult(BaseModel):
    """
    A query result.

    Holds the (possibly paged) rows matching a query, plus the total match
    count and optional facet counts.
    """

    # the query that produced this result, if available
    query: Optional[Query] = None
    # total number of matching rows (may exceed len(rows) when paged)
    num_rows: int
    offset: Optional[int] = 0
    # materialized result rows as plain dicts
    rows: Optional[List[Dict[str, Any]]] = None
    # (score, row) pairs, for ranked/search-style results
    ranked_rows: Optional[List[Tuple[float, Dict[str, Any]]]] = None
    # lazily-built DataFrame view of rows; see rows_dataframe / set_rows
    _rows_dataframe: Optional[pd.DataFrame] = None
    # facet column -> list of (facet value, count)
    facet_counts: Optional[Dict[str, List[Tuple[FACET_GROUP, int]]]] = None

    @property
    def rows_dataframe(self) -> pd.DataFrame:
        # built on first access from self.rows; stays None if rows is
        # empty/None and no DataFrame was set via set_rows
        if self._rows_dataframe is None and self.rows:
            self._rows_dataframe = pd.DataFrame(self.rows)
        return self._rows_dataframe

    def set_rows(self, rows: pd.DataFrame):
        # allow backends to supply a pre-built DataFrame directly
        self._rows_dataframe = rows

    class Config:
        # permit the non-pydantic pd.DataFrame field above
        arbitrary_types_allowed = True
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Any, Dict, List, Optional, Union
|
|
3
|
+
|
|
4
|
+
import sqlalchemy as sqla
|
|
5
|
+
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
|
|
6
|
+
from sqlalchemy import Column, Table, delete, insert, text
|
|
7
|
+
from sqlalchemy.sql.ddl import CreateTable
|
|
8
|
+
|
|
9
|
+
from linkml_store.api import Collection
|
|
10
|
+
from linkml_store.api.collection import OBJECT
|
|
11
|
+
from linkml_store.api.stores.duckdb.mappings import TMAP
|
|
12
|
+
from linkml_store.utils.sql_utils import facet_count_sql
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class DuckDBCollection(Collection):
    """
    A Collection backed by a single DuckDB table (one table per collection).
    """

    # set once the backing table has been created via _create_table
    _table_created: bool = False

    def add(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
        """
        Insert one or more objects into the backing table.

        The table is created on first use, inducing a class definition from
        the objects if no schema is available. Keys that are not table
        columns are dropped; missing keys are filled with None.

        :param objs: a single object or a list of objects
        """
        if not isinstance(objs, list):
            objs = [objs]
        if not objs:
            return
        cd = self.class_definition()
        if not cd:
            cd = self.induce_class_definition_from_objects(objs)
        self._create_table(cd)
        table = self._sqla_table(cd)
        engine = self.parent.engine
        col_names = [c.name for c in table.columns]
        # normalize each object to exactly the table's columns
        objs = [{k: obj.get(k, None) for k in col_names} for obj in objs]
        # engine.begin() opens a transaction and commits on exit, replacing
        # the original nested connect/begin/commit sequence
        with engine.begin() as conn:
            conn.execute(insert(table), objs)

    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
        """
        Delete objects matching each given object's known attributes.

        :param objs: a single object or a list of objects
        :return: number of objects requested for deletion (not rows removed)
        """
        if not isinstance(objs, list):
            objs = [objs]
        cd = self.class_definition()
        if not cd:
            cd = self.induce_class_definition_from_objects(objs)
        table = self._sqla_table(cd)
        engine = self.parent.engine
        with engine.connect() as conn:
            for obj in objs:
                # only schema-known keys participate in the match
                conditions = [table.c[k] == v for k, v in obj.items() if k in cd.attributes]
                conn.execute(delete(table).where(*conditions))
            conn.commit()
        return len(objs)

    def delete_where(self, where: Optional[Dict[str, Any]] = None, **kwargs) -> int:
        """
        Delete all rows matching the where clause.

        :param where: column -> value equality map; None/empty deletes all rows
        :return: number of rows deleted
        """
        cd = self.class_definition()
        table = self._sqla_table(cd)
        engine = self.parent.engine
        with engine.connect() as conn:
            # guard against where=None (the documented default); previously
            # this raised AttributeError on None.items()
            conditions = [table.c[k] == v for k, v in (where or {}).items()]
            result = conn.execute(delete(table).where(*conditions))
            conn.commit()
            # report the actual number of deleted rows (was hard-coded 0)
            return result.rowcount

    def query_facets(self, where: Dict = None, facet_columns: List[str] = None) -> Dict[str, Dict[str, int]]:
        """
        Compute facet (value -> count) aggregations over the given columns.

        :param where: optional filter applied before counting
        :param facet_columns: columns to facet on; defaults to all attributes
        :return: column -> list of (value, count) rows
        """
        results = {}
        cd = self.class_definition()
        with self.parent.engine.connect() as conn:
            if not facet_columns:
                facet_columns = list(cd.attributes.keys())
            for col in facet_columns:
                if isinstance(col, tuple):
                    # compound facet: no single slot definition applies
                    sd = SlotDefinition(name="PLACEHOLDER")
                else:
                    sd = cd.attributes[col]
                facet_query = self._create_query(where_clause=where)
                facet_query_str = facet_count_sql(facet_query, col, multivalued=sd.multivalued)
                rows = list(conn.execute(text(facet_query_str)))
                results[col] = rows
        return results

    def _sqla_table(self, cd: ClassDefinition) -> Table:
        """
        Build a SQLAlchemy Table object mirroring the class definition.

        Inlined attributes map to JSON columns; multivalued/array attributes
        are wrapped in one-dimensional ARRAY types.
        """
        metadata_obj = sqla.MetaData()
        cols = []
        for att in cd.attributes.values():
            typ = TMAP.get(att.range, sqla.String)
            if att.inlined:
                typ = sqla.JSON
            if att.multivalued:
                typ = sqla.ARRAY(typ, dimensions=1)
            if att.array:
                typ = sqla.ARRAY(typ, dimensions=1)
            cols.append(Column(att.name, typ))
        return Table(self.name, metadata_obj, *cols)

    def _create_table(self, cd: ClassDefinition):
        """
        Create the backing table (idempotent per collection instance).
        """
        if self._table_created:
            return
        ct = CreateTable(self._sqla_table(cd))
        ddl = str(ct.compile(self.parent.engine))
        with self.parent.engine.connect() as conn:
            conn.execute(text(ddl))
            conn.commit()
        self._table_created = True
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import sqlalchemy
|
|
8
|
+
from duckdb import DuckDBPyConnection
|
|
9
|
+
from linkml_runtime import SchemaView
|
|
10
|
+
from linkml_runtime.linkml_model import SlotDefinition
|
|
11
|
+
from linkml_runtime.utils.schema_builder import SchemaBuilder
|
|
12
|
+
from sqlalchemy import text
|
|
13
|
+
|
|
14
|
+
from linkml_store.api import Database
|
|
15
|
+
from linkml_store.api.queries import Query, QueryResult
|
|
16
|
+
from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
|
|
17
|
+
from linkml_store.utils.sql_utils import introspect_schema, query_to_sql
|
|
18
|
+
|
|
19
|
+
TYPE_MAP = {
|
|
20
|
+
"VARCHAR": "string",
|
|
21
|
+
"BIGINT": "integer",
|
|
22
|
+
"BOOLEAN": "boolean",
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def run_query(con: DuckDBPyConnection, query: Query, **kwargs):
    """
    Execute a query against a raw DuckDB connection and return the result.

    A COUNT form of the query is executed first to obtain the total number
    of matching rows; the full query's rows are attached as a DataFrame.

    >>> import duckdb
    >>> con = duckdb.connect("db/mgi.db")
    >>> query = Query(from_table="gaf_association", limit=5)
    >>> result = run_query(con, query)
    >>> print(result.num_rows)
    532233

    :param con: open DuckDB connection
    :param query: query to execute
    :return: QueryResult with num_rows and a DataFrame of rows
    """
    total = con.execute(query_to_sql(query, count=True)).fetchall()[0][0]
    logger.debug(f"num_rows: {total}")
    sql = query_to_sql(query, **kwargs)
    logger.debug(f"query_str: {sql}")
    frame = con.execute(sql).fetchdf()
    result = QueryResult(query=query, num_rows=total)
    result.set_rows(frame)
    return result
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class DuckDBDatabase(Database):
    """
    A Database implementation backed by DuckDB (via SQLAlchemy).
    """

    # raw DuckDB connection (not used directly in this class)
    _connection: DuckDBPyConnection = None
    # lazily-created SQLAlchemy engine; see the engine property
    _engine: sqlalchemy.Engine = None

    def __post_init__(self):
        # default to an in-memory database when no handle is supplied
        if not self.handle:
            self.handle = "duckdb:///:memory:"

    @property
    def engine(self) -> sqlalchemy.Engine:
        """Lazily create and cache the SQLAlchemy engine for self.handle."""
        if not self._engine:
            handle = self.handle
            # normalize bare paths to a duckdb:// URL; handles starting with
            # ":" (e.g. ":memory:") are passed through untouched
            if not handle.startswith("duckdb://") and not handle.startswith(":"):
                handle = f"duckdb://{handle}"
            self._engine = sqlalchemy.create_engine(handle)
        return self._engine

    def commit(self, **kwargs):
        """Commit any pending changes on a fresh connection."""
        with self.engine.connect() as conn:
            conn.commit()

    def close(self, **kwargs):
        """Dispose of the engine and its connection pool."""
        self.engine.dispose()

    def query(self, query: Query, **kwargs) -> QueryResult:
        """
        Run a query, decoding JSON-encoded (inlined) columns in the results.

        Executes a COUNT form of the query first for num_rows, then the full
        query. NOTE(review): assumes self._collections is populated when a
        schema view is set — confirm callers initialize collections first.
        """
        json_encoded_cols = []
        if query.from_table:
            sv = self._schema_view
            if sv:
                # find the collection matching the query's table to learn
                # which attributes are stored as JSON (inlined)
                cd = None
                for c in self._collections.values():
                    if c.name == query.from_table:
                        cd = c.class_definition()
                        break
                if cd:
                    for att in cd.attributes.values():
                        if att.inlined:
                            json_encoded_cols.append(att.name)
        with self.engine.connect() as conn:
            count_query_str = text(query_to_sql(query, count=True))
            num_rows = list(conn.execute(count_query_str))[0][0]
            logger.debug(f"num_rows: {num_rows}")
            query_str = query_to_sql(query, **kwargs)  # include offset, limit
            logger.debug(f"query_str: {query_str}")
            rows = list(conn.execute(text(query_str)).mappings())
            qr = QueryResult(query=query, num_rows=num_rows, rows=rows)
            if json_encoded_cols:
                # decode JSON strings back into Python objects, element-wise
                # for list-valued (multivalued) columns
                for row in qr.rows:
                    for col in json_encoded_cols:
                        if row[col]:
                            if isinstance(row[col], list):
                                for i in range(len(row[col])):
                                    row[col][i] = json.loads(row[col][i])
                            else:
                                row[col] = json.loads(row[col])
            qr.set_rows(pd.DataFrame(rows))
            facet_columns = query.facet_slots
            if query.include_facet_counts and not facet_columns:
                raise ValueError("Facet counts requested but no facet columns specified")
            if facet_columns:
                # facet counting is handled by DuckDBCollection.query_facets
                raise NotImplementedError
            return qr

    def init_collections(self):
        """Populate _collections by introspecting the database's tables."""
        # TODO: unify schema introspection
        schema = introspect_schema(self.engine)
        table_names = schema.classes.keys()
        if self._collections is None:
            self._collections = {}
        for table_name in table_names:
            # keep any collections already registered (e.g. via create_collection)
            if table_name not in self._collections:
                collection = DuckDBCollection(name=table_name, parent=self)
                self._collections[table_name] = collection

    def create_collection(self, name: str, alias: Optional[str] = None, **kwargs) -> DuckDBCollection:
        """
        Register a new DuckDB-backed collection under `alias` (default: name).

        The backing table itself is created lazily on first add().
        """
        collection = DuckDBCollection(name=name, parent=self)
        if not self._collections:
            self._collections = {}
        if not alias:
            alias = name
        self._collections[alias] = collection
        return collection

    def induce_schema_view(self) -> SchemaView:
        """
        Build a SchemaView by querying DuckDB's information_schema:
        one class per BASE TABLE, one attribute per column.
        """
        # TODO: unify schema introspection
        sb = SchemaBuilder()
        schema = sb.schema
        query = Query(from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE"})
        qr = self.query(query)
        if qr.num_rows:
            table_names = [row["table_name"] for row in qr.rows]
            for tbl in table_names:
                sb.add_class(tbl)
        query = Query(from_table="information_schema.columns", sort_by=["ordinal_position"])
        for row in self.query(query, limit=-1).rows:
            tbl_name = row["table_name"]
            # skip columns of tables that are not BASE TABLEs (e.g. views)
            if tbl_name not in sb.schema.classes:
                continue
            dt = row["data_type"]
            # a trailing "[]" marks a DuckDB array type -> multivalued slot
            if dt.endswith("[]"):
                dt = dt[0:-2]
                multivalued = True
            else:
                multivalued = False
            rng = TYPE_MAP.get(dt, "string")
            sd = SlotDefinition(
                row["column_name"], required=row["is_nullable"] == "NO", multivalued=multivalued, range=rng
            )
            sb.schema.classes[tbl_name].attributes[sd.name] = sd
        sb.add_defaults()
        return SchemaView(schema)
|
|
File without changes
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Any, Dict, List, Optional, Union
|
|
3
|
+
|
|
4
|
+
from linkml_store.api import Collection
|
|
5
|
+
from linkml_store.api.collection import OBJECT
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class MongoDBCollection(Collection):
    """
    A wrapper around a MongoDB collection
    """

    def add(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
        """
        Insert one or more objects into the MongoDB collection.

        NOTE(review): pymongo's insert_many mutates the passed documents by
        adding an ``_id`` field — confirm callers tolerate this.

        :param objs: a single object or a list of objects
        """
        if not isinstance(objs, list):
            objs = [objs]
        if not objs:
            return
        cd = self.class_definition()
        if not cd:
            cd = self.induce_class_definition_from_objects(objs)
        collection = self.parent.database[self.name]
        collection.insert_many(objs)

    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
        """
        Delete one matching document per given object.

        :param objs: a single object or a list of objects (used as filters)
        :return: number of documents actually deleted
        """
        if not isinstance(objs, list):
            objs = [objs]
        cd = self.class_definition()
        if not cd:
            cd = self.induce_class_definition_from_objects(objs)
        collection = self.parent.database[self.name]
        deleted_count = 0
        for obj in objs:
            result = collection.delete_one(obj)
            deleted_count += result.deleted_count
        return deleted_count

    def delete_where(self, where: Optional[Dict[str, Any]] = None, **kwargs) -> int:
        """
        Delete all documents matching the filter.

        :param where: MongoDB filter document; None/empty matches everything
        :return: number of documents deleted
        """
        collection = self.parent.database[self.name]
        # pymongo requires a filter document; passing None raises TypeError,
        # so normalize the documented default to match-all
        result = collection.delete_many(where or {})
        return result.deleted_count

    def query_facets(self, where: Dict = None, facet_columns: List[str] = None) -> Dict[str, Dict[str, int]]:
        """
        Compute facet (value -> count) aggregations via a $group pipeline.

        :param where: optional $match filter applied before grouping
        :param facet_columns: columns to facet on; defaults to all attributes
        :return: column -> list of (value, count) pairs
        """
        results = {}
        cd = self.class_definition()
        collection = self.parent.database[self.name]
        if not facet_columns:
            facet_columns = list(cd.attributes.keys())
        for col in facet_columns:
            facet_pipeline = [
                {"$match": where} if where else {"$match": {}},
                {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
            ]
            facet_results = list(collection.aggregate(facet_pipeline))
            results[col] = [(row["_id"], row["count"]) for row in facet_results]
        return results
|