linkml-store 0.3.0 (linkml_store-0.3.0-py3-none-any.whl)
This diff represents the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the package version as it appears in its public registry.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0

linkml_store/api/stores/duckdb/__init__.py

@@ -0,0 +1,16 @@
+"""
+Adapter for DuckDB embedded database.
+
+Handles have the form:
+
+- ``duckdb:///<path>`` for a file-based database
+- ``duckdb:///:memory:`` for an in-memory database
+"""
+
+from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
+from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
+
+__all__ = [
+    "DuckDBCollection",
+    "DuckDBDatabase",
+]
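
The handle forms documented above map directly onto the DuckDBDatabase constructor that appears later in this diff, which falls back to the in-memory handle when none is given. A minimal sketch, assuming the adapter is used directly rather than through the higher-level client in linkml_store/api/client.py:

    from linkml_store.api.stores.duckdb import DuckDBDatabase

    # file-backed database, handle form duckdb:///<path>
    db = DuckDBDatabase(handle="duckdb:///example.db")

    # in-memory database; __init__ defaults to duckdb:///:memory: when handle is None
    mem_db = DuckDBDatabase()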

linkml_store/api/stores/duckdb/duckdb_collection.py

@@ -0,0 +1,339 @@
+import logging
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import sqlalchemy as sqla
+from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
+from sqlalchemy import Column, Table, delete, insert, inspect, text
+from sqlalchemy.sql.ddl import CreateTable
+
+from linkml_store.api import Collection
+from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
+from linkml_store.api.queries import Query, QueryResult
+from linkml_store.api.stores.duckdb.mappings import TMAP
+from linkml_store.utils.sql_utils import facet_count_sql
+
+logger = logging.getLogger(__name__)
+
+
+class DuckDBCollection(Collection):
+    _table_created: bool = None
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+        logger.debug(f"Inserting {len(objs)}")
+        if not isinstance(objs, list):
+            objs = [objs]
+        if not objs:
+            return
+        cd = self.class_definition()
+        if not cd:
+            logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}; will induce")
+            cd = self.induce_class_definition_from_objects(objs)
+        self._create_table(cd)
+        table = self._sqla_table(cd)
+        logger.info(f"Inserting into: {self.alias} // T={table.name}")
+        engine = self.parent.engine
+        col_names = [c.name for c in table.columns]
+        bad_objs = [obj for obj in objs if not isinstance(obj, dict)]
+        if bad_objs:
+            logger.error(f"Bad objects: {bad_objs}")
+        objs = [{k: obj.get(k, None) for k in col_names} for obj in objs]
+        with engine.connect() as conn:
+            with conn.begin():
+                conn.execute(insert(table), objs)
+                conn.commit()
+        self._post_insert_hook(objs)
+
+    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
+        if not isinstance(objs, list):
+            objs = [objs]
+        cd = self.class_definition()
+        if not cd or not cd.attributes:
+            cd = self.induce_class_definition_from_objects(objs)
+        assert cd.attributes
+        table = self._sqla_table(cd)
+        engine = self.parent.engine
+        with engine.connect() as conn:
+            for obj in objs:
+                conditions = [table.c[k] == v for k, v in obj.items() if k in cd.attributes]
+                stmt = delete(table).where(*conditions)
+                stmt = stmt.compile(engine)
+                conn.execute(stmt)
+            conn.commit()
+        self._post_delete_hook()
+        return None
+
+    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
+        logger.info(f"Deleting from {self.target_class_name} where: {where}")
+        if where is None:
+            where = {}
+        cd = self.class_definition()
+        if not cd:
+            logger.info(f"No class definition found for {self.target_class_name}, assuming not prepopulated")
+            return 0
+        table = self._sqla_table(cd)
+        engine = self.parent.engine
+        inspector = inspect(engine)
+        table_exists = table.name in inspector.get_table_names()
+        if not table_exists:
+            logger.info(f"Table {table.name} does not exist, assuming no data")
+            return 0
+        with engine.connect() as conn:
+            conditions = [table.c[k] == v for k, v in where.items()]
+            stmt = delete(table).where(*conditions)
+            stmt = stmt.compile(engine)
+            result = conn.execute(stmt)
+            deleted_rows_count = result.rowcount
+            if deleted_rows_count == 0 and not missing_ok:
+                raise ValueError(f"No rows found for {where}")
+            conn.commit()
+        self._post_delete_hook()
+        return deleted_rows_count if deleted_rows_count > -1 else None
+
+    def query_facets(
+        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+    ) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
+        if facet_limit is None:
+            facet_limit = DEFAULT_FACET_LIMIT
+        results = {}
+        cd = self.class_definition()
+        with self.parent.engine.connect() as conn:
+            if not facet_columns:
+                if not cd:
+                    raise ValueError(f"No class definition found for {self.target_class_name}")
+                facet_columns = list(cd.attributes.keys())
+            for col in facet_columns:
+                logger.debug(f"Faceting on {col}")
+                if isinstance(col, tuple):
+                    sd = SlotDefinition(name="PLACEHOLDER")
+                else:
+                    sd = cd.attributes[col]
+                facet_query = self._create_query(where_clause=where)
+                facet_query_str = facet_count_sql(facet_query, col, multivalued=sd.multivalued)
+                logger.debug(f"Facet query: {facet_query_str}")
+                rows = list(conn.execute(text(facet_query_str)))
+                results[col] = [tuple(row) for row in rows]
+        return results
+
+    def _sqla_table(self, cd: ClassDefinition) -> Table:
+        schema_view = self.parent.schema_view
+        metadata_obj = sqla.MetaData()
+        cols = []
+        for att in schema_view.class_induced_slots(cd.name):
+            typ = TMAP.get(att.range, sqla.String)
+            if att.inlined or att.inlined_as_list:
+                typ = sqla.JSON
+            if att.multivalued:
+                typ = sqla.ARRAY(typ, dimensions=1)
+            if att.array:
+                typ = sqla.ARRAY(typ, dimensions=1)
+            col = Column(att.name, typ)
+            cols.append(col)
+        t = Table(self.alias, metadata_obj, *cols)
+        return t
+
+    def _check_if_initialized(self) -> bool:
+        # if self._initialized:
+        #     return True
+        query = Query(
+            from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
+        )
+        qr = self.parent.query(query)
+        if qr.num_rows > 0:
+            return True
+        return False
+
+    def group_by(
+        self,
+        group_by_fields: List[str],
+        inlined_field="objects",
+        agg_map: Optional[Dict[str, str]] = None,
+        where: Optional[Dict] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Group objects in the collection by specified fields using SQLAlchemy.
+
+        This implementation leverages DuckDB's SQL capabilities for more efficient grouping.
+
+        :param group_by_fields: List of fields to group by
+        :param inlined_field: Field name to store aggregated objects
+        :param agg_map: Dictionary mapping aggregation types to fields
+        :param where: Filter conditions
+        :param kwargs: Additional arguments
+        :return: Query result containing grouped data
+        """
+        if isinstance(group_by_fields, str):
+            group_by_fields = [group_by_fields]
+
+        cd = self.class_definition()
+        if not cd:
+            logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}")
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+        # Check if the table exists
+        if not self.parent._table_exists(self.alias):
+            logger.debug(f"Table {self.alias} doesn't exist, falling back to parent implementation")
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+        # Get table definition
+        table = self._sqla_table(cd)
+        engine = self.parent.engine
+
+        # Create a SQLAlchemy select statement for groups
+        from sqlalchemy import select
+
+        group_cols = [table.c[field] for field in group_by_fields if field in table.columns.keys()]
+
+        if not group_cols:
+            logger.warning(f"None of the group_by fields {group_by_fields} found in table columns")
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+        stmt = select(*group_cols).distinct()
+
+        # Add where conditions if specified
+        if where:
+            conditions = []
+            for k, v in where.items():
+                if k in table.columns.keys():
+                    # Handle different operator types (dict values for operators)
+                    if isinstance(v, dict):
+                        for op, val in v.items():
+                            if op == "$gt":
+                                conditions.append(table.c[k] > val)
+                            elif op == "$gte":
+                                conditions.append(table.c[k] >= val)
+                            elif op == "$lt":
+                                conditions.append(table.c[k] < val)
+                            elif op == "$lte":
+                                conditions.append(table.c[k] <= val)
+                            elif op == "$ne":
+                                conditions.append(table.c[k] != val)
+                            elif op == "$in":
+                                conditions.append(table.c[k].in_(val))
+                            else:
+                                # Default to equality for unknown operators
+                                logger.warning(f"Unknown operator {op}, using equality")
+                                conditions.append(table.c[k] == val)
+                    else:
+                        # Direct equality comparison
+                        conditions.append(table.c[k] == v)
+
+            if conditions:
+                for condition in conditions:
+                    stmt = stmt.where(condition)
+
+        results = []
+        try:
+            with engine.connect() as conn:
+                # Get all distinct groups
+                group_result = conn.execute(stmt)
+                group_rows = list(group_result)
+
+                # For each group, get all objects
+                for group_row in group_rows:
+                    # Build conditions for this group
+                    group_conditions = []
+                    group_dict = {}
+
+                    for i, field in enumerate(group_by_fields):
+                        if field in table.columns.keys():
+                            value = group_row[i]
+                            group_dict[field] = value
+                            if value is None:
+                                group_conditions.append(table.c[field].is_(None))
+                            else:
+                                group_conditions.append(table.c[field] == value)
+
+                    # Get all rows for this group
+                    row_stmt = select(*table.columns)
+                    for condition in group_conditions:
+                        row_stmt = row_stmt.where(condition)
+
+                    # Add original where conditions
+                    if where:
+                        for k, v in where.items():
+                            if k in table.columns.keys():
+                                # Handle different operator types for the row query as well
+                                if isinstance(v, dict):
+                                    for op, val in v.items():
+                                        if op == "$gt":
+                                            row_stmt = row_stmt.where(table.c[k] > val)
+                                        elif op == "$gte":
+                                            row_stmt = row_stmt.where(table.c[k] >= val)
+                                        elif op == "$lt":
+                                            row_stmt = row_stmt.where(table.c[k] < val)
+                                        elif op == "$lte":
+                                            row_stmt = row_stmt.where(table.c[k] <= val)
+                                        elif op == "$ne":
+                                            row_stmt = row_stmt.where(table.c[k] != val)
+                                        elif op == "$in":
+                                            row_stmt = row_stmt.where(table.c[k].in_(val))
+                                        else:
+                                            # Default to equality for unknown operators
+                                            row_stmt = row_stmt.where(table.c[k] == val)
+                                else:
+                                    # Direct equality comparison
+                                    row_stmt = row_stmt.where(table.c[k] == v)
+
+                    row_result = conn.execute(row_stmt)
+                    rows = list(row_result)
+
+                    # Convert rows to dictionaries
+                    objects = []
+                    for row in rows:
+                        obj = {}
+                        for i, col in enumerate(row._fields):
+                            obj[col] = row[i]
+                        objects.append(obj)
+
+                    # Apply agg_map to filter fields if specified
+                    if agg_map and "list" in agg_map:
+                        list_fields = agg_map["list"]
+                        if list_fields:
+                            objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]
+
+                    # Create the result object
+                    result_obj = group_dict.copy()
+                    result_obj[inlined_field] = objects
+                    results.append(result_obj)
+
+            return QueryResult(num_rows=len(results), rows=results)
+        except Exception as e:
+            logger.warning(f"Error in DuckDB group_by: {e}")
+            # Fall back to parent implementation
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+    def _create_table(self, cd: ClassDefinition):
+        if self._table_created or self.metadata.is_prepopulated:
+            logger.info(f"Already have table for: {cd.name}")
+            return
+        if self.parent._table_exists(self.alias):
+            logger.info(f"Table already exists for {cd.name}")
+            self._table_created = True
+            self._initialized = True
+            self.metadata.is_prepopulated = True
+            return
+        # query = Query(
+        #     from_table="information_schema.tables",
+        #     where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
+        # )
+        # qr = self.parent.query(query)
+        # if qr.num_rows > 0:
+        #     logger.info(f"Table already exists for {cd.name}")
+        #     self._table_created = True
+        #     self._initialized = True
+        #     self.metadata.is_prepopulated = True
+        #     return
+        logger.info(f"Creating table for {cd.name}")
+        t = self._sqla_table(cd)
+        ct = CreateTable(t)
+        ddl = str(ct.compile(self.parent.engine))
+        with self.parent.engine.connect() as conn:
+            conn.execute(text(ddl))
+            conn.commit()
+        self._table_created = True
+        self._initialized = True
+        self.metadata.is_prepopulated = True
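
A rough usage sketch for the collection methods above. How a DuckDBCollection is obtained from its parent database is defined in linkml_store/api/database.py and not shown in this section, so treat that step as assumed; the field names and values are illustrative only:

    # collection: a DuckDBCollection bound to a DuckDBDatabase (creation not shown here)
    collection.insert([
        {"name": "a", "age": 30, "dept": "x"},
        {"name": "b", "age": 45, "dept": "x"},
        {"name": "c", "age": 25, "dept": "y"},
    ])

    # group_by translates Mongo-style operator dicts ($gt, $gte, $lt, $lte, $ne, $in)
    # into SQLAlchemy comparisons; unknown operators fall back to equality
    qr = collection.group_by(["dept"], where={"age": {"$gte": 30}})
    for row in qr.rows:
        print(row["dept"], row["objects"])

    # per-column facet counts, built via facet_count_sql()
    facets = collection.query_facets(facet_columns=["dept"])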

linkml_store/api/stores/duckdb/duckdb_database.py

@@ -0,0 +1,283 @@
+import json
+import logging
+from pathlib import Path
+from typing import List, Optional, Union
+
+import pandas as pd
+import sqlalchemy
+from linkml_runtime import SchemaView
+from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
+from linkml_runtime.utils.schema_builder import SchemaBuilder
+from sqlalchemy import NullPool, text
+
+from linkml_store.api import Database
+from linkml_store.api.queries import Query, QueryResult
+from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
+from linkml_store.utils.format_utils import Format
+from linkml_store.utils.sql_utils import introspect_schema, query_to_sql
+
+TYPE_MAP = {
+    "VARCHAR": "string",
+    "BIGINT": "integer",
+    "BOOLEAN": "boolean",
+    "DATE": "date",
+    "DOUBLE": "float",
+    "INTEGER": "integer",
+    "JSON": "Any",
+}
+
+MEMORY_HANDLE = "duckdb:///:memory:"
+
+
+logger = logging.getLogger(__name__)
+
+
+class DuckDBDatabase(Database):
+    """
+    An adapter for DuckDB databases.
+
+    Note that this adapter does not make use of a LinkML relational model transformation and
+    SQL Alchemy ORM layer. Instead, it attempts to map each collection (which is of type
+    some LinkML class) to a *single* DuckDB table. New tables are not created for nested references,
+    and linking tables are not created for many-to-many relationships.
+
+    Instead the native DuckDB ARRAY type is used to store multivalued attributes, and DuckDB JSON
+    types are used for nested inlined objects.
+    """
+
+    # _connection: DuckDBPyConnection = None
+    _engine: sqlalchemy.Engine = None
+    collection_class = DuckDBCollection
+
+    def __init__(self, handle: Optional[str] = None, recreate_if_exists: bool = False, **kwargs):
+        if handle is None:
+            handle = MEMORY_HANDLE
+        if recreate_if_exists:
+            path = Path(handle.replace("duckdb:///", ""))
+            if path.exists():
+                path.unlink()
+        super().__init__(handle=handle, **kwargs)
+
+    @property
+    def engine(self) -> sqlalchemy.Engine:
+        if not self._engine:
+            handle = self.handle
+            if not handle.startswith("duckdb://") and not handle.startswith(":") and "://" not in handle:
+                handle = f"duckdb:///{handle}"
+            if ":memory:" not in handle:
+                # TODO: investigate this; duckdb appears to be prematurely caching
+                self._engine = sqlalchemy.create_engine(handle, poolclass=NullPool)
+            else:
+                self._engine = sqlalchemy.create_engine(handle)
+        return self._engine
+
+    @property
+    def _is_sqlite(self) -> bool:
+        return self.handle and self.handle.startswith("sqlite:")
+
+    def commit(self, **kwargs):
+        with self.engine.connect() as conn:
+            conn.commit()
+
+    def close(self, **kwargs):
+        self.engine.dispose()
+
+    def drop(self, missing_ok=True, **kwargs):
+        self.close()
+        if self.handle == MEMORY_HANDLE:
+            return
+        path = Path(self.handle.replace("duckdb:///", ""))
+        if path.exists():
+            path.unlink()
+        else:
+            if not missing_ok:
+                raise FileNotFoundError(f"Database file not found: {path}")
+
+    def _table_exists(self, table: str) -> bool:
+        if self._is_sqlite:
+            if table == "sqlite_master":
+                return True
+            meta_query = Query(
+                from_table="sqlite_master",
+                where_clause={
+                    # "type": "table",
+                    "name": table,
+                },
+            )
+        else:
+            if table.startswith("information_schema"):
+                return True
+            meta_query = Query(
+                from_table="information_schema.tables",
+                where_clause={
+                    "table_type": "BASE TABLE",
+                    "table_name": table,
+                },
+            )
+
+        qr = self.query(meta_query)
+        if qr.num_rows == 0:
+            logger.debug(f"Table {self.alias} not created yet")
+            return False
+        return True
+
+    def _json_encoded_cols(self, table_name: str) -> Optional[List[str]]:
+        json_encoded_cols = []
+        if table_name:
+            if table_name.startswith("information_schema") or table_name.startswith("sqlite"):
+                return []
+        sv = self.schema_view
+        if sv:
+            cd = None
+            for c in self._collections.values():
+                if c.alias == table_name or c.target_class_name == table_name:
+                    cd = c.class_definition()
+                    break
+            if cd:
+                for att in sv.class_induced_slots(cd.name):
+                    if att.inlined or att.inlined_as_list:
+                        json_encoded_cols.append(att.name)
+        return json_encoded_cols
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        if not self._table_exists(query.from_table):
+            return QueryResult(query=query, num_rows=0, rows=[])
+        json_encoded_cols = self._json_encoded_cols(query.from_table)
+
+        with self.engine.connect() as conn:
+            count_query_str = text(query_to_sql(query, count=True))
+            logger.debug(f"count_query_str: {count_query_str}")
+            num_rows = list(conn.execute(count_query_str))[0][0]
+            logger.debug(f"num_rows: {num_rows}")
+            query_str = query_to_sql(query, **kwargs)  # include offset, limit
+            logger.debug(f"query_str: {query_str}")
+            rows = list(conn.execute(text(query_str)).mappings())
+            qr = QueryResult(query=query, num_rows=num_rows, rows=rows)
+            if json_encoded_cols:
+                for row in qr.rows:
+                    for col in json_encoded_cols:
+                        if row[col]:
+                            if isinstance(row[col], list):
+                                for i in range(len(row[col])):
+                                    try:
+                                        parsed_val = json.loads(row[col][i])
+                                    except json.JSONDecodeError as e:
+                                        logger.error(f"Failed to parse col {col}[{i}] == {row[col][i]}")
+                                        raise e
+                                    row[col][i] = parsed_val
+                            elif isinstance(row[col], dict):
+                                pass
+                            else:
+                                row[col] = json.loads(row[col])
+            qr.set_rows(pd.DataFrame(rows))
+            facet_columns = query.facet_slots
+            if query.include_facet_counts and not facet_columns:
+                raise ValueError("Facet counts requested but no facet columns specified")
+            if facet_columns:
+                raise NotImplementedError
+            return qr
+
+    @property
+    def supports_sql(self) -> bool:
+        """Return True - DuckDB supports raw SQL queries."""
+        return True
+
+    def execute_sql(self, sql: str, **kwargs) -> QueryResult:
+        """
+        Execute a raw SQL query against the DuckDB database.
+
+        :param sql: SQL query string
+        :param kwargs: Additional arguments
+        :return: QueryResult containing the results
+        """
+        with self.engine.connect() as conn:
+            result = conn.execute(text(sql))
+            rows = [dict(row._mapping) for row in result]
+            return QueryResult(num_rows=len(rows), rows=rows)
+
+    def init_collections(self):
+        # TODO: unify schema introspection
+        if not self.schema_view:
+            schema = introspect_schema(self.engine)
+        else:
+            schema = self.schema_view.schema
+        table_names = schema.classes.keys()
+        if self._collections is None:
+            self._collections = {}
+        for table_name in table_names:
+            if table_name not in self._collections:
+                collection = DuckDBCollection(name=table_name, parent=self)
+                self._collections[table_name] = collection
+
+    def induce_schema_view(self) -> SchemaView:
+        # TODO: unify schema introspection
+        # TODO: handle case where schema is provided in advance
+        logger.info(f"Inducing schema view for {self.metadata.handle} // {self}")
+        sb = SchemaBuilder()
+        schema = sb.schema
+        logger.info(f"Checking if {self.metadata.handle} is sqlite: {self._is_sqlite}")
+        if self._is_sqlite:
+            return SchemaView(schema)
+        query = Query(from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE"})
+        qr = self.query(query)
+        logger.info(f"Found {qr.num_rows} information_schema.tables // {qr.rows}")
+        if qr.num_rows:
+            table_names = [row["table_name"] for row in qr.rows]
+            for tbl in table_names:
+                sb.add_class(tbl)
+            query = Query(from_table="information_schema.columns", sort_by=["ordinal_position"])
+            for row in self.query(query, limit=-1).rows:
+                tbl_name = row["table_name"]
+                if tbl_name not in sb.schema.classes:
+                    continue
+                dt = row["data_type"]
+                if dt.endswith("[]"):
+                    dt = dt[0:-2]
+                    multivalued = True
+                else:
+                    multivalued = False
+                rng = TYPE_MAP.get(dt, "string")
+                sd = SlotDefinition(
+                    row["column_name"], required=row["is_nullable"] == "NO", multivalued=multivalued, range=rng
+                )
+                if dt == "JSON":
+                    sd.inlined_as_list = True
+                sb.schema.classes[tbl_name].attributes[sd.name] = sd
+                logger.info(f"Introspected slot: {tbl_name}.{sd.name}: {sd.range} FROM {dt}")
+        sb.add_defaults()
+        for cls_name in schema.classes:
+            if cls_name in self.metadata.collections:
+                collection_metadata = self.metadata.collections[cls_name]
+                if collection_metadata.attributes:
+                    del schema.classes[cls_name]
+                    cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
+                    schema.classes[cls.name] = cls
+        return SchemaView(schema)
+
+    def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
+        if target_format == "duckdb" or target_format == Format.SQLDUMP_DUCKDB:
+            path = Path(location)
+            if path.exists():
+                if path.is_file():
+                    path.unlink()
+            with self.engine.connect() as conn:
+                sql = text(f"EXPORT DATABASE '{location}'")
+                conn.execute(sql)
+        else:
+            super().export_database(location, target_format=target_format, **kwargs)
+
+    def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
+        """
+        Import a database from a file or location.
+
+        :param location: location of the file
+        :param source_format: source format
+        :param kwargs: additional arguments
+        """
+        if source_format == Format.SQLDUMP_DUCKDB.value or source_format == Format.SQLDUMP_DUCKDB:
+            with self.engine.connect() as conn:
+                sql = text(f"IMPORT DATABASE '{location}'")
+                conn.execute(sql)
+                conn.commit()
+        else:
+            super().import_database(location, source_format=source_format, **kwargs)
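
A short sketch of the raw-SQL and export paths defined above; the file paths and the query are illustrative only:

    from linkml_store.api.stores.duckdb import DuckDBDatabase

    db = DuckDBDatabase("duckdb:///example.db")

    # supports_sql is True for this adapter, so raw SQL can be passed straight through
    qr = db.execute_sql("SELECT 1 AS answer")
    print(qr.num_rows, qr.rows)  # 1 [{'answer': 1}]

    # target_format "duckdb" uses DuckDB's native EXPORT DATABASE statement;
    # any other format is delegated to the base Database implementation
    db.export_database("/tmp/linkml_dump", target_format="duckdb")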

linkml_store/api/stores/filesystem/__init__.py

@@ -0,0 +1,15 @@
+"""
+Adapter for FileSystem wrapper
+
+Handles have the form:
+
+- ``file:<path>`` for a local file
+"""
+
+from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
+from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
+
+__all__ = [
+    "FileSystemCollection",
+    "FileSystemDatabase",
+]
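
By analogy with the DuckDB adapter earlier in this diff, a FileSystemDatabase would presumably be constructed from a file: handle; its constructor is not part of this section, so the sketch below is an assumption based on the shared Database base class rather than code shown here:

    from linkml_store.api.stores.filesystem import FileSystemDatabase

    # hypothetical: assumes the handle keyword of the Database base class is accepted unchanged
    db = FileSystemDatabase(handle="file:/tmp/my_data")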