linkml_store-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. linkml_store/__init__.py +7 -0
  2. linkml_store/api/__init__.py +8 -0
  3. linkml_store/api/client.py +414 -0
  4. linkml_store/api/collection.py +1280 -0
  5. linkml_store/api/config.py +187 -0
  6. linkml_store/api/database.py +862 -0
  7. linkml_store/api/queries.py +69 -0
  8. linkml_store/api/stores/__init__.py +0 -0
  9. linkml_store/api/stores/chromadb/__init__.py +7 -0
  10. linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
  11. linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
  12. linkml_store/api/stores/dremio/__init__.py +10 -0
  13. linkml_store/api/stores/dremio/dremio_collection.py +555 -0
  14. linkml_store/api/stores/dremio/dremio_database.py +1052 -0
  15. linkml_store/api/stores/dremio/mappings.py +105 -0
  16. linkml_store/api/stores/dremio_rest/__init__.py +11 -0
  17. linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
  18. linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
  19. linkml_store/api/stores/duckdb/__init__.py +16 -0
  20. linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
  21. linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
  22. linkml_store/api/stores/duckdb/mappings.py +8 -0
  23. linkml_store/api/stores/filesystem/__init__.py +15 -0
  24. linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
  25. linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
  26. linkml_store/api/stores/hdf5/__init__.py +7 -0
  27. linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
  28. linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
  29. linkml_store/api/stores/ibis/__init__.py +5 -0
  30. linkml_store/api/stores/ibis/ibis_collection.py +488 -0
  31. linkml_store/api/stores/ibis/ibis_database.py +328 -0
  32. linkml_store/api/stores/mongodb/__init__.py +25 -0
  33. linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
  34. linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
  35. linkml_store/api/stores/neo4j/__init__.py +0 -0
  36. linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
  37. linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
  38. linkml_store/api/stores/solr/__init__.py +3 -0
  39. linkml_store/api/stores/solr/solr_collection.py +224 -0
  40. linkml_store/api/stores/solr/solr_database.py +83 -0
  41. linkml_store/api/stores/solr/solr_utils.py +0 -0
  42. linkml_store/api/types.py +4 -0
  43. linkml_store/cli.py +1147 -0
  44. linkml_store/constants.py +7 -0
  45. linkml_store/graphs/__init__.py +0 -0
  46. linkml_store/graphs/graph_map.py +24 -0
  47. linkml_store/index/__init__.py +53 -0
  48. linkml_store/index/implementations/__init__.py +0 -0
  49. linkml_store/index/implementations/llm_indexer.py +174 -0
  50. linkml_store/index/implementations/simple_indexer.py +43 -0
  51. linkml_store/index/indexer.py +211 -0
  52. linkml_store/inference/__init__.py +13 -0
  53. linkml_store/inference/evaluation.py +195 -0
  54. linkml_store/inference/implementations/__init__.py +0 -0
  55. linkml_store/inference/implementations/llm_inference_engine.py +154 -0
  56. linkml_store/inference/implementations/rag_inference_engine.py +276 -0
  57. linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
  58. linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
  59. linkml_store/inference/inference_config.py +66 -0
  60. linkml_store/inference/inference_engine.py +209 -0
  61. linkml_store/inference/inference_engine_registry.py +74 -0
  62. linkml_store/plotting/__init__.py +5 -0
  63. linkml_store/plotting/cli.py +826 -0
  64. linkml_store/plotting/dimensionality_reduction.py +453 -0
  65. linkml_store/plotting/embedding_plot.py +489 -0
  66. linkml_store/plotting/facet_chart.py +73 -0
  67. linkml_store/plotting/heatmap.py +383 -0
  68. linkml_store/utils/__init__.py +0 -0
  69. linkml_store/utils/change_utils.py +17 -0
  70. linkml_store/utils/dat_parser.py +95 -0
  71. linkml_store/utils/embedding_matcher.py +424 -0
  72. linkml_store/utils/embedding_utils.py +299 -0
  73. linkml_store/utils/enrichment_analyzer.py +217 -0
  74. linkml_store/utils/file_utils.py +37 -0
  75. linkml_store/utils/format_utils.py +550 -0
  76. linkml_store/utils/io.py +38 -0
  77. linkml_store/utils/llm_utils.py +122 -0
  78. linkml_store/utils/mongodb_utils.py +145 -0
  79. linkml_store/utils/neo4j_utils.py +42 -0
  80. linkml_store/utils/object_utils.py +190 -0
  81. linkml_store/utils/pandas_utils.py +93 -0
  82. linkml_store/utils/patch_utils.py +126 -0
  83. linkml_store/utils/query_utils.py +89 -0
  84. linkml_store/utils/schema_utils.py +23 -0
  85. linkml_store/utils/sklearn_utils.py +193 -0
  86. linkml_store/utils/sql_utils.py +177 -0
  87. linkml_store/utils/stats_utils.py +53 -0
  88. linkml_store/utils/vector_utils.py +158 -0
  89. linkml_store/webapi/__init__.py +0 -0
  90. linkml_store/webapi/html/__init__.py +3 -0
  91. linkml_store/webapi/html/base.html.j2 +24 -0
  92. linkml_store/webapi/html/collection_details.html.j2 +15 -0
  93. linkml_store/webapi/html/database_details.html.j2 +16 -0
  94. linkml_store/webapi/html/databases.html.j2 +14 -0
  95. linkml_store/webapi/html/generic.html.j2 +43 -0
  96. linkml_store/webapi/main.py +855 -0
  97. linkml_store-0.3.0.dist-info/METADATA +226 -0
  98. linkml_store-0.3.0.dist-info/RECORD +101 -0
  99. linkml_store-0.3.0.dist-info/WHEEL +4 -0
  100. linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
  101. linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/api/stores/duckdb/__init__.py
@@ -0,0 +1,16 @@
+ """
+ Adapter for DuckDB embedded database.
+
+ Handles have the form:
+
+ - ``duckdb:///<path>`` for a file-based database
+ - ``duckdb:///:memory:`` for an in-memory database
+ """
+
+ from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
+ from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
+
+ __all__ = [
+     "DuckDBCollection",
+     "DuckDBDatabase",
+ ]
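The handle forms documented in this module map directly onto the client API. The following is a minimal usage sketch; the `Client.attach_database` call, aliases, and file path shown here are illustrative of typical linkml-store usage and are not part of this diff:

from linkml_store import Client

client = Client()
# in-memory DuckDB database
db = client.attach_database("duckdb:///:memory:", alias="scratch")
# file-backed DuckDB database at a hypothetical path
# db = client.attach_database("duckdb:///data/my_store.db", alias="persistent")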
linkml_store/api/stores/duckdb/duckdb_collection.py
@@ -0,0 +1,339 @@
+ import logging
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ import sqlalchemy as sqla
+ from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
+ from sqlalchemy import Column, Table, delete, insert, inspect, text
+ from sqlalchemy.sql.ddl import CreateTable
+
+ from linkml_store.api import Collection
+ from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
+ from linkml_store.api.queries import Query, QueryResult
+ from linkml_store.api.stores.duckdb.mappings import TMAP
+ from linkml_store.utils.sql_utils import facet_count_sql
+
+ logger = logging.getLogger(__name__)
+
+
+ class DuckDBCollection(Collection):
+     _table_created: bool = None
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+         logger.debug(f"Inserting {len(objs)}")
+         if not isinstance(objs, list):
+             objs = [objs]
+         if not objs:
+             return
+         cd = self.class_definition()
+         if not cd:
+             logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}; will induce")
+             cd = self.induce_class_definition_from_objects(objs)
+         self._create_table(cd)
+         table = self._sqla_table(cd)
+         logger.info(f"Inserting into: {self.alias} // T={table.name}")
+         engine = self.parent.engine
+         col_names = [c.name for c in table.columns]
+         bad_objs = [obj for obj in objs if not isinstance(obj, dict)]
+         if bad_objs:
+             logger.error(f"Bad objects: {bad_objs}")
+         objs = [{k: obj.get(k, None) for k in col_names} for obj in objs]
+         with engine.connect() as conn:
+             with conn.begin():
+                 conn.execute(insert(table), objs)
+             conn.commit()
+         self._post_insert_hook(objs)
+
+     def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
+         if not isinstance(objs, list):
+             objs = [objs]
+         cd = self.class_definition()
+         if not cd or not cd.attributes:
+             cd = self.induce_class_definition_from_objects(objs)
+         assert cd.attributes
+         table = self._sqla_table(cd)
+         engine = self.parent.engine
+         with engine.connect() as conn:
+             for obj in objs:
+                 conditions = [table.c[k] == v for k, v in obj.items() if k in cd.attributes]
+                 stmt = delete(table).where(*conditions)
+                 stmt = stmt.compile(engine)
+                 conn.execute(stmt)
+             conn.commit()
+         self._post_delete_hook()
+         return None
+
+     def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
+         logger.info(f"Deleting from {self.target_class_name} where: {where}")
+         if where is None:
+             where = {}
+         cd = self.class_definition()
+         if not cd:
+             logger.info(f"No class definition found for {self.target_class_name}, assuming not prepopulated")
+             return 0
+         table = self._sqla_table(cd)
+         engine = self.parent.engine
+         inspector = inspect(engine)
+         table_exists = table.name in inspector.get_table_names()
+         if not table_exists:
+             logger.info(f"Table {table.name} does not exist, assuming no data")
+             return 0
+         with engine.connect() as conn:
+             conditions = [table.c[k] == v for k, v in where.items()]
+             stmt = delete(table).where(*conditions)
+             stmt = stmt.compile(engine)
+             result = conn.execute(stmt)
+             deleted_rows_count = result.rowcount
+             if deleted_rows_count == 0 and not missing_ok:
+                 raise ValueError(f"No rows found for {where}")
+             conn.commit()
+         self._post_delete_hook()
+         return deleted_rows_count if deleted_rows_count > -1 else None
+
+     def query_facets(
+         self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+     ) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
+         if facet_limit is None:
+             facet_limit = DEFAULT_FACET_LIMIT
+         results = {}
+         cd = self.class_definition()
+         with self.parent.engine.connect() as conn:
+             if not facet_columns:
+                 if not cd:
+                     raise ValueError(f"No class definition found for {self.target_class_name}")
+                 facet_columns = list(cd.attributes.keys())
+             for col in facet_columns:
+                 logger.debug(f"Faceting on {col}")
+                 if isinstance(col, tuple):
+                     sd = SlotDefinition(name="PLACEHOLDER")
+                 else:
+                     sd = cd.attributes[col]
+                 facet_query = self._create_query(where_clause=where)
+                 facet_query_str = facet_count_sql(facet_query, col, multivalued=sd.multivalued)
+                 logger.debug(f"Facet query: {facet_query_str}")
+                 rows = list(conn.execute(text(facet_query_str)))
+                 results[col] = [tuple(row) for row in rows]
+         return results
+
+     def _sqla_table(self, cd: ClassDefinition) -> Table:
+         schema_view = self.parent.schema_view
+         metadata_obj = sqla.MetaData()
+         cols = []
+         for att in schema_view.class_induced_slots(cd.name):
+             typ = TMAP.get(att.range, sqla.String)
+             if att.inlined or att.inlined_as_list:
+                 typ = sqla.JSON
+             if att.multivalued:
+                 typ = sqla.ARRAY(typ, dimensions=1)
+             if att.array:
+                 typ = sqla.ARRAY(typ, dimensions=1)
+             col = Column(att.name, typ)
+             cols.append(col)
+         t = Table(self.alias, metadata_obj, *cols)
+         return t
+
+     def _check_if_initialized(self) -> bool:
+         # if self._initialized:
+         #     return True
+         query = Query(
+             from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
+         )
+         qr = self.parent.query(query)
+         if qr.num_rows > 0:
+             return True
+         return False
+
+     def group_by(
+         self,
+         group_by_fields: List[str],
+         inlined_field="objects",
+         agg_map: Optional[Dict[str, str]] = None,
+         where: Optional[Dict] = None,
+         **kwargs,
+     ) -> QueryResult:
+         """
+         Group objects in the collection by specified fields using SQLAlchemy.
+
+         This implementation leverages DuckDB's SQL capabilities for more efficient grouping.
+
+         :param group_by_fields: List of fields to group by
+         :param inlined_field: Field name to store aggregated objects
+         :param agg_map: Dictionary mapping aggregation types to fields
+         :param where: Filter conditions
+         :param kwargs: Additional arguments
+         :return: Query result containing grouped data
+         """
+         if isinstance(group_by_fields, str):
+             group_by_fields = [group_by_fields]
+
+         cd = self.class_definition()
+         if not cd:
+             logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}")
+             return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+         # Check if the table exists
+         if not self.parent._table_exists(self.alias):
+             logger.debug(f"Table {self.alias} doesn't exist, falling back to parent implementation")
+             return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+         # Get table definition
+         table = self._sqla_table(cd)
+         engine = self.parent.engine
+
+         # Create a SQLAlchemy select statement for groups
+         from sqlalchemy import select
+
+         group_cols = [table.c[field] for field in group_by_fields if field in table.columns.keys()]
+
+         if not group_cols:
+             logger.warning(f"None of the group_by fields {group_by_fields} found in table columns")
+             return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+         stmt = select(*group_cols).distinct()
+
+         # Add where conditions if specified
+         if where:
+             conditions = []
+             for k, v in where.items():
+                 if k in table.columns.keys():
+                     # Handle different operator types (dict values for operators)
+                     if isinstance(v, dict):
+                         for op, val in v.items():
+                             if op == "$gt":
+                                 conditions.append(table.c[k] > val)
+                             elif op == "$gte":
+                                 conditions.append(table.c[k] >= val)
+                             elif op == "$lt":
+                                 conditions.append(table.c[k] < val)
+                             elif op == "$lte":
+                                 conditions.append(table.c[k] <= val)
+                             elif op == "$ne":
+                                 conditions.append(table.c[k] != val)
+                             elif op == "$in":
+                                 conditions.append(table.c[k].in_(val))
+                             else:
+                                 # Default to equality for unknown operators
+                                 logger.warning(f"Unknown operator {op}, using equality")
+                                 conditions.append(table.c[k] == val)
+                     else:
+                         # Direct equality comparison
+                         conditions.append(table.c[k] == v)
+
+             if conditions:
+                 for condition in conditions:
+                     stmt = stmt.where(condition)
+
+         results = []
+         try:
+             with engine.connect() as conn:
+                 # Get all distinct groups
+                 group_result = conn.execute(stmt)
+                 group_rows = list(group_result)
+
+                 # For each group, get all objects
+                 for group_row in group_rows:
+                     # Build conditions for this group
+                     group_conditions = []
+                     group_dict = {}
+
+                     for i, field in enumerate(group_by_fields):
+                         if field in table.columns.keys():
+                             value = group_row[i]
+                             group_dict[field] = value
+                             if value is None:
+                                 group_conditions.append(table.c[field].is_(None))
+                             else:
+                                 group_conditions.append(table.c[field] == value)
+
+                     # Get all rows for this group
+                     row_stmt = select(*table.columns)
+                     for condition in group_conditions:
+                         row_stmt = row_stmt.where(condition)
+
+                     # Add original where conditions
+                     if where:
+                         for k, v in where.items():
+                             if k in table.columns.keys():
+                                 # Handle different operator types for the row query as well
+                                 if isinstance(v, dict):
+                                     for op, val in v.items():
+                                         if op == "$gt":
+                                             row_stmt = row_stmt.where(table.c[k] > val)
+                                         elif op == "$gte":
+                                             row_stmt = row_stmt.where(table.c[k] >= val)
+                                         elif op == "$lt":
+                                             row_stmt = row_stmt.where(table.c[k] < val)
+                                         elif op == "$lte":
+                                             row_stmt = row_stmt.where(table.c[k] <= val)
+                                         elif op == "$ne":
+                                             row_stmt = row_stmt.where(table.c[k] != val)
+                                         elif op == "$in":
+                                             row_stmt = row_stmt.where(table.c[k].in_(val))
+                                         else:
+                                             # Default to equality for unknown operators
+                                             row_stmt = row_stmt.where(table.c[k] == val)
+                                 else:
+                                     # Direct equality comparison
+                                     row_stmt = row_stmt.where(table.c[k] == v)
+
+                     row_result = conn.execute(row_stmt)
+                     rows = list(row_result)
+
+                     # Convert rows to dictionaries
+                     objects = []
+                     for row in rows:
+                         obj = {}
+                         for i, col in enumerate(row._fields):
+                             obj[col] = row[i]
+                         objects.append(obj)
+
+                     # Apply agg_map to filter fields if specified
+                     if agg_map and "list" in agg_map:
+                         list_fields = agg_map["list"]
+                         if list_fields:
+                             objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]
+
+                     # Create the result object
+                     result_obj = group_dict.copy()
+                     result_obj[inlined_field] = objects
+                     results.append(result_obj)
+
+             return QueryResult(num_rows=len(results), rows=results)
+         except Exception as e:
+             logger.warning(f"Error in DuckDB group_by: {e}")
+             # Fall back to parent implementation
+             return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+     def _create_table(self, cd: ClassDefinition):
+         if self._table_created or self.metadata.is_prepopulated:
+             logger.info(f"Already have table for: {cd.name}")
+             return
+         if self.parent._table_exists(self.alias):
+             logger.info(f"Table already exists for {cd.name}")
+             self._table_created = True
+             self._initialized = True
+             self.metadata.is_prepopulated = True
+             return
+         # query = Query(
+         #     from_table="information_schema.tables",
+         #     where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
+         # )
+         # qr = self.parent.query(query)
+         # if qr.num_rows > 0:
+         #     logger.info(f"Table already exists for {cd.name}")
+         #     self._table_created = True
+         #     self._initialized = True
+         #     self.metadata.is_prepopulated = True
+         #     return
+         logger.info(f"Creating table for {cd.name}")
+         t = self._sqla_table(cd)
+         ct = CreateTable(t)
+         ddl = str(ct.compile(self.parent.engine))
+         with self.parent.engine.connect() as conn:
+             conn.execute(text(ddl))
+             conn.commit()
+         self._table_created = True
+         self._initialized = True
+         self.metadata.is_prepopulated = True
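The `group_by` implementation above translates Mongo-style operators ($gt, $gte, $lt, $lte, $ne, $in) in the `where` dictionary into SQLAlchemy column filters, and falls back to the generic base-class implementation whenever the table or class definition is missing. A hedged usage sketch, assuming a collection created on a DuckDB-backed database; the class, field names, and values are hypothetical:

persons = db.create_collection("Person", alias="persons")
persons.insert([
    {"name": "a", "species": "cat", "age": 4},
    {"name": "b", "species": "cat", "age": 2},
    {"name": "c", "species": "dog", "age": 7},
])
# group animals older than 3 by species; each result row carries the grouped
# field plus an "objects" list of the matching rows
result = persons.group_by(["species"], inlined_field="objects", where={"age": {"$gt": 3}})
for row in result.rows:
    print(row["species"], len(row["objects"]))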
linkml_store/api/stores/duckdb/duckdb_database.py
@@ -0,0 +1,283 @@
+ import json
+ import logging
+ from pathlib import Path
+ from typing import List, Optional, Union
+
+ import pandas as pd
+ import sqlalchemy
+ from linkml_runtime import SchemaView
+ from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
+ from linkml_runtime.utils.schema_builder import SchemaBuilder
+ from sqlalchemy import NullPool, text
+
+ from linkml_store.api import Database
+ from linkml_store.api.queries import Query, QueryResult
+ from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
+ from linkml_store.utils.format_utils import Format
+ from linkml_store.utils.sql_utils import introspect_schema, query_to_sql
+
+ TYPE_MAP = {
+     "VARCHAR": "string",
+     "BIGINT": "integer",
+     "BOOLEAN": "boolean",
+     "DATE": "date",
+     "DOUBLE": "float",
+     "INTEGER": "integer",
+     "JSON": "Any",
+ }
+
+ MEMORY_HANDLE = "duckdb:///:memory:"
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class DuckDBDatabase(Database):
+     """
+     An adapter for DuckDB databases.
+
+     Note that this adapter does not make use of a LinkML relational model transformation and
+     SQL Alchemy ORM layer. Instead, it attempts to map each collection (which is of type
+     some LinkML class) to a *single* DuckDB table. New tables are not created for nested references,
+     and linking tables are not created for many-to-many relationships.
+
+     Instead the native DuckDB ARRAY type is used to store multivalued attributes, and DuckDB JSON
+     types are used for nested inlined objects.
+     """
+
+     # _connection: DuckDBPyConnection = None
+     _engine: sqlalchemy.Engine = None
+     collection_class = DuckDBCollection
+
+     def __init__(self, handle: Optional[str] = None, recreate_if_exists: bool = False, **kwargs):
+         if handle is None:
+             handle = MEMORY_HANDLE
+         if recreate_if_exists:
+             path = Path(handle.replace("duckdb:///", ""))
+             if path.exists():
+                 path.unlink()
+         super().__init__(handle=handle, **kwargs)
+
+     @property
+     def engine(self) -> sqlalchemy.Engine:
+         if not self._engine:
+             handle = self.handle
+             if not handle.startswith("duckdb://") and not handle.startswith(":") and "://" not in handle:
+                 handle = f"duckdb:///{handle}"
+             if ":memory:" not in handle:
+                 # TODO: investigate this; duckdb appears to be prematurely caching
+                 self._engine = sqlalchemy.create_engine(handle, poolclass=NullPool)
+             else:
+                 self._engine = sqlalchemy.create_engine(handle)
+         return self._engine
+
+     @property
+     def _is_sqlite(self) -> bool:
+         return self.handle and self.handle.startswith("sqlite:")
+
+     def commit(self, **kwargs):
+         with self.engine.connect() as conn:
+             conn.commit()
+
+     def close(self, **kwargs):
+         self.engine.dispose()
+
+     def drop(self, missing_ok=True, **kwargs):
+         self.close()
+         if self.handle == MEMORY_HANDLE:
+             return
+         path = Path(self.handle.replace("duckdb:///", ""))
+         if path.exists():
+             path.unlink()
+         else:
+             if not missing_ok:
+                 raise FileNotFoundError(f"Database file not found: {path}")
+
+     def _table_exists(self, table: str) -> bool:
+         if self._is_sqlite:
+             if table == "sqlite_master":
+                 return True
+             meta_query = Query(
+                 from_table="sqlite_master",
+                 where_clause={
+                     # "type": "table",
+                     "name": table,
+                 },
+             )
+         else:
+             if table.startswith("information_schema"):
+                 return True
+             meta_query = Query(
+                 from_table="information_schema.tables",
+                 where_clause={
+                     "table_type": "BASE TABLE",
+                     "table_name": table,
+                 },
+             )
+
+         qr = self.query(meta_query)
+         if qr.num_rows == 0:
+             logger.debug(f"Table {self.alias} not created yet")
+             return False
+         return True
+
+     def _json_encoded_cols(self, table_name: str) -> Optional[List[str]]:
+         json_encoded_cols = []
+         if table_name:
+             if table_name.startswith("information_schema") or table_name.startswith("sqlite"):
+                 return []
+         sv = self.schema_view
+         if sv:
+             cd = None
+             for c in self._collections.values():
+                 if c.alias == table_name or c.target_class_name == table_name:
+                     cd = c.class_definition()
+                     break
+             if cd:
+                 for att in sv.class_induced_slots(cd.name):
+                     if att.inlined or att.inlined_as_list:
+                         json_encoded_cols.append(att.name)
+         return json_encoded_cols
+
+     def query(self, query: Query, **kwargs) -> QueryResult:
+         if not self._table_exists(query.from_table):
+             return QueryResult(query=query, num_rows=0, rows=[])
+         json_encoded_cols = self._json_encoded_cols(query.from_table)
+
+         with self.engine.connect() as conn:
+             count_query_str = text(query_to_sql(query, count=True))
+             logger.debug(f"count_query_str: {count_query_str}")
+             num_rows = list(conn.execute(count_query_str))[0][0]
+             logger.debug(f"num_rows: {num_rows}")
+             query_str = query_to_sql(query, **kwargs)  # include offset, limit
+             logger.debug(f"query_str: {query_str}")
+             rows = list(conn.execute(text(query_str)).mappings())
+             qr = QueryResult(query=query, num_rows=num_rows, rows=rows)
+             if json_encoded_cols:
+                 for row in qr.rows:
+                     for col in json_encoded_cols:
+                         if row[col]:
+                             if isinstance(row[col], list):
+                                 for i in range(len(row[col])):
+                                     try:
+                                         parsed_val = json.loads(row[col][i])
+                                     except json.JSONDecodeError as e:
+                                         logger.error(f"Failed to parse col {col}[{i}] == {row[col][i]}")
+                                         raise e
+                                     row[col][i] = parsed_val
+                             elif isinstance(row[col], dict):
+                                 pass
+                             else:
+                                 row[col] = json.loads(row[col])
+             qr.set_rows(pd.DataFrame(rows))
+             facet_columns = query.facet_slots
+             if query.include_facet_counts and not facet_columns:
+                 raise ValueError("Facet counts requested but no facet columns specified")
+             if facet_columns:
+                 raise NotImplementedError
+             return qr
+
+     @property
+     def supports_sql(self) -> bool:
+         """Return True - DuckDB supports raw SQL queries."""
+         return True
+
+     def execute_sql(self, sql: str, **kwargs) -> QueryResult:
+         """
+         Execute a raw SQL query against the DuckDB database.
+
+         :param sql: SQL query string
+         :param kwargs: Additional arguments
+         :return: QueryResult containing the results
+         """
+         with self.engine.connect() as conn:
+             result = conn.execute(text(sql))
+             rows = [dict(row._mapping) for row in result]
+             return QueryResult(num_rows=len(rows), rows=rows)
+
+     def init_collections(self):
+         # TODO: unify schema introspection
+         if not self.schema_view:
+             schema = introspect_schema(self.engine)
+         else:
+             schema = self.schema_view.schema
+         table_names = schema.classes.keys()
+         if self._collections is None:
+             self._collections = {}
+         for table_name in table_names:
+             if table_name not in self._collections:
+                 collection = DuckDBCollection(name=table_name, parent=self)
+                 self._collections[table_name] = collection
+
+     def induce_schema_view(self) -> SchemaView:
+         # TODO: unify schema introspection
+         # TODO: handle case where schema is provided in advance
+         logger.info(f"Inducing schema view for {self.metadata.handle} // {self}")
+         sb = SchemaBuilder()
+         schema = sb.schema
+         logger.info(f"Checking if {self.metadata.handle} is sqlite: {self._is_sqlite}")
+         if self._is_sqlite:
+             return SchemaView(schema)
+         query = Query(from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE"})
+         qr = self.query(query)
+         logger.info(f"Found {qr.num_rows} information_schema.tables // {qr.rows}")
+         if qr.num_rows:
+             table_names = [row["table_name"] for row in qr.rows]
+             for tbl in table_names:
+                 sb.add_class(tbl)
+             query = Query(from_table="information_schema.columns", sort_by=["ordinal_position"])
+             for row in self.query(query, limit=-1).rows:
+                 tbl_name = row["table_name"]
+                 if tbl_name not in sb.schema.classes:
+                     continue
+                 dt = row["data_type"]
+                 if dt.endswith("[]"):
+                     dt = dt[0:-2]
+                     multivalued = True
+                 else:
+                     multivalued = False
+                 rng = TYPE_MAP.get(dt, "string")
+                 sd = SlotDefinition(
+                     row["column_name"], required=row["is_nullable"] == "NO", multivalued=multivalued, range=rng
+                 )
+                 if dt == "JSON":
+                     sd.inlined_as_list = True
+                 sb.schema.classes[tbl_name].attributes[sd.name] = sd
+                 logger.info(f"Introspected slot: {tbl_name}.{sd.name}: {sd.range} FROM {dt}")
+         sb.add_defaults()
+         for cls_name in schema.classes:
+             if cls_name in self.metadata.collections:
+                 collection_metadata = self.metadata.collections[cls_name]
+                 if collection_metadata.attributes:
+                     del schema.classes[cls_name]
+                     cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
+                     schema.classes[cls.name] = cls
+         return SchemaView(schema)
+
+     def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
+         if target_format == "duckdb" or target_format == Format.SQLDUMP_DUCKDB:
+             path = Path(location)
+             if path.exists():
+                 if path.is_file():
+                     path.unlink()
+             with self.engine.connect() as conn:
+                 sql = text(f"EXPORT DATABASE '{location}'")
+                 conn.execute(sql)
+         else:
+             super().export_database(location, target_format=target_format, **kwargs)
+
+     def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
+         """
+         Import a database from a file or location.
+
+         :param location: location of the file
+         :param source_format: source format
+         :param kwargs: additional arguments
+         """
+         if source_format == Format.SQLDUMP_DUCKDB.value or source_format == Format.SQLDUMP_DUCKDB:
+             with self.engine.connect() as conn:
+                 sql = text(f"IMPORT DATABASE '{location}'")
+                 conn.execute(sql)
+                 conn.commit()
+         else:
+             super().import_database(location, source_format=source_format, **kwargs)
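Since `supports_sql` returns True, raw SQL can be pushed straight to DuckDB via `execute_sql`, and `export_database`/`import_database` delegate to DuckDB's native EXPORT DATABASE / IMPORT DATABASE statements for the duckdb dump format. A sketch, continuing the hypothetical `db` from earlier; the table name and paths are invented for illustration:

if db.supports_sql:
    qr = db.execute_sql("SELECT species, count(*) AS n FROM persons GROUP BY species")
    print(qr.rows)

# round-trip through DuckDB's dump format (Format.SQLDUMP_DUCKDB)
db.export_database("/tmp/store_dump", target_format="duckdb")
# other_db.import_database("/tmp/store_dump", source_format=Format.SQLDUMP_DUCKDB)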
linkml_store/api/stores/duckdb/mappings.py
@@ -0,0 +1,8 @@
+ import sqlalchemy as sqla
+
+ TMAP = {
+     "string": sqla.String,
+     "integer": sqla.Integer,
+     "float": sqla.Float,
+     "linkml:Any": sqla.JSON,
+ }
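`TMAP` is consulted in `DuckDBCollection._sqla_table` via `TMAP.get(att.range, sqla.String)`, so any LinkML range not listed here falls back to a string (VARCHAR) column. For example:

import sqlalchemy as sqla
from linkml_store.api.stores.duckdb.mappings import TMAP

assert TMAP.get("integer") is sqla.Integer
assert TMAP.get("date", sqla.String) is sqla.String  # unmapped ranges default to String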
linkml_store/api/stores/filesystem/__init__.py
@@ -0,0 +1,15 @@
+ """
+ Adapter for FileSystem wrapper
+
+ Handles have the form:
+
+ - ``file:<path>`` for a local file
+ """
+
+ from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
+ from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
+
+ __all__ = [
+     "FileSystemCollection",
+     "FileSystemDatabase",
+ ]
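As with the DuckDB adapter, the `file:<path>` handle form would be passed when attaching a filesystem-backed database to the client; a brief sketch under the same assumptions as the earlier examples (path and alias are illustrative):

db = client.attach_database("file:/tmp/my_store", alias="fs")
collection = db.create_collection("Person")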