linkml_store-0.2.5-py3-none-any.whl → linkml_store-0.2.9-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of linkml-store might be problematic.
- linkml_store/api/client.py +9 -6
- linkml_store/api/collection.py +118 -5
- linkml_store/api/database.py +45 -14
- linkml_store/api/stores/duckdb/duckdb_collection.py +176 -8
- linkml_store/api/stores/duckdb/duckdb_database.py +52 -19
- linkml_store/api/stores/filesystem/__init__.py +1 -1
- linkml_store/api/stores/mongodb/mongodb_collection.py +186 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +8 -3
- linkml_store/api/stores/solr/solr_collection.py +7 -1
- linkml_store/cli.py +202 -21
- linkml_store/index/implementations/llm_indexer.py +14 -6
- linkml_store/index/indexer.py +7 -4
- linkml_store/inference/implementations/llm_inference_engine.py +13 -9
- linkml_store/inference/implementations/rag_inference_engine.py +13 -10
- linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
- linkml_store/inference/inference_config.py +1 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/format_utils.py +183 -3
- linkml_store/utils/llm_utils.py +3 -1
- linkml_store/utils/pandas_utils.py +1 -1
- linkml_store/utils/sql_utils.py +7 -1
- linkml_store/utils/vector_utils.py +4 -11
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/METADATA +4 -3
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/RECORD +28 -26
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/WHEEL +1 -1
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/LICENSE +0 -0
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/entry_points.txt +0 -0
--- a/linkml_store/api/stores/duckdb/duckdb_database.py
+++ b/linkml_store/api/stores/duckdb/duckdb_database.py
@@ -1,7 +1,7 @@
 import json
 import logging
 from pathlib import Path
-from typing import Optional, Union
+from typing import Optional, Union, List

 import pandas as pd
 import sqlalchemy
@@ -14,7 +14,7 @@ from linkml_store.api import Database
 from linkml_store.api.queries import Query, QueryResult
 from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
 from linkml_store.utils.format_utils import Format
-from linkml_store.utils.sql_utils import introspect_schema, query_to_sql
+from linkml_store.utils.sql_utils import introspect_schema, query_to_sql, where_clause_to_sql

 TYPE_MAP = {
     "VARCHAR": "string",
@@ -62,7 +62,7 @@ class DuckDBDatabase(Database):
     def engine(self) -> sqlalchemy.Engine:
         if not self._engine:
             handle = self.handle
-            if not handle.startswith("duckdb://") and not handle.startswith(":"):
+            if not handle.startswith("duckdb://") and not handle.startswith(":") and "://" not in handle:
                 handle = f"duckdb:///{handle}"
             if ":memory:" not in handle:
                 # TODO: investigate this; duckdb appears to be prematurely caching
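Note: with this change, only bare filesystem paths get the `duckdb:///` prefix; handles that already carry a scheme (for example a `sqlite:` URL) or start with `:` (such as `:memory:`) are passed to SQLAlchemy unchanged. A minimal, self-contained sketch of that normalization logic (the function name is illustrative, not part of the library):

```python
def normalize_handle(handle: str) -> str:
    """Sketch of the revised normalization: only bare paths get the duckdb:/// prefix."""
    if not handle.startswith("duckdb://") and not handle.startswith(":") and "://" not in handle:
        handle = f"duckdb:///{handle}"
    return handle

assert normalize_handle("/tmp/db.duckdb") == "duckdb:////tmp/db.duckdb"
assert normalize_handle("sqlite:///tmp/other.db") == "sqlite:///tmp/other.db"  # now left as-is
assert normalize_handle(":memory:") == ":memory:"
```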
@@ -71,6 +71,10 @@ class DuckDBDatabase(Database):
             self._engine = sqlalchemy.create_engine(handle)
         return self._engine

+    @property
+    def _is_sqlite(self) -> bool:
+        return self.handle and self.handle.startswith("sqlite:")
+
     def commit(self, **kwargs):
         with self.engine.connect() as conn:
             conn.commit()
@@ -89,34 +93,60 @@ class DuckDBDatabase(Database):
             if not missing_ok:
                 raise FileNotFoundError(f"Database file not found: {path}")

-    def
+    def _table_exists(self, table: str) -> bool:
+        if self._is_sqlite:
+            if table == "sqlite_master":
+                return True
+            meta_query = Query(
+                from_table="sqlite_master",
+                where_clause={
+                    # "type": "table",
+                    "name": table,
+                },
+            )
+        else:
+            if table.startswith("information_schema"):
+                return True
+            meta_query = Query(
+                from_table="information_schema.tables",
+                where_clause={
+                    "table_type": "BASE TABLE",
+                    "table_name": table,
+                },
+            )
+
+        qr = self.query(meta_query)
+        if qr.num_rows == 0:
+            logger.debug(f"Table {self.alias} not created yet")
+            return False
+        return True
+
+    def _json_encoded_cols(self, table_name: str) -> Optional[List[str]]:
         json_encoded_cols = []
-        if
-            if
-
-
-        )
-        qr = self.query(meta_query)
-        if qr.num_rows == 0:
-            logger.debug(f"Table {query.from_table} not created yet")
-            return QueryResult(query=query, num_rows=0, rows=[])
-        if not query.from_table.startswith("information_schema"):
-            sv = self.schema_view
-        else:
-            sv = None
+        if table_name:
+            if table_name.startswith("information_schema") or table_name.startswith("sqlite"):
+                return []
+        sv = self.schema_view
         if sv:
             cd = None
             for c in self._collections.values():
-
-                if c.alias == query.from_table or c.target_class_name == query.from_table:
+                if c.alias == table_name or c.target_class_name == table_name:
                     cd = c.class_definition()
                     break
             if cd:
                 for att in sv.class_induced_slots(cd.name):
                     if att.inlined or att.inlined_as_list:
                         json_encoded_cols.append(att.name)
+        return json_encoded_cols
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        if not self._table_exists(query.from_table):
+            return QueryResult(query=query, num_rows=0, rows=[])
+        json_encoded_cols = self._json_encoded_cols(query.from_table)
+
         with self.engine.connect() as conn:
             count_query_str = text(query_to_sql(query, count=True))
+            logger.debug(f"count_query_str: {count_query_str}")
             num_rows = list(conn.execute(count_query_str))[0][0]
             logger.debug(f"num_rows: {num_rows}")
             query_str = query_to_sql(query, **kwargs)  # include offset, limit
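Note: the refactor splits the old monolithic `query()` into `_table_exists()` and `_json_encoded_cols()`, and the existence check now probes the right catalog per backend (`sqlite_master` for SQLite handles, `information_schema.tables` for DuckDB). A rough standalone illustration of those catalog probes, using plain SQLAlchemy; `table_exists` here is a throwaway helper, not part of linkml-store:

```python
from sqlalchemy import create_engine, text

def table_exists(engine, table: str, is_sqlite: bool) -> bool:
    # SQLite keeps its catalog in sqlite_master; DuckDB exposes the standard information_schema.
    if is_sqlite:
        sql = "SELECT name FROM sqlite_master WHERE name = :t"
    else:
        sql = ("SELECT table_name FROM information_schema.tables "
               "WHERE table_type = 'BASE TABLE' AND table_name = :t")
    with engine.connect() as conn:
        return conn.execute(text(sql), {"t": table}).first() is not None

engine = create_engine("sqlite:///:memory:")
print(table_exists(engine, "persons", is_sqlite=True))  # False: nothing created yet
```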
@@ -167,6 +197,9 @@ class DuckDBDatabase(Database):
         logger.info(f"Inducing schema view for {self.metadata.handle} // {self}")
         sb = SchemaBuilder()
         schema = sb.schema
+        logger.info(f"Checking if {self.metadata.handle} is sqlite: {self._is_sqlite}")
+        if self._is_sqlite:
+            return SchemaView(schema)
         query = Query(from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE"})
         qr = self.query(query)
         logger.info(f"Found {qr.num_rows} information_schema.tables // {qr.rows}")
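Note: for SQLite handles, schema induction now short-circuits and returns a view over the empty builder schema rather than introspecting `information_schema`, which SQLite does not provide. A minimal sketch of that fallback, assuming the same linkml_runtime classes the hunk uses:

```python
from linkml_runtime import SchemaView
from linkml_runtime.utils.schema_builder import SchemaBuilder

sb = SchemaBuilder()
schema = sb.schema
sv = SchemaView(schema)       # empty but valid SchemaView
print(len(sv.all_classes()))  # 0 -- no classes induced for sqlite backends
```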
--- a/linkml_store/api/stores/filesystem/__init__.py
+++ b/linkml_store/api/stores/filesystem/__init__.py
@@ -4,7 +4,7 @@ Adapter for FileSystem wrapper
 Handles have the form:

 - ``file:<path>`` for a local file
-
+"""

 from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
 from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
--- a/linkml_store/api/stores/mongodb/mongodb_collection.py
+++ b/linkml_store/api/stores/mongodb/mongodb_collection.py
@@ -41,6 +41,92 @@ class MongoDBCollection(Collection):
                 del obj["_id"]
         self._post_insert_hook(objs)

+    def index(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        index_name: Optional[str] = None,
+        replace: bool = False,
+        unique: bool = False,
+        **kwargs,
+    ):
+        """
+        Create indexes on the collection.
+
+        :param objs: Field(s) to index.
+        :param index_name: Optional name for the index.
+        :param replace: If True, the index will be dropped and recreated.
+        :param unique: If True, creates a unique index (default: False).
+        """
+
+        if not isinstance(objs, list):
+            objs = [objs]
+
+        existing_indexes = self.mongo_collection.index_information()
+
+        for obj in objs:
+            field_exists = False
+            index_to_drop = None
+
+            # Extract existing index details
+            for index_name_existing, index_details in existing_indexes.items():
+                indexed_fields = [field[0] for field in index_details.get("key", [])]  # Extract field names
+
+                if obj in indexed_fields:  # If this field is already indexed
+                    field_exists = True
+                    index_to_drop = index_name_existing if replace else None
+
+            # Drop the index if replace=True and index_to_drop is valid
+            if index_to_drop:
+                self.mongo_collection.drop_index(index_to_drop)
+                logging.debug(f"Dropped existing index: {index_to_drop}")
+
+            # Create the new index only if it doesn't exist or was dropped
+            if not field_exists or replace:
+                self.mongo_collection.create_index(obj, name=index_name, unique=unique)
+                logging.debug(f"Created new index: {index_name} on field {obj}, unique={unique}")
+            else:
+                logging.debug(f"Index already exists for field {obj}, skipping creation.")
+
+    def upsert(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        filter_fields: List[str],
+        update_fields: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        """
+        Upsert one or more documents into the MongoDB collection.
+
+        :param objs: The document(s) to insert or update.
+        :param filter_fields: List of field names to use as the filter for matching existing documents.
+        :param update_fields: List of field names to include in the update. If None, all fields are updated.
+        """
+        if not isinstance(objs, list):
+            objs = [objs]
+
+        for obj in objs:
+            # Ensure filter fields exist in the object
+            filter_criteria = {field: obj[field] for field in filter_fields if field in obj}
+            if not filter_criteria:
+                raise ValueError("At least one valid filter field must be present in each object.")
+
+            # Check if a document already exists
+            existing_doc = self.mongo_collection.find_one(filter_criteria)
+
+            if existing_doc:
+                # Update only changed fields
+                updates = {key: obj[key] for key in update_fields if key in obj and obj[key] != existing_doc.get(key)}
+
+                if updates:
+                    self.mongo_collection.update_one(filter_criteria, {"$set": updates})
+                    logging.debug(f"Updated existing document: {filter_criteria} with {updates}")
+                else:
+                    logging.debug(f"No changes detected for document: {filter_criteria}. Skipping update.")
+            else:
+                # Insert a new document
+                self.mongo_collection.insert_one(obj)
+                logging.debug(f"Inserted new document: {obj}")
+
     def query(self, query: Query, limit: Optional[int] = None, offset: Optional[int] = None, **kwargs) -> QueryResult:
         mongo_filter = self._build_mongo_filter(query.where_clause)
         limit = limit or query.limit
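Note: `index()` and `upsert()` are new public methods on `MongoDBCollection`. A hypothetical usage sketch; it assumes the `Client.attach_database` / `Database.create_collection` / `Collection.insert` API behaves as in the linkml-store documentation, and the connection string, collection name, and data are made up:

```python
from linkml_store import Client

client = Client()
db = client.attach_database("mongodb://localhost:27017/demo", alias="demo")
persons = db.create_collection("Person")

persons.insert([{"id": "P1", "name": "Alice"}, {"id": "P2", "name": "Bob"}])

# Ensure a unique index on "id"; replace=True drops and recreates an existing index.
persons.index("id", unique=True, replace=True)

# Insert-or-update keyed on "id", touching only "name" when it actually changed.
persons.upsert({"id": "P2", "name": "Bobby"}, filter_fields=["id"], update_fields=["name"])
```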
@@ -81,6 +167,8 @@ class MongoDBCollection(Collection):
         facet_limit=DEFAULT_FACET_LIMIT,
         **kwargs,
     ) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
+        if facet_limit is None:
+            facet_limit = DEFAULT_FACET_LIMIT
         results = {}
         if not facet_columns:
             facet_columns = list(self.class_definition().attributes.keys())
@@ -177,3 +265,101 @@ class MongoDBCollection(Collection):
         if deleted_rows_count == 0 and not missing_ok:
             raise ValueError(f"No rows found for {where}")
         return deleted_rows_count
+
+    def group_by(
+        self,
+        group_by_fields: List[str],
+        inlined_field="objects",
+        agg_map: Optional[Dict[str, str]] = None,
+        where: Optional[Dict] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Group objects in the collection by specified fields using MongoDB's aggregation pipeline.
+
+        This implementation leverages MongoDB's native aggregation capabilities for efficient grouping.
+
+        :param group_by_fields: List of fields to group by
+        :param inlined_field: Field name to store aggregated objects
+        :param agg_map: Dictionary mapping aggregation types to fields
+        :param where: Filter conditions
+        :param kwargs: Additional arguments
+        :return: Query result containing grouped data
+        """
+        if isinstance(group_by_fields, str):
+            group_by_fields = [group_by_fields]
+
+        # Build the group key for MongoDB
+        if len(group_by_fields) == 1:
+            # Single field grouping
+            group_id = f"${group_by_fields[0]}"
+        else:
+            # Multi-field grouping
+            group_id = {field: f"${field}" for field in group_by_fields}
+
+        # Start building the pipeline
+        pipeline = []
+
+        # Add match stage if where clause is provided
+        if where:
+            pipeline.append({"$match": where})
+
+        # Add the group stage
+        group_stage = {
+            "$group": {
+                "_id": group_id,
+                "objects": {"$push": "$$ROOT"}
+            }
+        }
+        pipeline.append(group_stage)

+        # Execute the aggregation
+        logger.debug(f"MongoDB group_by pipeline: {pipeline}")
+        aggregation_results = list(self.mongo_collection.aggregate(pipeline))
+
+        # Transform the results to match the expected format
+        results = []
+        for result in aggregation_results:
+            # Skip null groups if needed
+            if result["_id"] is None and kwargs.get("skip_nulls", False):
+                continue
+
+            # Create the group object
+            if isinstance(result["_id"], dict):
+                # Multi-field grouping
+                group_obj = result["_id"]
+            else:
+                # Single field grouping
+                group_obj = {group_by_fields[0]: result["_id"]}
+
+            # Add the grouped objects
+            objects = result["objects"]
+
+            # Remove MongoDB _id field from each object
+            for obj in objects:
+                if "_id" in obj:
+                    del obj["_id"]
+
+            # Apply any field selection or transformations based on agg_map
+            if agg_map:
+                # Get first fields (fields to keep as single values)
+                first_fields = agg_map.get("first", [])
+                if first_fields:
+                    # These are already in the group_obj from the _id
+                    pass
+
+                # Get list fields (fields to aggregate as lists)
+                list_fields = agg_map.get("list", [])
+                if list_fields:
+                    # Filter objects to only include specified fields
+                    objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]
+                elif not list_fields and first_fields:
+                    # If list_fields is empty but first_fields is specified,
+                    # filter out first_fields from objects to avoid duplication
+                    objects = [{k: v for k, v in obj.items() if k not in first_fields} for obj in objects]
+
+            # Add the objects to the group
+            group_obj[inlined_field] = objects
+            results.append(group_obj)
+
+        return QueryResult(num_rows=len(results), rows=results)
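Note: the new `group_by()` delegates grouping to MongoDB's aggregation framework. The pipeline it builds has the shape below; this sketch is pure Python (no server needed) and the field names and filter are illustrative only:

```python
# Shape of the aggregation pipeline group_by() assembles (illustrative values).
group_by_fields = ["species", "habitat"]
where = {"status": "active"}

group_id = (
    f"${group_by_fields[0]}"
    if len(group_by_fields) == 1
    else {field: f"${field}" for field in group_by_fields}
)

pipeline = []
if where:
    pipeline.append({"$match": where})
pipeline.append({"$group": {"_id": group_id, "objects": {"$push": "$$ROOT"}}})

print(pipeline)
# [{'$match': {'status': 'active'}},
#  {'$group': {'_id': {'species': '$species', 'habitat': '$habitat'},
#              'objects': {'$push': '$$ROOT'}}}]   (reformatted)
```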
--- a/linkml_store/api/stores/mongodb/mongodb_database.py
+++ b/linkml_store/api/stores/mongodb/mongodb_database.py
@@ -3,6 +3,7 @@
 import logging
 from pathlib import Path
 from typing import Optional, Union
+from urllib.parse import urlparse

 from pymongo import MongoClient
 from pymongo.database import Database as NativeDatabase
@@ -38,10 +39,14 @@ class MongoDBDatabase(Database):
     @property
     def _db_name(self) -> str:
         if self.handle:
-
+            parsed_url = urlparse(self.handle)
+            path_parts = parsed_url.path.lstrip("/").split("?")[0].split("/")
+            db_name = path_parts[0] if path_parts else "default"
+            if not db_name:
+                db_name = self.alias
         else:
-
-            return
+            db_name = "default"
+        return db_name

     @property
     def native_client(self) -> MongoClient:
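Note: `_db_name` now derives the database name from the connection string with `urlparse` instead of naive string handling. A stdlib-only sketch of how that parsing behaves (the helper name and handles are illustrative; unlike the real property, this sketch falls back to "default" rather than `self.alias` when the path is empty):

```python
from urllib.parse import urlparse

def db_name_from_handle(handle: str) -> str:
    parsed = urlparse(handle)
    path_parts = parsed.path.lstrip("/").split("?")[0].split("/")
    return (path_parts[0] if path_parts else "default") or "default"

print(db_name_from_handle("mongodb://localhost:27017/newsfeed?retryWrites=true"))  # newsfeed
print(db_name_from_handle("mongodb://localhost:27017/"))                           # default
```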
--- a/linkml_store/api/stores/solr/solr_collection.py
+++ b/linkml_store/api/stores/solr/solr_collection.py
@@ -62,12 +62,18 @@ class SolrCollection(Collection):
         return QueryResult(query=query, num_rows=num_rows, rows=rows)

     def query_facets(
-        self,
+        self,
+        where: Optional[Dict] = None,
+        facet_columns: List[str] = None,
+        facet_limit=DEFAULT_FACET_LIMIT,
+        facet_min_count: int = 1,
+        **kwargs,
     ) -> Dict[str, Dict[str, int]]:
         solr_query = self._build_solr_query(where)
         solr_query["facet"] = "true"
         solr_query["facet.field"] = facet_columns
         solr_query["facet.limit"] = facet_limit
+        solr_query["facet.mincount"] = facet_min_count

         logger.info(f"Querying Solr collection {self.alias} for facets with query: {solr_query}")

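Note: `query_facets` gains an explicit signature and a `facet_min_count` parameter, which maps onto Solr's standard `facet.mincount` option so zero-count buckets can be dropped. A rough sketch of an equivalent raw Solr request; the URL, core name, and field names are placeholders:

```python
import requests

params = {
    "q": "*:*",
    "rows": 0,
    "facet": "true",
    "facet.field": ["species", "habitat"],  # sent as repeated facet.field params
    "facet.limit": 100,
    "facet.mincount": 1,                    # new: suppress zero-count facet values
    "wt": "json",
}
resp = requests.get("http://localhost:8983/solr/my_core/select", params=params)
print(resp.json()["facet_counts"]["facet_fields"])
```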