linkml-store 0.2.5 → 0.2.9 (py3-none-any.whl)

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of linkml-store has been flagged as potentially problematic.

Files changed (28)
  1. linkml_store/api/client.py +9 -6
  2. linkml_store/api/collection.py +118 -5
  3. linkml_store/api/database.py +45 -14
  4. linkml_store/api/stores/duckdb/duckdb_collection.py +176 -8
  5. linkml_store/api/stores/duckdb/duckdb_database.py +52 -19
  6. linkml_store/api/stores/filesystem/__init__.py +1 -1
  7. linkml_store/api/stores/mongodb/mongodb_collection.py +186 -0
  8. linkml_store/api/stores/mongodb/mongodb_database.py +8 -3
  9. linkml_store/api/stores/solr/solr_collection.py +7 -1
  10. linkml_store/cli.py +202 -21
  11. linkml_store/index/implementations/llm_indexer.py +14 -6
  12. linkml_store/index/indexer.py +7 -4
  13. linkml_store/inference/implementations/llm_inference_engine.py +13 -9
  14. linkml_store/inference/implementations/rag_inference_engine.py +13 -10
  15. linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
  16. linkml_store/inference/inference_config.py +1 -0
  17. linkml_store/utils/dat_parser.py +95 -0
  18. linkml_store/utils/enrichment_analyzer.py +217 -0
  19. linkml_store/utils/format_utils.py +183 -3
  20. linkml_store/utils/llm_utils.py +3 -1
  21. linkml_store/utils/pandas_utils.py +1 -1
  22. linkml_store/utils/sql_utils.py +7 -1
  23. linkml_store/utils/vector_utils.py +4 -11
  24. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/METADATA +4 -3
  25. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/RECORD +28 -26
  26. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/WHEEL +1 -1
  27. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/LICENSE +0 -0
  28. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/entry_points.txt +0 -0
linkml_store/api/stores/duckdb/duckdb_database.py

@@ -1,7 +1,7 @@
 import json
 import logging
 from pathlib import Path
-from typing import Optional, Union
+from typing import Optional, Union, List

 import pandas as pd
 import sqlalchemy
@@ -14,7 +14,7 @@ from linkml_store.api import Database
 from linkml_store.api.queries import Query, QueryResult
 from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
 from linkml_store.utils.format_utils import Format
-from linkml_store.utils.sql_utils import introspect_schema, query_to_sql
+from linkml_store.utils.sql_utils import introspect_schema, query_to_sql, where_clause_to_sql

 TYPE_MAP = {
     "VARCHAR": "string",
@@ -62,7 +62,7 @@ class DuckDBDatabase(Database):
     def engine(self) -> sqlalchemy.Engine:
         if not self._engine:
             handle = self.handle
-            if not handle.startswith("duckdb://") and not handle.startswith(":"):
+            if not handle.startswith("duckdb://") and not handle.startswith(":") and "://" not in handle:
                 handle = f"duckdb:///{handle}"
             if ":memory:" not in handle:
                 # TODO: investigate this; duckdb appears to be prematurely caching
@@ -71,6 +71,10 @@ class DuckDBDatabase(Database):
             self._engine = sqlalchemy.create_engine(handle)
         return self._engine

+    @property
+    def _is_sqlite(self) -> bool:
+        return self.handle and self.handle.startswith("sqlite:")
+
     def commit(self, **kwargs):
         with self.engine.connect() as conn:
             conn.commit()
@@ -89,34 +93,60 @@ class DuckDBDatabase(Database):
         if not missing_ok:
             raise FileNotFoundError(f"Database file not found: {path}")

-    def query(self, query: Query, **kwargs) -> QueryResult:
+    def _table_exists(self, table: str) -> bool:
+        if self._is_sqlite:
+            if table == "sqlite_master":
+                return True
+            meta_query = Query(
+                from_table="sqlite_master",
+                where_clause={
+                    # "type": "table",
+                    "name": table,
+                },
+            )
+        else:
+            if table.startswith("information_schema"):
+                return True
+            meta_query = Query(
+                from_table="information_schema.tables",
+                where_clause={
+                    "table_type": "BASE TABLE",
+                    "table_name": table,
+                },
+            )
+
+        qr = self.query(meta_query)
+        if qr.num_rows == 0:
+            logger.debug(f"Table {self.alias} not created yet")
+            return False
+        return True
+
+    def _json_encoded_cols(self, table_name: str) -> Optional[List[str]]:
         json_encoded_cols = []
-        if query.from_table:
-            if not query.from_table.startswith("information_schema"):
-                meta_query = Query(
-                    from_table="information_schema.tables", where_clause={"table_name": query.from_table}
-                )
-                qr = self.query(meta_query)
-                if qr.num_rows == 0:
-                    logger.debug(f"Table {query.from_table} not created yet")
-                    return QueryResult(query=query, num_rows=0, rows=[])
-            if not query.from_table.startswith("information_schema"):
-                sv = self.schema_view
-            else:
-                sv = None
+        if table_name:
+            if table_name.startswith("information_schema") or table_name.startswith("sqlite"):
+                return []
+            sv = self.schema_view
             if sv:
                 cd = None
                 for c in self._collections.values():
-                    # if c.name == query.from_table or c.metadata.alias == query.from_table:
-                    if c.alias == query.from_table or c.target_class_name == query.from_table:
+                    if c.alias == table_name or c.target_class_name == table_name:
                         cd = c.class_definition()
                         break
                 if cd:
                     for att in sv.class_induced_slots(cd.name):
                         if att.inlined or att.inlined_as_list:
                             json_encoded_cols.append(att.name)
+        return json_encoded_cols
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        if not self._table_exists(query.from_table):
+            return QueryResult(query=query, num_rows=0, rows=[])
+        json_encoded_cols = self._json_encoded_cols(query.from_table)
+
         with self.engine.connect() as conn:
             count_query_str = text(query_to_sql(query, count=True))
+            logger.debug(f"count_query_str: {count_query_str}")
             num_rows = list(conn.execute(count_query_str))[0][0]
             logger.debug(f"num_rows: {num_rows}")
             query_str = query_to_sql(query, **kwargs)  # include offset, limit
@@ -167,6 +197,9 @@ class DuckDBDatabase(Database):
         logger.info(f"Inducing schema view for {self.metadata.handle} // {self}")
         sb = SchemaBuilder()
         schema = sb.schema
+        logger.info(f"Checking if {self.metadata.handle} is sqlite: {self._is_sqlite}")
+        if self._is_sqlite:
+            return SchemaView(schema)
         query = Query(from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE"})
         qr = self.query(query)
         logger.info(f"Found {qr.num_rows} information_schema.tables // {qr.rows}")
linkml_store/api/stores/filesystem/__init__.py

@@ -4,7 +4,7 @@ Adapter for FileSystem wrapper
 Handles have the form:

 - ``file:<path>`` for a local file
-"""
+"""

 from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
 from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
linkml_store/api/stores/mongodb/mongodb_collection.py

@@ -41,6 +41,92 @@ class MongoDBCollection(Collection):
                 del obj["_id"]
         self._post_insert_hook(objs)

+    def index(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        index_name: Optional[str] = None,
+        replace: bool = False,
+        unique: bool = False,
+        **kwargs,
+    ):
+        """
+        Create indexes on the collection.
+
+        :param objs: Field(s) to index.
+        :param index_name: Optional name for the index.
+        :param replace: If True, the index will be dropped and recreated.
+        :param unique: If True, creates a unique index (default: False).
+        """
+
+        if not isinstance(objs, list):
+            objs = [objs]
+
+        existing_indexes = self.mongo_collection.index_information()
+
+        for obj in objs:
+            field_exists = False
+            index_to_drop = None
+
+            # Extract existing index details
+            for index_name_existing, index_details in existing_indexes.items():
+                indexed_fields = [field[0] for field in index_details.get("key", [])]  # Extract field names
+
+                if obj in indexed_fields:  # If this field is already indexed
+                    field_exists = True
+                    index_to_drop = index_name_existing if replace else None
+
+            # Drop the index if replace=True and index_to_drop is valid
+            if index_to_drop:
+                self.mongo_collection.drop_index(index_to_drop)
+                logging.debug(f"Dropped existing index: {index_to_drop}")
+
+            # Create the new index only if it doesn't exist or was dropped
+            if not field_exists or replace:
+                self.mongo_collection.create_index(obj, name=index_name, unique=unique)
+                logging.debug(f"Created new index: {index_name} on field {obj}, unique={unique}")
+            else:
+                logging.debug(f"Index already exists for field {obj}, skipping creation.")
+
+    def upsert(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        filter_fields: List[str],
+        update_fields: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        """
+        Upsert one or more documents into the MongoDB collection.
+
+        :param objs: The document(s) to insert or update.
+        :param filter_fields: List of field names to use as the filter for matching existing documents.
+        :param update_fields: List of field names to include in the update. If None, all fields are updated.
+        """
+        if not isinstance(objs, list):
+            objs = [objs]
+
+        for obj in objs:
+            # Ensure filter fields exist in the object
+            filter_criteria = {field: obj[field] for field in filter_fields if field in obj}
+            if not filter_criteria:
+                raise ValueError("At least one valid filter field must be present in each object.")
+
+            # Check if a document already exists
+            existing_doc = self.mongo_collection.find_one(filter_criteria)
+
+            if existing_doc:
+                # Update only changed fields
+                updates = {key: obj[key] for key in update_fields if key in obj and obj[key] != existing_doc.get(key)}
+
+                if updates:
+                    self.mongo_collection.update_one(filter_criteria, {"$set": updates})
+                    logging.debug(f"Updated existing document: {filter_criteria} with {updates}")
+                else:
+                    logging.debug(f"No changes detected for document: {filter_criteria}. Skipping update.")
+            else:
+                # Insert a new document
+                self.mongo_collection.insert_one(obj)
+                logging.debug(f"Inserted new document: {obj}")
+
     def query(self, query: Query, limit: Optional[int] = None, offset: Optional[int] = None, **kwargs) -> QueryResult:
         mongo_filter = self._build_mongo_filter(query.where_clause)
         limit = limit or query.limit
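The new MongoDBCollection.index and upsert methods wrap pymongo's index_information()/create_index() and find_one()/update_one()/insert_one() calls. A minimal usage sketch, assuming a collection obtained through the usual client API (handle, alias and field names below are hypothetical):

    from linkml_store import Client

    client = Client()
    db = client.attach_database("mongodb://localhost:27017/demo", alias="demo")
    persons = db.create_collection("Person", alias="persons")

    # Create a unique single-field index on "id"; replace=True drops and
    # recreates it if "id" is already indexed.
    persons.index("id", index_name="id_idx", replace=True, unique=True)

    # Insert if no document matches id == "P1"; otherwise update only the
    # listed fields whose values differ from the stored document.
    persons.upsert(
        {"id": "P1", "name": "Alice", "age": 31},
        filter_fields=["id"],
        update_fields=["name", "age"],
    )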
@@ -81,6 +167,8 @@ class MongoDBCollection(Collection):
         facet_limit=DEFAULT_FACET_LIMIT,
         **kwargs,
     ) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
+        if facet_limit is None:
+            facet_limit = DEFAULT_FACET_LIMIT
         results = {}
         if not facet_columns:
             facet_columns = list(self.class_definition().attributes.keys())
@@ -177,3 +265,101 @@ class MongoDBCollection(Collection):
         if deleted_rows_count == 0 and not missing_ok:
             raise ValueError(f"No rows found for {where}")
         return deleted_rows_count
+
+    def group_by(
+        self,
+        group_by_fields: List[str],
+        inlined_field="objects",
+        agg_map: Optional[Dict[str, str]] = None,
+        where: Optional[Dict] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Group objects in the collection by specified fields using MongoDB's aggregation pipeline.
+
+        This implementation leverages MongoDB's native aggregation capabilities for efficient grouping.
+
+        :param group_by_fields: List of fields to group by
+        :param inlined_field: Field name to store aggregated objects
+        :param agg_map: Dictionary mapping aggregation types to fields
+        :param where: Filter conditions
+        :param kwargs: Additional arguments
+        :return: Query result containing grouped data
+        """
+        if isinstance(group_by_fields, str):
+            group_by_fields = [group_by_fields]
+
+        # Build the group key for MongoDB
+        if len(group_by_fields) == 1:
+            # Single field grouping
+            group_id = f"${group_by_fields[0]}"
+        else:
+            # Multi-field grouping
+            group_id = {field: f"${field}" for field in group_by_fields}
+
+        # Start building the pipeline
+        pipeline = []
+
+        # Add match stage if where clause is provided
+        if where:
+            pipeline.append({"$match": where})
+
+        # Add the group stage
+        group_stage = {
+            "$group": {
+                "_id": group_id,
+                "objects": {"$push": "$$ROOT"}
+            }
+        }
+        pipeline.append(group_stage)
+
+        # Execute the aggregation
+        logger.debug(f"MongoDB group_by pipeline: {pipeline}")
+        aggregation_results = list(self.mongo_collection.aggregate(pipeline))
+
+        # Transform the results to match the expected format
+        results = []
+        for result in aggregation_results:
+            # Skip null groups if needed
+            if result["_id"] is None and kwargs.get("skip_nulls", False):
+                continue
+
+            # Create the group object
+            if isinstance(result["_id"], dict):
+                # Multi-field grouping
+                group_obj = result["_id"]
+            else:
+                # Single field grouping
+                group_obj = {group_by_fields[0]: result["_id"]}
+
+            # Add the grouped objects
+            objects = result["objects"]
+
+            # Remove MongoDB _id field from each object
+            for obj in objects:
+                if "_id" in obj:
+                    del obj["_id"]
+
+            # Apply any field selection or transformations based on agg_map
+            if agg_map:
+                # Get first fields (fields to keep as single values)
+                first_fields = agg_map.get("first", [])
+                if first_fields:
+                    # These are already in the group_obj from the _id
+                    pass
+
+                # Get list fields (fields to aggregate as lists)
+                list_fields = agg_map.get("list", [])
+                if list_fields:
+                    # Filter objects to only include specified fields
+                    objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]
+                elif not list_fields and first_fields:
+                    # If list_fields is empty but first_fields is specified,
+                    # filter out first_fields from objects to avoid duplication
+                    objects = [{k: v for k, v in obj.items() if k not in first_fields} for obj in objects]
+
+            # Add the objects to the group
+            group_obj[inlined_field] = objects
+            results.append(group_obj)
+
+        return QueryResult(num_rows=len(results), rows=results)
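A sketch of the new group_by method on the same hypothetical persons collection; the agg_map keys "first" and "list" are the ones the implementation reads:

    # Group persons by country; where is passed through as a $match stage.
    result = persons.group_by(
        ["country"],
        inlined_field="members",
        agg_map={"first": ["country"], "list": ["name"]},
        where={"age": {"$gte": 18}},
    )
    for row in result.rows:
        # Each row is {"country": ..., "members": [{"name": ...}, ...]}
        print(row["country"], [m["name"] for m in row["members"]])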
linkml_store/api/stores/mongodb/mongodb_database.py

@@ -3,6 +3,7 @@
 import logging
 from pathlib import Path
 from typing import Optional, Union
+from urllib.parse import urlparse

 from pymongo import MongoClient
 from pymongo.database import Database as NativeDatabase
@@ -38,10 +39,14 @@ class MongoDBDatabase(Database):
     @property
     def _db_name(self) -> str:
         if self.handle:
-            db = self.handle.split("/")[-1]
+            parsed_url = urlparse(self.handle)
+            path_parts = parsed_url.path.lstrip("/").split("?")[0].split("/")
+            db_name = path_parts[0] if path_parts else "default"
+            if not db_name:
+                db_name = self.alias
         else:
-            db = "default"
-        return db
+            db_name = "default"
+        return db_name

     @property
     def native_client(self) -> MongoClient:
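The reworked _db_name derives the database name from the URL path via urlparse, so query parameters no longer leak into the name. For a hypothetical handle:

    from urllib.parse import urlparse

    handle = "mongodb://localhost:27017/clinical?authSource=admin"
    parsed = urlparse(handle)      # parsed.path == "/clinical"
    db_name = parsed.path.lstrip("/").split("?")[0].split("/")[0]
    print(db_name)                 # -> "clinical"
    # The previous handle.split("/")[-1] would have yielded
    # "clinical?authSource=admin" for this handle.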
linkml_store/api/stores/solr/solr_collection.py

@@ -62,12 +62,18 @@ class SolrCollection(Collection):
         return QueryResult(query=query, num_rows=num_rows, rows=rows)

     def query_facets(
-        self, where: Optional[Dict] = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+        self,
+        where: Optional[Dict] = None,
+        facet_columns: List[str] = None,
+        facet_limit=DEFAULT_FACET_LIMIT,
+        facet_min_count: int = 1,
+        **kwargs,
     ) -> Dict[str, Dict[str, int]]:
         solr_query = self._build_solr_query(where)
         solr_query["facet"] = "true"
         solr_query["facet.field"] = facet_columns
         solr_query["facet.limit"] = facet_limit
+        solr_query["facet.mincount"] = facet_min_count

         logger.info(f"Querying Solr collection {self.alias} for facets with query: {solr_query}")
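The Solr query_facets signature gains facet_min_count, mapped onto Solr's facet.mincount parameter. A hedged usage sketch (collection setup omitted; field names are hypothetical):

    # solr_collection: a SolrCollection obtained via the client (setup omitted).
    # Only facet values occurring at least twice are returned.
    facets = solr_collection.query_facets(
        where={"species": "Homo sapiens"},
        facet_columns=["tissue", "disease"],
        facet_limit=20,
        facet_min_count=2,
    )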