linkml-store 0.2.6__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of linkml-store might be problematic.

Files changed (28)
  1. linkml_store/api/client.py +2 -3
  2. linkml_store/api/collection.py +63 -8
  3. linkml_store/api/database.py +30 -2
  4. linkml_store/api/stores/duckdb/duckdb_collection.py +165 -3
  5. linkml_store/api/stores/duckdb/duckdb_database.py +3 -3
  6. linkml_store/api/stores/filesystem/__init__.py +1 -1
  7. linkml_store/api/stores/mongodb/mongodb_collection.py +115 -12
  8. linkml_store/api/stores/mongodb/mongodb_database.py +2 -1
  9. linkml_store/api/stores/solr/solr_collection.py +7 -1
  10. linkml_store/cli.py +201 -20
  11. linkml_store/index/implementations/llm_indexer.py +14 -6
  12. linkml_store/index/indexer.py +7 -4
  13. linkml_store/inference/implementations/llm_inference_engine.py +13 -9
  14. linkml_store/inference/implementations/rag_inference_engine.py +13 -10
  15. linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
  16. linkml_store/inference/inference_config.py +1 -0
  17. linkml_store/utils/dat_parser.py +95 -0
  18. linkml_store/utils/enrichment_analyzer.py +217 -0
  19. linkml_store/utils/format_utils.py +124 -3
  20. linkml_store/utils/llm_utils.py +3 -1
  21. linkml_store/utils/pandas_utils.py +1 -1
  22. linkml_store/utils/sql_utils.py +1 -1
  23. linkml_store/utils/vector_utils.py +3 -10
  24. {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/METADATA +3 -1
  25. {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/RECORD +28 -26
  26. {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/WHEEL +1 -1
  27. {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/LICENSE +0 -0
  28. {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/entry_points.txt +0 -0
linkml_store/api/client.py

@@ -12,7 +12,6 @@ from linkml_store.api.config import ClientConfig
 logger = logging.getLogger(__name__)


-
 HANDLE_MAP = {
     "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
     "sqlite": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
@@ -220,14 +219,14 @@ class Client:
         scheme, _ = handle.split(":", 1)
         if scheme not in HANDLE_MAP:
             raise ValueError(f"Unknown scheme: {scheme}")
-        module_path, class_name = HANDLE_MAP[scheme].rsplit('.', 1)
+        module_path, class_name = HANDLE_MAP[scheme].rsplit(".", 1)
         try:
             module = importlib.import_module(module_path)
             cls = getattr(module, class_name)
         except ImportError as e:
             raise ImportError(f"Failed to import {scheme} database. Make sure the correct extras are installed: {e}")

-        #cls = HANDLE_MAP[scheme]
+        # cls = HANDLE_MAP[scheme]
         db = cls(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs)
         if schema_view:
             db.set_schema_view(schema_view)
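
For orientation, the dispatch logic in the second hunk resolves a handle's scheme prefix to a backend class via HANDLE_MAP and importlib; a minimal standalone sketch of the same pattern (the handle value is hypothetical):

    import importlib

    HANDLE_MAP = {
        "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
        "sqlite": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
    }

    handle = "duckdb:///:memory:"  # hypothetical handle
    scheme, _ = handle.split(":", 1)
    module_path, class_name = HANDLE_MAP[scheme].rsplit(".", 1)
    # an ImportError here is the "missing extras" case reported by the improved message above
    module = importlib.import_module(module_path)
    cls = getattr(module, class_name)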
linkml_store/api/collection.py

@@ -211,7 +211,7 @@ class Collection(Generic[DatabaseType]):
         """
         raise NotImplementedError

-    def index (
+    def index(
         self,
         objs: Union[OBJECT, List[OBJECT]],
         index_name: Optional[str] = None,
@@ -231,10 +231,13 @@ class Collection(Generic[DatabaseType]):
         """
         raise NotImplementedError

-    def upsert(self,
-               objs: Union[OBJECT, List[OBJECT]],
-               filter_fields: List[str],
-               update_fields: Union[List[str], None] = None, **kwargs):
+    def upsert(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        filter_fields: List[str],
+        update_fields: Union[List[str], None] = None,
+        **kwargs,
+    ):
         """
         Add one or more objects to the collection.

@@ -454,7 +457,12 @@ class Collection(Generic[DatabaseType]):
             return qr.rows[0]
         return None

-    def find(self, where: Optional[Any] = None, **kwargs) -> QueryResult:
+    def find(
+        self,
+        where: Optional[Any] = None,
+        select_cols: Optional[List[str]] = None,
+        **kwargs,
+    ) -> QueryResult:
         """
         Find objects in the collection using a where query.

@@ -484,10 +492,14 @@ class Collection(Generic[DatabaseType]):


         :param where:
+        :param select_cols:
         :param kwargs:
         :return:
         """
-        query = self._create_query(where_clause=where)
+        query = self._create_query(
+            where_clause=where,
+            select_cols=select_cols,
+        )
         self._pre_query_hook(query)
         return self.query(query, **kwargs)

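As a usage note, the widened find() signature lets callers project specific columns; a hedged sketch (collection, field names, and values are hypothetical):

    qr = collection.find(
        where={"continent": "Europe"},
        select_cols=["name", "capital"],
        limit=10,
    )
    for row in qr.rows:
        print(row)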
@@ -587,6 +599,7 @@ class Collection(Generic[DatabaseType]):
         assert ix_coll.size() > 0
         qr = ix_coll.find(where=where, limit=-1, **kwargs)
         index_col = ix.index_field
+
         # TODO: optimize this for large indexes
         def row2array(row):
             v = row[index_col]
@@ -594,6 +607,7 @@ class Collection(Generic[DatabaseType]):
                 # sqlite stores arrays as strings
                 v = json.loads(v)
             return np.array(v, dtype=float)
+
         vector_pairs = [(row, row2array(row)) for row in qr.rows]
         results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
         for r in results:
@@ -608,6 +622,47 @@ class Collection(Generic[DatabaseType]):
         new_qr.rows = [r[1] for r in results]
         return new_qr

+    def group_by(
+        self,
+        group_by_fields: List[str],
+        inlined_field="objects",
+        agg_map: Optional[Dict[str, str]] = None,
+        where: Optional[Dict] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Group objects in the collection by a column.
+
+        :param group_by:
+        :param where:
+        :param kwargs:
+        :return:
+        """
+        if isinstance(group_by_fields, str):
+            group_by_fields = [group_by_fields]
+        df = self.find(where=where, limit=-1).rows_dataframe
+
+        # Handle the case where agg_map is None
+        if agg_map is None:
+            agg_map = {}
+
+        pk_fields = agg_map.get("first", []) + group_by_fields
+        list_fields = agg_map.get("list", [])
+        if not list_fields:
+            list_fields = [a for a in df.columns if a not in pk_fields]
+
+        grouped_objs = defaultdict(list)
+        for _, row in df.iterrows():
+            pk = tuple(row[pk_fields])
+            grouped_objs[pk].append({k: row[k] for k in list_fields})
+        results = []
+        for pk, objs in grouped_objs.items():
+            top_obj = {k: v for k, v in zip(pk_fields, pk)}
+            top_obj[inlined_field] = objs
+            results.append(top_obj)
+        r = QueryResult(num_rows=len(results), rows=results)
+        return r
+
     @property
     def is_internal(self) -> bool:
         """
@@ -1062,7 +1117,7 @@ class Collection(Generic[DatabaseType]):
             multivalued = any(multivalueds)
             inlined = any(inlineds)
             if multivalued and False in multivalueds:
-                raise ValueError(f"Mixed list non list: {vs} // inferred= {multivalueds}")
+                logger.info(f"Mixed list non list: {vs} // inferred= {multivalueds}")
             # if not rngs:
             #    raise AssertionError(f"Empty rngs for {k} = {vs}")
             rng = rngs[0] if rngs else None
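
To illustrate the shape produced by the new default Collection.group_by added above, here is a hedged usage sketch (collection and field names are hypothetical):

    qr = collection.group_by(
        group_by_fields=["country"],
        inlined_field="cities",
        agg_map={"first": ["country_code"], "list": ["city", "population"]},
    )
    # each result row keeps the "first" and group-by fields at the top level
    # and nests the "list" fields, e.g.
    # {"country_code": "FR", "country": "France",
    #  "cities": [{"city": "Paris", "population": 2100000}, ...]}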
linkml_store/api/database.py

@@ -595,7 +595,31 @@ class Database(ABC, Generic[CollectionType]):
             sb.add_class(coll.target_class_name)
         return SchemaView(sb.schema)

-    def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
+    def validate_database(self, **kwargs) -> List["ValidationResult"]:
+        """
+        Validate the contents of the database.
+
+        As `iter_validate_database`, but returns a list of validation results.
+
+        :param kwargs:
+        :return:
+        """
+        return list(self.iter_validate_database(**kwargs))
+
+    def validate_database(self, **kwargs) -> List["ValidationResult"]:
+        """
+        Validate the contents of the database.
+
+        As `iter_validate_database`, but returns a list of validation results.
+
+        :param kwargs:
+        :return:
+        """
+        return list(self.iter_validate_database(**kwargs))
+
+    def iter_validate_database(
+        self, ensure_referential_integrity: bool = None, **kwargs
+    ) -> Iterator["ValidationResult"]:
         """
         Validate the contents of the database.

@@ -635,12 +659,14 @@ class Database(ABC, Generic[CollectionType]):
            'capital' is a required property
            'continent' is a required proper

+        :param ensure_referential_integrity: ensure referential integrity
         :param kwargs:
         :return: iterator over validation results
         """
         for collection in self.list_collections():
             yield from collection.iter_validate_collection(**kwargs)
-        if self.metadata.ensure_referential_integrity:
+        if self.metadata.ensure_referential_integrity or ensure_referential_integrity:
+            logger.info(f"Validating referential integrity on {self.alias}")
             yield from self._validate_referential_integrity(**kwargs)

     def _validate_referential_integrity(self, **kwargs) -> Iterator["ValidationResult"]:
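
A brief usage sketch for the new list-returning wrapper and the new flag (the db object and its contents are assumed):

    # returns a list instead of an iterator; the flag forces the referential-integrity pass
    results = db.validate_database(ensure_referential_integrity=True)
    for r in results:
        print(r)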
@@ -661,7 +687,9 @@ class Database(ABC, Generic[CollectionType]):
             induced_slots = sv.class_induced_slots(cd.name)
             slot_map = {s.name: s for s in induced_slots}
             # rmap = {s.name: s.range for s in induced_slots}
+            # map slot ranges to a collection where that range is stored
             sr_to_coll = {s.name: cmap.get(s.range, []) for s in induced_slots if s.range}
+            logger.debug(f"Validating referential integrity for {collection.target_class_name} // {sr_to_coll}")
             for obj in collection.find_iter():
                 for k, v in obj.items():
                     if k not in sr_to_coll:
linkml_store/api/stores/duckdb/duckdb_collection.py

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, Tuple

 import sqlalchemy as sqla
 from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
@@ -8,7 +8,7 @@ from sqlalchemy.sql.ddl import CreateTable

 from linkml_store.api import Collection
 from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
-from linkml_store.api.queries import Query
+from linkml_store.api.queries import Query, QueryResult
 from linkml_store.api.stores.duckdb.mappings import TMAP
 from linkml_store.utils.sql_utils import facet_count_sql

@@ -94,7 +94,9 @@ class DuckDBCollection(Collection):

     def query_facets(
         self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
-    ) -> Dict[str, Dict[str, int]]:
+    ) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
+        if facet_limit is None:
+            facet_limit = DEFAULT_FACET_LIMIT
         results = {}
         cd = self.class_definition()
         with self.parent.engine.connect() as conn:
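
The changed return annotation means each facet column (or column tuple) now maps to a list of (value, count) pairs rather than a nested dict; a hedged example of the expected shape (column name and counts are hypothetical):

    facets = collection.query_facets(facet_columns=["continent"], facet_limit=5)
    # e.g. {"continent": [("Europe", 44), ("Asia", 39), ("Africa", 33)]}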
@@ -143,6 +145,166 @@
             return True
         return False

+    def group_by(
+        self,
+        group_by_fields: List[str],
+        inlined_field="objects",
+        agg_map: Optional[Dict[str, str]] = None,
+        where: Optional[Dict] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Group objects in the collection by specified fields using SQLAlchemy.
+
+        This implementation leverages DuckDB's SQL capabilities for more efficient grouping.
+
+        :param group_by_fields: List of fields to group by
+        :param inlined_field: Field name to store aggregated objects
+        :param agg_map: Dictionary mapping aggregation types to fields
+        :param where: Filter conditions
+        :param kwargs: Additional arguments
+        :return: Query result containing grouped data
+        """
+        if isinstance(group_by_fields, str):
+            group_by_fields = [group_by_fields]
+
+        cd = self.class_definition()
+        if not cd:
+            logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}")
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+        # Check if the table exists
+        if not self.parent._table_exists(self.alias):
+            logger.debug(f"Table {self.alias} doesn't exist, falling back to parent implementation")
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+        # Get table definition
+        table = self._sqla_table(cd)
+        engine = self.parent.engine
+
+        # Create a SQLAlchemy select statement for groups
+        from sqlalchemy import select, func, and_, or_
+        group_cols = [table.c[field] for field in group_by_fields if field in table.columns.keys()]
+
+        if not group_cols:
+            logger.warning(f"None of the group_by fields {group_by_fields} found in table columns")
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+        stmt = select(*group_cols).distinct()
+
+        # Add where conditions if specified
+        if where:
+            conditions = []
+            for k, v in where.items():
+                if k in table.columns.keys():
+                    # Handle different operator types (dict values for operators)
+                    if isinstance(v, dict):
+                        for op, val in v.items():
+                            if op == "$gt":
+                                conditions.append(table.c[k] > val)
+                            elif op == "$gte":
+                                conditions.append(table.c[k] >= val)
+                            elif op == "$lt":
+                                conditions.append(table.c[k] < val)
+                            elif op == "$lte":
+                                conditions.append(table.c[k] <= val)
+                            elif op == "$ne":
+                                conditions.append(table.c[k] != val)
+                            elif op == "$in":
+                                conditions.append(table.c[k].in_(val))
+                            else:
+                                # Default to equality for unknown operators
+                                logger.warning(f"Unknown operator {op}, using equality")
+                                conditions.append(table.c[k] == val)
+                    else:
+                        # Direct equality comparison
+                        conditions.append(table.c[k] == v)
+
+            if conditions:
+                for condition in conditions:
+                    stmt = stmt.where(condition)
+
+        results = []
+        try:
+            with engine.connect() as conn:
+                # Get all distinct groups
+                group_result = conn.execute(stmt)
+                group_rows = list(group_result)
+
+                # For each group, get all objects
+                for group_row in group_rows:
+                    # Build conditions for this group
+                    group_conditions = []
+                    group_dict = {}
+
+                    for i, field in enumerate(group_by_fields):
+                        if field in table.columns.keys():
+                            value = group_row[i]
+                            group_dict[field] = value
+                            if value is None:
+                                group_conditions.append(table.c[field].is_(None))
+                            else:
+                                group_conditions.append(table.c[field] == value)
+
+                    # Get all rows for this group
+                    row_stmt = select(*table.columns)
+                    for condition in group_conditions:
+                        row_stmt = row_stmt.where(condition)
+
+                    # Add original where conditions
+                    if where:
+                        for k, v in where.items():
+                            if k in table.columns.keys():
+                                # Handle different operator types for the row query as well
+                                if isinstance(v, dict):
+                                    for op, val in v.items():
+                                        if op == "$gt":
+                                            row_stmt = row_stmt.where(table.c[k] > val)
+                                        elif op == "$gte":
+                                            row_stmt = row_stmt.where(table.c[k] >= val)
+                                        elif op == "$lt":
+                                            row_stmt = row_stmt.where(table.c[k] < val)
+                                        elif op == "$lte":
+                                            row_stmt = row_stmt.where(table.c[k] <= val)
+                                        elif op == "$ne":
+                                            row_stmt = row_stmt.where(table.c[k] != val)
+                                        elif op == "$in":
+                                            row_stmt = row_stmt.where(table.c[k].in_(val))
+                                        else:
+                                            # Default to equality for unknown operators
+                                            row_stmt = row_stmt.where(table.c[k] == val)
+                                else:
+                                    # Direct equality comparison
+                                    row_stmt = row_stmt.where(table.c[k] == v)
+
+                    row_result = conn.execute(row_stmt)
+                    rows = list(row_result)
+
+                    # Convert rows to dictionaries
+                    objects = []
+                    for row in rows:
+                        obj = {}
+                        for i, col in enumerate(row._fields):
+                            obj[col] = row[i]
+                        objects.append(obj)
+
+                    # Apply agg_map to filter fields if specified
+                    if agg_map and "list" in agg_map:
+                        list_fields = agg_map["list"]
+                        if list_fields:
+                            objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]
+
+                    # Create the result object
+                    result_obj = group_dict.copy()
+                    result_obj[inlined_field] = objects
+                    results.append(result_obj)
+
+            return QueryResult(num_rows=len(results), rows=results)
+        except Exception as e:
+            logger.warning(f"Error in DuckDB group_by: {e}")
+            # Fall back to parent implementation
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
     def _create_table(self, cd: ClassDefinition):
         if self._table_created or self.metadata.is_prepopulated:
             logger.info(f"Already have table for: {cd.name}")
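
The DuckDB path above translates Mongo-style operators in the where filter into SQLAlchemy conditions before grouping; a hedged example of a call that would exercise it (field names and values are hypothetical):

    qr = duckdb_collection.group_by(
        group_by_fields=["species"],
        where={"age": {"$gte": 2}, "status": "active"},
    )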
linkml_store/api/stores/duckdb/duckdb_database.py

@@ -100,9 +100,9 @@ class DuckDBDatabase(Database):
             meta_query = Query(
                 from_table="sqlite_master",
                 where_clause={
-                    #"type": "table",
+                    # "type": "table",
                     "name": table,
-                }
+                },
             )
         else:
             if table.startswith("information_schema"):
@@ -112,7 +112,7 @@ class DuckDBDatabase(Database):
                 where_clause={
                     "table_type": "BASE TABLE",
                     "table_name": table,
-                }
+                },
             )

         qr = self.query(meta_query)
linkml_store/api/stores/filesystem/__init__.py

@@ -4,7 +4,7 @@ Adapter for FileSystem wrapper
 Handles have the form:

 - ``file:<path>`` for a local file
- """
+ """

 from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
 from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
linkml_store/api/stores/mongodb/mongodb_collection.py

@@ -41,13 +41,14 @@ class MongoDBCollection(Collection):
                 del obj["_id"]
         self._post_insert_hook(objs)

-
-    def index(self,
-              objs: Union[OBJECT, List[OBJECT]],
-              index_name: Optional[str] = None,
-              replace: bool = False,
-              unique: bool = False,
-              **kwargs):
+    def index(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        index_name: Optional[str] = None,
+        replace: bool = False,
+        unique: bool = False,
+        **kwargs,
+    ):
         """
         Create indexes on the collection.

@@ -86,11 +87,13 @@ class MongoDBCollection(Collection):
         else:
             logging.debug(f"Index already exists for field {obj}, skipping creation.")

-    def upsert(self,
-               objs: Union[OBJECT, List[OBJECT]],
-               filter_fields: List[str],
-               update_fields: Optional[List[str]] = None,
-               **kwargs):
+    def upsert(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        filter_fields: List[str],
+        update_fields: Optional[List[str]] = None,
+        **kwargs,
+    ):
         """
         Upsert one or more documents into the MongoDB collection.

@@ -164,6 +167,8 @@ class MongoDBCollection(Collection):
         facet_limit=DEFAULT_FACET_LIMIT,
         **kwargs,
     ) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
+        if facet_limit is None:
+            facet_limit = DEFAULT_FACET_LIMIT
         results = {}
         if not facet_columns:
             facet_columns = list(self.class_definition().attributes.keys())
@@ -260,3 +265,101 @@ class MongoDBCollection(Collection):
         if deleted_rows_count == 0 and not missing_ok:
             raise ValueError(f"No rows found for {where}")
         return deleted_rows_count
+
+    def group_by(
+        self,
+        group_by_fields: List[str],
+        inlined_field="objects",
+        agg_map: Optional[Dict[str, str]] = None,
+        where: Optional[Dict] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Group objects in the collection by specified fields using MongoDB's aggregation pipeline.
+
+        This implementation leverages MongoDB's native aggregation capabilities for efficient grouping.
+
+        :param group_by_fields: List of fields to group by
+        :param inlined_field: Field name to store aggregated objects
+        :param agg_map: Dictionary mapping aggregation types to fields
+        :param where: Filter conditions
+        :param kwargs: Additional arguments
+        :return: Query result containing grouped data
+        """
+        if isinstance(group_by_fields, str):
+            group_by_fields = [group_by_fields]
+
+        # Build the group key for MongoDB
+        if len(group_by_fields) == 1:
+            # Single field grouping
+            group_id = f"${group_by_fields[0]}"
+        else:
+            # Multi-field grouping
+            group_id = {field: f"${field}" for field in group_by_fields}
+
+        # Start building the pipeline
+        pipeline = []
+
+        # Add match stage if where clause is provided
+        if where:
+            pipeline.append({"$match": where})
+
+        # Add the group stage
+        group_stage = {
+            "$group": {
+                "_id": group_id,
+                "objects": {"$push": "$$ROOT"}
+            }
+        }
+        pipeline.append(group_stage)
+
+        # Execute the aggregation
+        logger.debug(f"MongoDB group_by pipeline: {pipeline}")
+        aggregation_results = list(self.mongo_collection.aggregate(pipeline))
+
+        # Transform the results to match the expected format
+        results = []
+        for result in aggregation_results:
+            # Skip null groups if needed
+            if result["_id"] is None and kwargs.get("skip_nulls", False):
+                continue
+
+            # Create the group object
+            if isinstance(result["_id"], dict):
+                # Multi-field grouping
+                group_obj = result["_id"]
+            else:
+                # Single field grouping
+                group_obj = {group_by_fields[0]: result["_id"]}
+
+            # Add the grouped objects
+            objects = result["objects"]
+
+            # Remove MongoDB _id field from each object
+            for obj in objects:
+                if "_id" in obj:
+                    del obj["_id"]
+
+            # Apply any field selection or transformations based on agg_map
+            if agg_map:
+                # Get first fields (fields to keep as single values)
+                first_fields = agg_map.get("first", [])
+                if first_fields:
+                    # These are already in the group_obj from the _id
+                    pass
+
+                # Get list fields (fields to aggregate as lists)
+                list_fields = agg_map.get("list", [])
+                if list_fields:
+                    # Filter objects to only include specified fields
+                    objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]
+                elif not list_fields and first_fields:
+                    # If list_fields is empty but first_fields is specified,
+                    # filter out first_fields from objects to avoid duplication
+                    objects = [{k: v for k, v in obj.items() if k not in first_fields} for obj in objects]
+
+            # Add the objects to the group
+            group_obj[inlined_field] = objects
+            results.append(group_obj)
+
+        return QueryResult(num_rows=len(results), rows=results)
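
For reference, the pipeline assembled above for a two-field grouping with a filter would look roughly like this (field names and filter values are hypothetical):

    pipeline = [
        {"$match": {"status": "active"}},
        {"$group": {"_id": {"country": "$country", "year": "$year"},
                    "objects": {"$push": "$$ROOT"}}},
    ]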
linkml_store/api/stores/mongodb/mongodb_database.py

@@ -41,8 +41,9 @@ class MongoDBDatabase(Database):
         if self.handle:
             parsed_url = urlparse(self.handle)
             path_parts = parsed_url.path.lstrip("/").split("?")[0].split("/")
-            print(path_parts)
             db_name = path_parts[0] if path_parts else "default"
+            if not db_name:
+                db_name = self.alias
         else:
             db_name = "default"
         return db_name
linkml_store/api/stores/solr/solr_collection.py

@@ -62,12 +62,18 @@ class SolrCollection(Collection):
         return QueryResult(query=query, num_rows=num_rows, rows=rows)

     def query_facets(
-        self, where: Optional[Dict] = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+        self,
+        where: Optional[Dict] = None,
+        facet_columns: List[str] = None,
+        facet_limit=DEFAULT_FACET_LIMIT,
+        facet_min_count: int = 1,
+        **kwargs,
     ) -> Dict[str, Dict[str, int]]:
         solr_query = self._build_solr_query(where)
         solr_query["facet"] = "true"
         solr_query["facet.field"] = facet_columns
         solr_query["facet.limit"] = facet_limit
+        solr_query["facet.mincount"] = facet_min_count

         logger.info(f"Querying Solr collection {self.alias} for facets with query: {solr_query}")
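
With the new facet_min_count parameter, the Solr request parameters assembled above end up looking roughly like this (column name and values are hypothetical):

    solr_query = {
        "facet": "true",
        "facet.field": ["category"],
        "facet.limit": 20,
        "facet.mincount": 1,
    }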