linkml-store 0.0.0__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of linkml-store might be problematic; see the registry's advisory page for more details.

Files changed (37)
  1. linkml_store/api/__init__.py +2 -2
  2. linkml_store/api/client.py +113 -8
  3. linkml_store/api/collection.py +272 -34
  4. linkml_store/api/config.py +101 -0
  5. linkml_store/api/database.py +282 -18
  6. linkml_store/api/queries.py +12 -1
  7. linkml_store/api/stores/chromadb/__init__.py +3 -0
  8. linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
  9. linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
  10. linkml_store/api/stores/duckdb/__init__.py +7 -0
  11. linkml_store/api/stores/duckdb/duckdb_collection.py +47 -14
  12. linkml_store/api/stores/duckdb/duckdb_database.py +38 -47
  13. linkml_store/api/stores/hdf5/__init__.py +0 -0
  14. linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
  15. linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
  16. linkml_store/api/stores/mongodb/mongodb_collection.py +92 -40
  17. linkml_store/api/stores/mongodb/mongodb_database.py +58 -67
  18. linkml_store/api/stores/solr/__init__.py +3 -0
  19. linkml_store/api/stores/solr/solr_collection.py +133 -0
  20. linkml_store/api/stores/solr/solr_database.py +83 -0
  21. linkml_store/api/stores/solr/solr_utils.py +0 -0
  22. linkml_store/cli.py +369 -0
  23. linkml_store/index/__init__.py +33 -0
  24. linkml_store/index/implementations/{llm_index.py → llm_indexer.py} +2 -2
  25. linkml_store/index/implementations/{simple_index.py → simple_indexer.py} +6 -3
  26. linkml_store/index/{index.py → indexer.py} +7 -4
  27. linkml_store/utils/format_utils.py +93 -0
  28. linkml_store/utils/object_utils.py +81 -0
  29. linkml_store/utils/sql_utils.py +46 -7
  30. {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/METADATA +17 -6
  31. linkml_store-0.1.7.dist-info/RECORD +42 -0
  32. linkml_store-0.1.7.dist-info/entry_points.txt +3 -0
  33. linkml_store/api/metadata.py +0 -5
  34. linkml_store-0.0.0.dist-info/RECORD +0 -29
  35. linkml_store-0.0.0.dist-info/entry_points.txt +0 -3
  36. {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/LICENSE +0 -0
  37. {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/WHEEL +0 -0
@@ -0,0 +1,101 @@
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
+ class CollectionConfig(BaseModel):
7
+ name: Optional[str] = Field(
8
+ default=None,
9
+ description="An optional name for the collection",
10
+ )
11
+ alias: Optional[str] = Field(
12
+ default=None,
13
+ description="An optional alias for the collection",
14
+ )
15
+ type: Optional[str] = Field(
16
+ default=None,
17
+ description="The type of object in the collection. TODO; use this instead of name",
18
+ )
19
+ metadata: Optional[Dict] = Field(
20
+ default=None,
21
+ description="Optional metadata for the collection",
22
+ )
23
+ attributes: Optional[Dict[str, Dict]] = Field(
24
+ default=None,
25
+ description="Optional attributes for the collection, following LinkML schema",
26
+ )
27
+ indexers: Optional[Dict[str, Dict]] = Field(
28
+ default=None,
29
+ description="Optional configuration for indexers",
30
+ )
31
+ hidden: Optional[bool] = Field(
32
+ default=False,
33
+ description="Whether the collection is hidden",
34
+ )
35
+ is_prepopulated: Optional[bool] = Field(
36
+ default=False,
37
+ description="Whether the collection is prepopulated",
38
+ )
39
+
40
+
41
+ class DatabaseConfig(BaseModel):
42
+ handle: str = Field(
43
+ default="duckdb:///:memory:",
44
+ description="The database handle, e.g., 'duckdb:///:memory:' or 'mongodb://localhost:27017'",
45
+ )
46
+ alias: Optional[str] = Field(
47
+ default=None,
48
+ description="An optional alias for the database",
49
+ )
50
+ schema_location: Optional[str] = Field(
51
+ default=None,
52
+ description="The location of the schema file, either a path on disk or URL",
53
+ )
54
+ schema_dict: Optional[Dict[str, Any]] = Field(
55
+ default=None,
56
+ description="The LinkML schema as a dictionary",
57
+ )
58
+ collections: Dict[str, CollectionConfig] = Field(
59
+ default={},
60
+ description="A dictionary of collection configurations",
61
+ )
62
+ recreate_if_exists: bool = Field(
63
+ default=False,
64
+ description="Whether to recreate the database if it already exists",
65
+ )
66
+ collection_type_slot: Optional[str] = Field(
67
+ default=None,
68
+ description=(
69
+ "For databases that combine multiple collections into a single space, this field"
70
+ "specifies the field that contains the collection type. An example of this is a Solr"
71
+ "index that does not use cores for collections, and instead uses a single global"
72
+ "document space; if this has a field 'document_type', then this field should be set"
73
+ ),
74
+ )
75
+ searchable_slots: Optional[List[str]] = Field(
76
+ default=None,
77
+ description="Optional configuration for search fields",
78
+ )
79
+ ensure_referential_integrity: bool = Field(
80
+ default=False,
81
+ description="Whether to ensure referential integrity",
82
+ )
83
+
84
+
85
+ class ClientConfig(BaseModel):
86
+ handle: Optional[str] = Field(
87
+ default=None,
88
+ description="The client handle",
89
+ )
90
+ databases: Dict[str, DatabaseConfig] = Field(
91
+ default={},
92
+ description="A dictionary of database configurations",
93
+ )
94
+ schema_path: Optional[str] = Field(
95
+ default=None,
96
+ description="The path to the LinkML schema file",
97
+ )
98
+ base_dir: Optional[str] = Field(
99
+ default=None,
100
+ description="The base directory for the client",
101
+ )
@@ -1,15 +1,28 @@
1
+ import logging
1
2
  from abc import ABC
2
- from dataclasses import dataclass
3
- from typing import Dict, Optional, Sequence
3
+ from collections import defaultdict
4
+ from copy import copy
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, ClassVar, Dict, Iterator, Optional, Sequence, Type, Union
7
+
8
+ try:
9
+ from linkml.validator.report import Severity, ValidationResult
10
+ except ImportError:
11
+ ValidationResult = None
4
12
 
5
13
  from linkml_runtime import SchemaView
14
+ from linkml_runtime.linkml_model import ClassDefinition, SchemaDefinition
6
15
 
7
16
  from linkml_store.api.collection import Collection
8
- from linkml_store.api.metadata import MetaData
17
+ from linkml_store.api.config import CollectionConfig, DatabaseConfig
9
18
  from linkml_store.api.queries import Query, QueryResult
10
19
 
20
+ if TYPE_CHECKING:
21
+ from linkml_store.api.client import Client
22
+
23
+ logger = logging.getLogger(__name__)
24
+
11
25
 
12
- @dataclass
13
26
  class Database(ABC):
14
27
  """
15
28
  A Database provides access to named collections of data.
@@ -27,7 +40,7 @@ class Database(ABC):
27
40
  >>> db.get_collection("Person") == collection
28
41
  True
29
42
  >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
30
- >>> collection.add(objs)
43
+ >>> collection.insert(objs)
31
44
  >>> qr = collection.find()
32
45
  >>> len(qr.rows)
33
46
  2
@@ -43,10 +56,82 @@ class Database(ABC):
43
56
 
44
57
  """
45
58
 
46
- handle: Optional[str] = None
47
- recreate_if_exists: Optional[bool] = False
48
59
  _schema_view: Optional[SchemaView] = None
49
60
  _collections: Optional[Dict[str, Collection]] = None
61
+ parent: Optional["Client"] = None
62
+ metadata: Optional[DatabaseConfig] = None
63
+ collection_class: ClassVar[Optional[Type[Collection]]] = None
64
+
65
+ def __init__(self, handle: Optional[str] = None, metadata: Optional[DatabaseConfig] = None, **kwargs):
66
+ if metadata:
67
+ self.metadata = metadata
68
+ else:
69
+ self.metadata = DatabaseConfig(handle=handle, **kwargs)
70
+ if handle is not None and self.metadata.handle is not None and handle != self.metadata.handle:
71
+ raise ValueError(f"Handle mismatch: {handle} != {self.metadata.handle}")
72
+ self._initialize_schema()
73
+ self._initialize_collections()
74
+
75
+ def _initialize_schema(self, **kwargs):
76
+ db_config = self.metadata
77
+ if db_config.schema_location:
78
+ schema_location = db_config.schema_location.format(base_dir=self.parent.metadata.base_dir)
79
+ logger.info(f"Loading schema from: {schema_location}")
80
+ self.load_schema_view(schema_location)
81
+ if db_config.schema_dict:
82
+ schema_dict = copy(db_config.schema_dict)
83
+ if "id" not in schema_dict:
84
+ schema_dict["id"] = "tmp"
85
+ if "name" not in schema_dict:
86
+ schema_dict["name"] = "tmp"
87
+ self.set_schema_view(SchemaView(SchemaDefinition(**schema_dict)))
88
+
89
+ def from_config(self, db_config: DatabaseConfig, **kwargs):
90
+ """
91
+ Initialize a database from a configuration.
92
+
93
+ TODO: DEPRECATE
94
+
95
+ :param db_config: database configuration
96
+ :param kwargs: additional arguments
97
+ """
98
+ self.metadata = db_config
99
+ self._initialize_schema()
100
+ self._initialize_collections()
101
+ return self
102
+
103
+ def _initialize_collections(self):
104
+ for name, collection_config in self.metadata.collections.items():
105
+ alias = collection_config.alias
106
+ typ = collection_config.type
107
+ # if typ and alias is None:
108
+ # alias = name
109
+ # if typ is None:
110
+ # typ = name
111
+ # collection = self.create_collection(
112
+ # typ, alias=alias, metadata=collection_config.metadata
113
+ # )
114
+ if False and typ is not None:
115
+ if not alias:
116
+ alias = name
117
+ name = typ
118
+ if not collection_config.name:
119
+ collection_config.name = name
120
+ _collection = self.create_collection(name, alias=alias, metadata=collection_config)
121
+ if collection_config.attributes:
122
+ sv = self.schema_view
123
+ cd = ClassDefinition(name, attributes=collection_config.attributes)
124
+ sv.schema.classes[cd.name] = cd
125
+ sv.set_modified()
126
+ # assert collection.class_definition() is not None
127
+
128
+ @property
129
+ def recreate_if_exists(self) -> bool:
130
+ return self.metadata.recreate_if_exists
131
+
132
+ @property
133
+ def handle(self) -> str:
134
+ return self.metadata.handle
50
135
 
51
136
  def store(self, obj: Dict[str, str], **kwargs):
52
137
  """
@@ -55,13 +140,28 @@ class Database(ABC):
55
140
  :param obj: object to store
56
141
  :param kwargs: additional arguments
57
142
  """
143
+ sv = self.schema_view
144
+ roots = [c for c in sv.all_classes().values() if c.tree_root]
145
+ root = roots[0] if roots else None
58
146
  for k, v in obj.items():
147
+ if root:
148
+ slot = sv.induced_slot(k, root.name)
149
+ if not slot:
150
+ raise ValueError(f"Cannot determine type for {k}")
151
+ else:
152
+ slot = None
153
+ if isinstance(v, dict):
154
+ logger.debug(f"Coercing dict to list: {v}")
155
+ v = [v]
59
156
  if not isinstance(v, list):
60
157
  continue
61
158
  if not v:
62
159
  continue
63
- collection = self.get_collection(k, create_if_not_exists=True)
64
- collection.add(v)
160
+ if slot:
161
+ collection = self.get_collection(slot.range, create_if_not_exists=True)
162
+ else:
163
+ collection = self.get_collection(k, create_if_not_exists=True)
164
+ collection.replace(v)
65
165
 
66
166
  def commit(self, **kwargs):
67
167
  """
@@ -75,8 +175,17 @@ class Database(ABC):
75
175
  """
76
176
  raise NotImplementedError()
77
177
 
178
+ @property
179
+ def _collection_class(self) -> Type[Collection]:
180
+ raise NotImplementedError()
181
+
78
182
  def create_collection(
79
- self, name: str, alias: Optional[str] = None, metadata: Optional[MetaData] = None, **kwargs
183
+ self,
184
+ name: str,
185
+ alias: Optional[str] = None,
186
+ metadata: Optional[CollectionConfig] = None,
187
+ recreate_if_exists=False,
188
+ **kwargs,
80
189
  ) -> Collection:
81
190
  """
82
191
  Create a new collection
@@ -91,11 +200,28 @@ class Database(ABC):
91
200
  :param name: name of the collection
92
201
  :param alias: alias for the collection
93
202
  :param metadata: metadata for the collection
203
+ :param recreate_if_exists: recreate the collection if it already exists
94
204
  :param kwargs: additional arguments
95
205
  """
96
- raise NotImplementedError()
97
-
98
- def list_collections(self) -> Sequence[Collection]:
206
+ if not name:
207
+ raise ValueError(f"Collection name must be provided: alias: {alias} metadata: {metadata}")
208
+ collection_cls = self.collection_class
209
+ collection = collection_cls(name=name, alias=alias, parent=self, metadata=metadata)
210
+ if metadata and metadata.attributes:
211
+ sv = self.schema_view
212
+ schema = sv.schema
213
+ cd = ClassDefinition(name=metadata.type, attributes=metadata.attributes)
214
+ schema.classes[cd.name] = cd
215
+ if not self._collections:
216
+ self._collections = {}
217
+ if not alias:
218
+ alias = name
219
+ self._collections[alias] = collection
220
+ if recreate_if_exists:
221
+ collection.delete_where({}, missing_ok=True)
222
+ return collection
223
+
224
+ def list_collections(self, include_internal=False) -> Sequence[Collection]:
99
225
  """
100
226
  List all collections.
101
227
 
@@ -112,10 +238,32 @@ class Database(ABC):
112
238
  >>> [c.name for c in collections]
113
239
  ['Person', 'Product']
114
240
 
241
+ :param include_internal: include internal collections
242
+ :return: list of collections
115
243
  """
116
244
  if not self._collections:
117
245
  self.init_collections()
118
- return list(self._collections.values())
246
+ return [c for c in self._collections.values() if include_internal or not c.is_internal]
247
+
248
+ def list_collection_names(self, **kwargs) -> Sequence[str]:
249
+ """
250
+ List all collection names.
251
+
252
+ Examples
253
+ --------
254
+ >>> from linkml_store.api.client import Client
255
+ >>> client = Client()
256
+ >>> db = client.attach_database("duckdb", alias="test")
257
+ >>> c1 = db.create_collection("Person")
258
+ >>> c2 = db.create_collection("Product")
259
+ >>> collection_names = db.list_collection_names()
260
+ >>> len(collection_names)
261
+ 2
262
+ >>> collection_names
263
+ ['Person', 'Product']
264
+
265
+ """
266
+ return [c.name for c in self.list_collections(**kwargs)]
119
267
 
120
268
  def get_collection(self, name: str, create_if_not_exists=True, **kwargs) -> "Collection":
121
269
  """
@@ -140,7 +288,7 @@ class Database(ABC):
140
288
  """
141
289
  if not self._collections:
142
290
  self.init_collections()
143
- if name not in self._collections:
291
+ if name not in self._collections.keys():
144
292
  if create_if_not_exists:
145
293
  self._collections[name] = self.create_collection(name)
146
294
  else:
@@ -167,7 +315,7 @@ class Database(ABC):
167
315
  >>> client = Client()
168
316
  >>> db = client.attach_database("duckdb", alias="test")
169
317
  >>> collection = db.create_collection("Person")
170
- >>> collection.add([{"id": "P1", "name": "John"}, {"id": "P2", "name": "Alice"}])
318
+ >>> collection.insert([{"id": "P1", "name": "John"}, {"id": "P2", "name": "Alice"}])
171
319
  >>> query = Query(from_table="Person", where_clause={"name": "John"})
172
320
  >>> result = db.query(query)
173
321
  >>> len(result.rows)
@@ -187,12 +335,64 @@ class Database(ABC):
187
335
  """
188
336
  Return a schema view for the named collection
189
337
  """
338
+ if not self._schema_view:
339
+ self._initialize_schema()
190
340
  if not self._schema_view:
191
341
  self._schema_view = self.induce_schema_view()
192
342
  return self._schema_view
193
343
 
194
- def set_schema_view(self, schema_view: SchemaView):
344
+ def set_schema_view(self, schema_view: Union[str, Path, SchemaView]):
345
+ """
346
+ Set the schema view for the database.
347
+
348
+ :param schema_view:
349
+ :return:
350
+ """
351
+ if isinstance(schema_view, Path):
352
+ schema_view = str(schema_view)
353
+ if isinstance(schema_view, str):
354
+ schema_view = SchemaView(schema_view)
195
355
  self._schema_view = schema_view
356
+ if not self._collections:
357
+ return
358
+ # align with induced schema
359
+ roots = [c for c in schema_view.all_classes().values() if c.tree_root]
360
+ if len(roots) == 0:
361
+ all_ranges = set()
362
+ for cn in schema_view.all_classes():
363
+ for slot in schema_view.class_induced_slots(cn):
364
+ if slot.range:
365
+ all_ranges.add(slot.range)
366
+ roots = [
367
+ c
368
+ for c in schema_view.all_classes().values()
369
+ if not all_ranges.intersection(schema_view.class_ancestors(c.name, reflexive=True))
370
+ ]
371
+ if len(roots) == 1:
372
+ root = roots[0]
373
+ for slot in schema_view.class_induced_slots(root.name):
374
+ inlined = slot.inlined or slot.inlined_as_list
375
+ if inlined and slot.range:
376
+ if slot.name in self._collections:
377
+ coll = self._collections[slot.name]
378
+ if not coll.metadata.type:
379
+ coll.metadata.type = slot.range
380
+
381
+ def load_schema_view(self, path: Union[str, Path]):
382
+ """
383
+ Load a schema view from a file.
384
+
385
+ >>> from linkml_store.api.client import Client
386
+ >>> client = Client()
387
+ >>> db = client.attach_database("duckdb", alias="test")
388
+ >>> db.load_schema_view("tests/input/countries/countries.linkml.yaml")
389
+
390
+ :param path:
391
+ :return:
392
+ """
393
+ if isinstance(path, Path):
394
+ path = str(path)
395
+ self.set_schema_view(SchemaView(path))
196
396
 
197
397
  def induce_schema_view(self) -> SchemaView:
198
398
  """
@@ -203,13 +403,77 @@ class Database(ABC):
203
403
  >>> client = Client()
204
404
  >>> db = client.attach_database("duckdb", alias="test")
205
405
  >>> collection = db.create_collection("Person")
206
- >>> collection.add([{"id": "P1", "name": "John", "age_in_years": 25},
406
+ >>> collection.insert([{"id": "P1", "name": "John", "age_in_years": 25},
207
407
  ... {"id": "P2", "name": "Alice", "age_in_years": 25}])
208
408
  >>> schema_view = db.induce_schema_view()
209
409
  >>> cd = schema_view.get_class("Person")
210
410
  >>> cd.attributes["id"].range
211
411
  'string'
412
+ >>> cd.attributes["age_in_years"].range
413
+ 'integer'
212
414
 
213
415
  :return: A schema view
214
416
  """
215
417
  raise NotImplementedError()
418
+
419
+ def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
420
+ """
421
+ Validate the contents of the database.
422
+
423
+ :param kwargs:
424
+ :return: iterator over validation results
425
+ """
426
+ for collection in self.list_collections():
427
+ yield from collection.iter_validate_collection(**kwargs)
428
+ if self.metadata.ensure_referential_integrity:
429
+ yield from self._validate_referential_integrity(**kwargs)
430
+
431
+ def _validate_referential_integrity(self, **kwargs) -> Iterator["ValidationResult"]:
432
+ """
433
+ Validate referential integrity of the database.
434
+
435
+ :param kwargs:
436
+ :return: iterator over validation results
437
+ """
438
+ sv = self.schema_view
439
+ cmap = defaultdict(list)
440
+ for collection in self.list_collections():
441
+ if not collection.target_class_name:
442
+ raise ValueError(f"Collection {collection.name} has no target class")
443
+ cmap[collection.target_class_name].append(collection)
444
+ for collection in self.list_collections():
445
+ cd = collection.class_definition()
446
+ induced_slots = sv.class_induced_slots(cd.name)
447
+ slot_map = {s.name: s for s in induced_slots}
448
+ # rmap = {s.name: s.range for s in induced_slots}
449
+ sr_to_coll = {s.name: cmap.get(s.range, []) for s in induced_slots if s.range}
450
+ for obj in collection.find_iter():
451
+ for k, v in obj.items():
452
+ if k not in sr_to_coll:
453
+ continue
454
+ ref_colls = sr_to_coll[k]
455
+ if not ref_colls:
456
+ continue
457
+ if not isinstance(v, (str, int)):
458
+ continue
459
+ slot = slot_map[k]
460
+ found = False
461
+ for ref_coll in ref_colls:
462
+ ref_obj = ref_coll.get_one(v)
463
+ if ref_obj:
464
+ found = True
465
+ break
466
+ if not found:
467
+ yield ValidationResult(
468
+ type="ReferentialIntegrity",
469
+ severity=Severity.ERROR,
470
+ message=f"Referential integrity error: {slot.range} not found",
471
+ instantiates=slot.range,
472
+ instance=v,
473
+ )
474
+
475
+ def drop(self, **kwargs):
476
+ """
477
+ Drop the database and all collections
478
+ """
479
+ raise NotImplementedError()
@@ -22,7 +22,7 @@ class Query(BaseModel):
22
22
 
23
23
  from_table: Optional[str]
24
24
  select_cols: Optional[List[str]] = None
25
- where_clause: Optional[Union[str, List[str], Dict[str, str]]] = None
25
+ where_clause: Optional[Union[str, List[str], Dict[str, Any]]] = None
26
26
  sort_by: Optional[List[str]] = None
27
27
  limit: Optional[int] = None
28
28
  offset: Optional[int] = None
@@ -30,12 +30,21 @@ class Query(BaseModel):
30
30
  facet_slots: Optional[List[str]] = None
31
31
 
32
32
 
33
+ class FacetCountResult(BaseModel):
34
+ """
35
+ A facet count result
36
+ """
37
+
38
+ as_dict: Dict[FACET_GROUP, List[Tuple[FACET_GROUP, int]]]
39
+
40
+
33
41
  class QueryResult(BaseModel):
34
42
  """
35
43
  A query result
36
44
  """
37
45
 
38
46
  query: Optional[Query] = None
47
+ search_term: Optional[str] = None
39
48
  num_rows: int
40
49
  offset: Optional[int] = 0
41
50
  rows: Optional[List[Dict[str, Any]]] = None
@@ -45,6 +54,8 @@ class QueryResult(BaseModel):
45
54
 
46
55
  @property
47
56
  def rows_dataframe(self) -> pd.DataFrame:
57
+ if self.ranked_rows is not None:
58
+ self._rows_dataframe = pd.DataFrame([{"score": score, **row} for score, row in self.ranked_rows])
48
59
  if self._rows_dataframe is None and self.rows:
49
60
  self._rows_dataframe = pd.DataFrame(self.rows)
50
61
  return self._rows_dataframe
@@ -0,0 +1,3 @@
1
+ """
2
+ Support for ChromaDB is experimental.
3
+ """
@@ -0,0 +1,121 @@
1
+ """
2
+ ChromaDB Collection
3
+ """
4
+
5
+ import logging
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
7
+
8
+ from chromadb.api.models.Collection import Collection as ChromaCollection
9
+ from linkml_runtime.linkml_model import SlotDefinition
10
+
11
+ from linkml_store.api import Collection
12
+ from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
13
+ from linkml_store.api.queries import Query, QueryResult
14
+ from linkml_store.index import Indexer
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class ChromaDBCollection(Collection):
20
+ """
21
+ A wrapper for ChromaDB collections.
22
+ """
23
+
24
+ @property
25
+ def native_collection(self) -> ChromaCollection:
26
+ return self.parent.client.get_collection(self.name)
27
+
28
+ def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
29
+ if not isinstance(objs, list):
30
+ objs = [objs]
31
+
32
+ documents = []
33
+ metadatas = []
34
+ ids = []
35
+ indexer = Indexer()
36
+
37
+ for obj in objs:
38
+ obj_id = self.object_identifier(obj)
39
+ ids.append(obj_id)
40
+ doc_text = indexer.object_to_text(obj)
41
+ documents.append(doc_text)
42
+ # TODO: handle nesting
43
+ metadata = {k: v for k, v in obj.items()}
44
+ metadatas.append(metadata)
45
+
46
+ self.native_collection.add(
47
+ documents=documents,
48
+ metadatas=metadatas,
49
+ ids=ids,
50
+ )
51
+
52
+ def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
53
+ if not isinstance(objs, list):
54
+ objs = [objs]
55
+ ids = [obj["id"] for obj in objs]
56
+ self.native_collection.delete(ids=ids)
57
+ return len(ids)
58
+
59
+ def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
60
+ logger.info(f"Deleting from {self.target_class_name} where: {where}")
61
+ if where is None:
62
+ where = {}
63
+ results = self.native_collection.get(where=where)
64
+ ids = [result["id"] for result in results]
65
+ self.native_collection.delete(ids=ids)
66
+ return len(ids)
67
+
68
+ def query(self, query: Query, **kwargs) -> QueryResult:
69
+ chroma_filter = self._build_chroma_filter(query.where_clause)
70
+ if query.limit:
71
+ results = self.native_collection.get(where=chroma_filter, limit=query.limit)
72
+ else:
73
+ results = self.native_collection.get(where=chroma_filter)
74
+
75
+ count = len(results)
76
+ return QueryResult(query=query, num_rows=count, rows=results)
77
+
78
+ def query_facets(
79
+ self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
80
+ ) -> Dict[str, List[Tuple[Any, int]]]:
81
+ results = {}
82
+ cd = self.class_definition()
83
+ if not facet_columns:
84
+ facet_columns = list(self.class_definition().attributes.keys())
85
+
86
+ for col in facet_columns:
87
+ logger.debug(f"Faceting on {col}")
88
+ if isinstance(col, tuple):
89
+ sd = SlotDefinition(name="PLACEHOLDER")
90
+ else:
91
+ sd = cd.attributes[col]
92
+
93
+ if sd.multivalued:
94
+ facet_results = self.native_collection.aggregate(
95
+ aggregation=[
96
+ {"$match": where} if where else {"$match": {}},
97
+ {"$unwind": f"${col}"},
98
+ {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
99
+ {"$sort": {"count": -1}},
100
+ {"$limit": facet_limit},
101
+ ]
102
+ )
103
+ else:
104
+ facet_results = self.native_collection.aggregate(
105
+ aggregation=[
106
+ {"$match": where} if where else {"$match": {}},
107
+ {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
108
+ {"$sort": {"count": -1}},
109
+ {"$limit": facet_limit},
110
+ ]
111
+ )
112
+
113
+ results[col] = [(result["_id"], result["count"]) for result in facet_results]
114
+
115
+ return results
116
+
117
+ def _build_chroma_filter(self, where_clause: Dict[str, Any]) -> Dict[str, Any]:
118
+ chroma_filter = {}
119
+ for field, value in where_clause.items():
120
+ chroma_filter[field] = value
121
+ return chroma_filter