linkml-store 0.0.0__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
- linkml_store/api/__init__.py +2 -2
- linkml_store/api/client.py +108 -7
- linkml_store/api/collection.py +221 -30
- linkml_store/api/config.py +97 -0
- linkml_store/api/database.py +207 -17
- linkml_store/api/queries.py +12 -1
- linkml_store/api/stores/chromadb/__init__.py +0 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +114 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +47 -14
- linkml_store/api/stores/duckdb/duckdb_database.py +35 -44
- linkml_store/api/stores/hdf5/__init__.py +0 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +86 -40
- linkml_store/api/stores/mongodb/mongodb_database.py +58 -67
- linkml_store/api/stores/solr/solr_collection.py +132 -0
- linkml_store/api/stores/solr/solr_database.py +82 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/cli.py +369 -0
- linkml_store/index/__init__.py +33 -0
- linkml_store/index/implementations/{llm_index.py → llm_indexer.py} +2 -2
- linkml_store/index/implementations/{simple_index.py → simple_indexer.py} +6 -3
- linkml_store/index/{index.py → indexer.py} +7 -4
- linkml_store/utils/format_utils.py +93 -0
- linkml_store/utils/object_utils.py +73 -0
- linkml_store/utils/sql_utils.py +46 -7
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.6.dist-info}/METADATA +17 -6
- linkml_store-0.1.6.dist-info/RECORD +41 -0
- linkml_store-0.1.6.dist-info/entry_points.txt +3 -0
- linkml_store/api/metadata.py +0 -5
- linkml_store-0.0.0.dist-info/RECORD +0 -29
- linkml_store-0.0.0.dist-info/entry_points.txt +0 -3
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.6.dist-info}/LICENSE +0 -0
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.6.dist-info}/WHEEL +0 -0
linkml_store/api/database.py
CHANGED
@@ -1,15 +1,27 @@
+import logging
 from abc import ABC
-from
-from
+from copy import copy
+from pathlib import Path
+from typing import TYPE_CHECKING, ClassVar, Dict, Iterator, Optional, Sequence, Type, Union
+
+try:
+    from linkml.validator.report import ValidationResult
+except ImportError:
+    ValidationResult = None

 from linkml_runtime import SchemaView
+from linkml_runtime.linkml_model import ClassDefinition, SchemaDefinition

 from linkml_store.api.collection import Collection
-from linkml_store.api.
+from linkml_store.api.config import CollectionConfig, DatabaseConfig
 from linkml_store.api.queries import Query, QueryResult

+if TYPE_CHECKING:
+    from linkml_store.api.client import Client
+
+logger = logging.getLogger(__name__)
+

-@dataclass
 class Database(ABC):
     """
     A Database provides access to named collections of data.
@@ -27,7 +39,7 @@ class Database(ABC):
     >>> db.get_collection("Person") == collection
     True
     >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
-    >>> collection.
+    >>> collection.insert(objs)
     >>> qr = collection.find()
     >>> len(qr.rows)
     2
@@ -43,10 +55,82 @@

     """

-    handle: Optional[str] = None
-    recreate_if_exists: Optional[bool] = False
     _schema_view: Optional[SchemaView] = None
     _collections: Optional[Dict[str, Collection]] = None
+    parent: Optional["Client"] = None
+    metadata: Optional[DatabaseConfig] = None
+    collection_class: ClassVar[Optional[Type[Collection]]] = None
+
+    def __init__(self, handle: Optional[str] = None, metadata: Optional[DatabaseConfig] = None, **kwargs):
+        if metadata:
+            self.metadata = metadata
+        else:
+            self.metadata = DatabaseConfig(handle=handle, **kwargs)
+        if handle is not None and self.metadata.handle is not None and handle != self.metadata.handle:
+            raise ValueError(f"Handle mismatch: {handle} != {self.metadata.handle}")
+        self._initialize_schema()
+        self._initialize_collections()
+
+    def _initialize_schema(self, **kwargs):
+        db_config = self.metadata
+        if db_config.schema_location:
+            schema_location = db_config.schema_location.format(base_dir=self.parent.metadata.base_dir)
+            logger.info(f"Loading schema from: {schema_location}")
+            self.load_schema_view(schema_location)
+        if db_config.schema_dict:
+            schema_dict = copy(db_config.schema_dict)
+            if "id" not in schema_dict:
+                schema_dict["id"] = "tmp"
+            if "name" not in schema_dict:
+                schema_dict["name"] = "tmp"
+            self.set_schema_view(SchemaView(SchemaDefinition(**schema_dict)))
+
+    def from_config(self, db_config: DatabaseConfig, **kwargs):
+        """
+        Initialize a database from a configuration.
+
+        TODO: DEPRECATE
+
+        :param db_config: database configuration
+        :param kwargs: additional arguments
+        """
+        self.metadata = db_config
+        self._initialize_schema()
+        self._initialize_collections()
+        return self
+
+    def _initialize_collections(self):
+        for name, collection_config in self.metadata.collections.items():
+            alias = collection_config.alias
+            typ = collection_config.type
+            # if typ and alias is None:
+            #     alias = name
+            # if typ is None:
+            #     typ = name
+            # collection = self.create_collection(
+            #     typ, alias=alias, metadata=collection_config.metadata
+            # )
+            if False and typ is not None:
+                if not alias:
+                    alias = name
+                name = typ
+            if not collection_config.name:
+                collection_config.name = name
+            _collection = self.create_collection(name, alias=alias, metadata=collection_config)
+            if collection_config.attributes:
+                sv = self.schema_view
+                cd = ClassDefinition(name, attributes=collection_config.attributes)
+                sv.schema.classes[cd.name] = cd
+                sv.set_modified()
+            # assert collection.class_definition() is not None
+
+    @property
+    def recreate_if_exists(self) -> bool:
+        return self.metadata.recreate_if_exists
+
+    @property
+    def handle(self) -> str:
+        return self.metadata.handle

     def store(self, obj: Dict[str, str], **kwargs):
         """
@@ -55,13 +139,28 @@
         :param obj: object to store
         :param kwargs: additional arguments
         """
+        sv = self.schema_view
+        roots = [c for c in sv.all_classes().values() if c.tree_root]
+        root = roots[0] if roots else None
         for k, v in obj.items():
+            if root:
+                slot = sv.induced_slot(k, root.name)
+                if not slot:
+                    raise ValueError(f"Cannot determine type for {k}")
+            else:
+                slot = None
+            if isinstance(v, dict):
+                logger.debug(f"Coercing dict to list: {v}")
+                v = [v]
             if not isinstance(v, list):
                 continue
             if not v:
                 continue
-
-
+            if slot:
+                collection = self.get_collection(slot.range, create_if_not_exists=True)
+            else:
+                collection = self.get_collection(k, create_if_not_exists=True)
+            collection.replace(v)

     def commit(self, **kwargs):
         """
@@ -75,8 +174,17 @@
         """
         raise NotImplementedError()

+    @property
+    def _collection_class(self) -> Type[Collection]:
+        raise NotImplementedError()
+
     def create_collection(
-        self,
+        self,
+        name: str,
+        alias: Optional[str] = None,
+        metadata: Optional[CollectionConfig] = None,
+        recreate_if_exists=False,
+        **kwargs,
     ) -> Collection:
         """
         Create a new collection
@@ -91,11 +199,29 @@
         :param name: name of the collection
         :param alias: alias for the collection
         :param metadata: metadata for the collection
+        :param recreate_if_exists: recreate the collection if it already exists
         :param kwargs: additional arguments
         """
-
-
-
+        if not name:
+            raise ValueError(f"Collection name must be provided: alias: {alias} metadata: {metadata}")
+        # collection_cls = self._collection_class
+        collection_cls = self.collection_class
+        collection = collection_cls(name=name, alias=alias, parent=self, metadata=metadata)
+        if metadata and metadata.attributes:
+            sv = self.schema_view
+            schema = sv.schema
+            cd = ClassDefinition(name=metadata.type, attributes=metadata.attributes)
+            schema.classes[cd.name] = cd
+        if not self._collections:
+            self._collections = {}
+        if not alias:
+            alias = name
+        self._collections[alias] = collection
+        if recreate_if_exists:
+            collection.delete_where({}, missing_ok=True)
+        return collection
+
+    def list_collections(self, include_internal=False) -> Sequence[Collection]:
         """
         List all collections.

@@ -112,10 +238,32 @@
         >>> [c.name for c in collections]
         ['Person', 'Product']

+        :param include_internal: include internal collections
+        :return: list of collections
         """
         if not self._collections:
             self.init_collections()
-        return
+        return [c for c in self._collections.values() if include_internal or not c.is_internal]
+
+    def list_collection_names(self, **kwargs) -> Sequence[str]:
+        """
+        List all collection names.
+
+        Examples
+        --------
+        >>> from linkml_store.api.client import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> c1 = db.create_collection("Person")
+        >>> c2 = db.create_collection("Product")
+        >>> collection_names = db.list_collection_names()
+        >>> len(collection_names)
+        2
+        >>> collection_names
+        ['Person', 'Product']
+
+        """
+        return [c.name for c in self.list_collections(**kwargs)]

     def get_collection(self, name: str, create_if_not_exists=True, **kwargs) -> "Collection":
         """
@@ -140,7 +288,7 @@
         """
         if not self._collections:
             self.init_collections()
-        if name not in self._collections:
+        if name not in self._collections.keys():
             if create_if_not_exists:
                 self._collections[name] = self.create_collection(name)
             else:
@@ -167,7 +315,7 @@
         >>> client = Client()
         >>> db = client.attach_database("duckdb", alias="test")
         >>> collection = db.create_collection("Person")
-        >>> collection.
+        >>> collection.insert([{"id": "P1", "name": "John"}, {"id": "P2", "name": "Alice"}])
         >>> query = Query(from_table="Person", where_clause={"name": "John"})
         >>> result = db.query(query)
         >>> len(result.rows)
@@ -187,13 +335,37 @@
         """
         Return a schema view for the named collection
         """
+        if not self._schema_view:
+            self._initialize_schema()
         if not self._schema_view:
             self._schema_view = self.induce_schema_view()
         return self._schema_view

     def set_schema_view(self, schema_view: SchemaView):
+        """
+        Set the schema view for the database.
+
+        :param schema_view:
+        :return:
+        """
         self._schema_view = schema_view

+    def load_schema_view(self, path: Union[str, Path]):
+        """
+        Load a schema view from a file.
+
+        >>> from linkml_store.api.client import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> db.load_schema_view("tests/input/countries/countries.linkml.yaml")
+
+        :param path:
+        :return:
+        """
+        if isinstance(path, Path):
+            path = str(path)
+        self.set_schema_view(SchemaView(path))
+
     def induce_schema_view(self) -> SchemaView:
         """
         Induce a schema view from a schema definition.
@@ -203,13 +375,31 @@
         >>> client = Client()
         >>> db = client.attach_database("duckdb", alias="test")
         >>> collection = db.create_collection("Person")
-        >>> collection.
+        >>> collection.insert([{"id": "P1", "name": "John", "age_in_years": 25},
         ...                    {"id": "P2", "name": "Alice", "age_in_years": 25}])
         >>> schema_view = db.induce_schema_view()
         >>> cd = schema_view.get_class("Person")
         >>> cd.attributes["id"].range
         'string'
+        >>> cd.attributes["age_in_years"].range
+        'integer'

         :return: A schema view
         """
         raise NotImplementedError()
+
+    def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
+        """
+        Validate the contents of the database.
+
+        :param kwargs:
+        :return: iterator over validation results
+        """
+        for collection in self.list_collections():
+            yield from collection.iter_validate_collection(**kwargs)
+
+    def drop(self, **kwargs):
+        """
+        Drop the database and all collections
+        """
+        raise NotImplementedError()
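Taken together, the doctests in the hunks above sketch the reworked Database lifecycle: attach a backend, create a collection, insert objects, list collections, and induce a schema. A condensed, illustrative sketch of those same calls (duckdb backend; data values are examples only):

from linkml_store.api.client import Client

client = Client()
db = client.attach_database("duckdb", alias="test")
collection = db.create_collection("Person")
collection.insert([
    {"id": "P1", "name": "John", "age_in_years": 30},
    {"id": "P2", "name": "Alice", "age_in_years": 25},
])
db.list_collection_names()                               # ['Person']
sv = db.induce_schema_view()                             # schema inferred from the stored rows
sv.get_class("Person").attributes["age_in_years"].range  # 'integer'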
linkml_store/api/queries.py
CHANGED
@@ -22,7 +22,7 @@ class Query(BaseModel):

     from_table: Optional[str]
     select_cols: Optional[List[str]] = None
-    where_clause: Optional[Union[str, List[str], Dict[str,
+    where_clause: Optional[Union[str, List[str], Dict[str, Any]]] = None
     sort_by: Optional[List[str]] = None
     limit: Optional[int] = None
     offset: Optional[int] = None
@@ -30,12 +30,21 @@ class Query(BaseModel):
     facet_slots: Optional[List[str]] = None


+class FacetCountResult(BaseModel):
+    """
+    A facet count result
+    """
+
+    as_dict: Dict[FACET_GROUP, List[Tuple[FACET_GROUP, int]]]
+
+
 class QueryResult(BaseModel):
     """
     A query result
     """

     query: Optional[Query] = None
+    search_term: Optional[str] = None
     num_rows: int
     offset: Optional[int] = 0
     rows: Optional[List[Dict[str, Any]]] = None
@@ -45,6 +54,8 @@ class QueryResult(BaseModel):

     @property
     def rows_dataframe(self) -> pd.DataFrame:
+        if self.ranked_rows is not None:
+            self._rows_dataframe = pd.DataFrame([{"score": score, **row} for score, row in self.ranked_rows])
         if self._rows_dataframe is None and self.rows:
             self._rows_dataframe = pd.DataFrame(self.rows)
         return self._rows_dataframe
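The widened where_clause type and the new search_term field are easiest to see in use. A minimal sketch based only on the fields shown in this diff (values are illustrative):

from linkml_store.api.queries import Query, QueryResult

q = Query(from_table="Person", where_clause={"name": "John"}, limit=10)  # dict values of any type now allowed
qr = QueryResult(query=q, num_rows=1, rows=[{"id": "P1", "name": "John"}])
df = qr.rows_dataframe  # pandas DataFrame of rows; a "score" column is prepended when ranked_rows is populated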
File without changes
linkml_store/api/stores/chromadb/chromadb_collection.py
ADDED
@@ -0,0 +1,114 @@
+import logging
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from chromadb.api.models.Collection import Collection as ChromaCollection
+from linkml_runtime.linkml_model import SlotDefinition
+
+from linkml_store.api import Collection
+from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
+from linkml_store.api.queries import Query, QueryResult
+from linkml_store.index import Indexer
+
+logger = logging.getLogger(__name__)
+
+
+class ChromaDBCollection(Collection):
+
+    @property
+    def native_collection(self) -> ChromaCollection:
+        return self.parent.client.get_collection(self.name)
+
+    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+        if not isinstance(objs, list):
+            objs = [objs]
+
+        documents = []
+        metadatas = []
+        ids = []
+        indexer = Indexer()
+
+        for obj in objs:
+            obj_id = self.object_identifier(obj)
+            ids.append(obj_id)
+            doc_text = indexer.object_to_text(obj)
+            documents.append(doc_text)
+            # TODO: handle nesting
+            metadata = {k: v for k, v in obj.items()}
+            metadatas.append(metadata)
+
+        self.native_collection.add(
+            documents=documents,
+            metadatas=metadatas,
+            ids=ids,
+        )
+
+    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
+        if not isinstance(objs, list):
+            objs = [objs]
+        ids = [obj["id"] for obj in objs]
+        self.native_collection.delete(ids=ids)
+        return len(ids)
+
+    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
+        logger.info(f"Deleting from {self._target_class_name} where: {where}")
+        if where is None:
+            where = {}
+        results = self.native_collection.get(where=where)
+        ids = [result["id"] for result in results]
+        self.native_collection.delete(ids=ids)
+        return len(ids)
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        chroma_filter = self._build_chroma_filter(query.where_clause)
+        if query.limit:
+            results = self.native_collection.get(where=chroma_filter, limit=query.limit)
+        else:
+            results = self.native_collection.get(where=chroma_filter)
+
+        count = len(results)
+        return QueryResult(query=query, num_rows=count, rows=results)
+
+    def query_facets(
+        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+    ) -> Dict[str, List[Tuple[Any, int]]]:
+        results = {}
+        cd = self.class_definition()
+        if not facet_columns:
+            facet_columns = list(self.class_definition().attributes.keys())
+
+        for col in facet_columns:
+            logger.debug(f"Faceting on {col}")
+            if isinstance(col, tuple):
+                sd = SlotDefinition(name="PLACEHOLDER")
+            else:
+                sd = cd.attributes[col]
+
+            if sd.multivalued:
+                facet_results = self.native_collection.aggregate(
+                    aggregation=[
+                        {"$match": where} if where else {"$match": {}},
+                        {"$unwind": f"${col}"},
+                        {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
+                        {"$sort": {"count": -1}},
+                        {"$limit": facet_limit},
+                    ]
+                )
+            else:
+                facet_results = self.native_collection.aggregate(
+                    aggregation=[
+                        {"$match": where} if where else {"$match": {}},
+                        {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
+                        {"$sort": {"count": -1}},
+                        {"$limit": facet_limit},
+                    ]
+                )
+
+            results[col] = [(result["_id"], result["count"]) for result in facet_results]
+
+        return results
+
+    def _build_chroma_filter(self, where_clause: Dict[str, Any]) -> Dict[str, Any]:
+        chroma_filter = {}
+        for field, value in where_clause.items():
+            chroma_filter[field] = value
+        return chroma_filter
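For orientation, insert() above fans each object out into three parallel lists before a single native_collection.add() call. A purely illustrative trace for one object (the document string is a stand-in; the real text comes from Indexer.object_to_text(), whose format is not part of this diff):

obj = {"id": "P1", "name": "John", "age_in_years": 30}
ids = ["P1"]                 # self.object_identifier(obj)
documents = ["P1 John 30"]   # indexer.object_to_text(obj) -- placeholder rendering
metadatas = [dict(obj)]      # flat copy of the object; nested values are a noted TODO
# then: native_collection.add(documents=documents, metadatas=metadatas, ids=ids)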
linkml_store/api/stores/chromadb/chromadb_database.py
ADDED
@@ -0,0 +1,89 @@
+# chromadb_database.py
+
+import logging
+from typing import Optional
+
+import chromadb
+from chromadb.config import Settings
+from linkml_runtime import SchemaView
+from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
+from linkml_runtime.utils.schema_builder import SchemaBuilder
+
+from linkml_store.api import Database
+from linkml_store.api.queries import Query, QueryResult
+from linkml_store.api.stores.chromadb.chromadb_collection import ChromaDBCollection
+
+logger = logging.getLogger(__name__)
+
+
+class ChromaDBDatabase(Database):
+    _client: chromadb.Client = None
+    collection_class = ChromaDBCollection
+
+    def __init__(self, handle: Optional[str] = None, **kwargs):
+        if handle is None:
+            handle = ".chromadb"
+        super().__init__(handle=handle, **kwargs)
+
+    @property
+    def client(self) -> chromadb.Client:
+        if self._client is None:
+            self._client = chromadb.Client(
+                Settings(
+                    chroma_db_impl="duckdb+parquet",
+                    persist_directory=self.handle,
+                )
+            )
+        return self._client
+
+    def commit(self, **kwargs):
+        pass
+
+    def close(self, **kwargs):
+        if self._client:
+            self._client.close()
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        if query.from_table:
+            collection = self.get_collection(query.from_table)
+            return collection.query(query, **kwargs)
+
+    def init_collections(self):
+        if self._collections is None:
+            self._collections = {}
+
+        for collection_name in self.client.list_collections():
+            if collection_name not in self._collections:
+                collection = ChromaDBCollection(name=collection_name, parent=self)
+                self._collections[collection_name] = collection
+
+    def induce_schema_view(self) -> SchemaView:
+        logger.info(f"Inducing schema view for {self.handle}")
+        sb = SchemaBuilder()
+        schema = sb.schema
+
+        for collection_name in self.client.list_collections():
+            sb.add_class(collection_name)
+            chroma_collection = self.client.get_collection(collection_name)
+            sample_doc = chroma_collection.peek(1)
+            if sample_doc:
+                for field, value in sample_doc[0].items():
+                    if field == "_id":
+                        continue
+                    sd = SlotDefinition(field)
+                    if isinstance(value, list):
+                        sd.multivalued = True
+                    if isinstance(value, dict):
+                        sd.inlined = True
+                    sb.schema.classes[collection_name].attributes[sd.name] = sd
+
+        sb.add_defaults()
+        for cls_name in schema.classes:
+            if cls_name in self.metadata.collections:
+                collection_metadata = self.metadata.collections[cls_name]
+                if collection_metadata.attributes:
+                    del schema.classes[cls_name]
+                    cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
+                    schema.classes[cls.name] = cls
+
+        return SchemaView(schema)
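A hedged usage sketch for the new backend, assuming an existing Chroma store on disk; "my_chroma_dir" is a hypothetical persist directory, and nothing beyond what the diff shows is implied about how collections get created:

from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase

db = ChromaDBDatabase(handle="my_chroma_dir")   # handle doubles as the persist_directory
db.init_collections()                           # wraps each existing Chroma collection in a ChromaDBCollection
print(db.list_collection_names())
sv = db.induce_schema_view()                    # classes/slots guessed from one sample document per collection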