linkml-store 0.0.0__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of linkml-store might be problematic. Click here for more details.
- linkml_store/api/__init__.py +2 -2
- linkml_store/api/client.py +113 -8
- linkml_store/api/collection.py +272 -34
- linkml_store/api/config.py +101 -0
- linkml_store/api/database.py +282 -18
- linkml_store/api/queries.py +12 -1
- linkml_store/api/stores/chromadb/__init__.py +3 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/duckdb/__init__.py +7 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +47 -14
- linkml_store/api/stores/duckdb/duckdb_database.py +38 -47
- linkml_store/api/stores/hdf5/__init__.py +0 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +92 -40
- linkml_store/api/stores/mongodb/mongodb_database.py +58 -67
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +133 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/cli.py +369 -0
- linkml_store/index/__init__.py +33 -0
- linkml_store/index/implementations/{llm_index.py → llm_indexer.py} +2 -2
- linkml_store/index/implementations/{simple_index.py → simple_indexer.py} +6 -3
- linkml_store/index/{index.py → indexer.py} +7 -4
- linkml_store/utils/format_utils.py +93 -0
- linkml_store/utils/object_utils.py +81 -0
- linkml_store/utils/sql_utils.py +46 -7
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/METADATA +17 -6
- linkml_store-0.1.7.dist-info/RECORD +42 -0
- linkml_store-0.1.7.dist-info/entry_points.txt +3 -0
- linkml_store/api/metadata.py +0 -5
- linkml_store-0.0.0.dist-info/RECORD +0 -29
- linkml_store-0.0.0.dist-info/entry_points.txt +0 -3
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/LICENSE +0 -0
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class CollectionConfig(BaseModel):
    """Configuration for a single collection within a database.

    Captures the collection's naming, object type, optional LinkML
    attribute declarations, indexer settings, and visibility flags.
    """

    name: Optional[str] = Field(default=None, description="An optional name for the collection")
    alias: Optional[str] = Field(default=None, description="An optional alias for the collection")
    type: Optional[str] = Field(
        default=None,
        description="The type of object in the collection. TODO; use this instead of name",
    )
    metadata: Optional[Dict] = Field(default=None, description="Optional metadata for the collection")
    attributes: Optional[Dict[str, Dict]] = Field(
        default=None,
        description="Optional attributes for the collection, following LinkML schema",
    )
    indexers: Optional[Dict[str, Dict]] = Field(default=None, description="Optional configuration for indexers")
    hidden: Optional[bool] = Field(default=False, description="Whether the collection is hidden")
    is_prepopulated: Optional[bool] = Field(default=False, description="Whether the collection is prepopulated")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class DatabaseConfig(BaseModel):
    """Configuration for a single database.

    Holds the connection handle, optional schema source (file location or
    inline dict), per-collection configurations, and integrity options.
    """

    handle: str = Field(
        default="duckdb:///:memory:",
        description="The database handle, e.g., 'duckdb:///:memory:' or 'mongodb://localhost:27017'",
    )
    alias: Optional[str] = Field(default=None, description="An optional alias for the database")
    schema_location: Optional[str] = Field(
        default=None,
        description="The location of the schema file, either a path on disk or URL",
    )
    schema_dict: Optional[Dict[str, Any]] = Field(default=None, description="The LinkML schema as a dictionary")
    collections: Dict[str, CollectionConfig] = Field(
        default={},
        description="A dictionary of collection configurations",
    )
    recreate_if_exists: bool = Field(
        default=False,
        description="Whether to recreate the database if it already exists",
    )
    collection_type_slot: Optional[str] = Field(
        default=None,
        description=(
            "For databases that combine multiple collections into a single space, this field"
            "specifies the field that contains the collection type. An example of this is a Solr"
            "index that does not use cores for collections, and instead uses a single global"
            "document space; if this has a field 'document_type', then this field should be set"
        ),
    )
    searchable_slots: Optional[List[str]] = Field(
        default=None,
        description="Optional configuration for search fields",
    )
    ensure_referential_integrity: bool = Field(
        default=False,
        description="Whether to ensure referential integrity",
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class ClientConfig(BaseModel):
    """Top-level client configuration.

    Aggregates the configured databases plus client-wide settings such as
    the schema path and base directory.
    """

    handle: Optional[str] = Field(default=None, description="The client handle")
    databases: Dict[str, DatabaseConfig] = Field(
        default={},
        description="A dictionary of database configurations",
    )
    schema_path: Optional[str] = Field(default=None, description="The path to the LinkML schema file")
    base_dir: Optional[str] = Field(default=None, description="The base directory for the client")
|
linkml_store/api/database.py
CHANGED
|
@@ -1,15 +1,28 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from abc import ABC
|
|
2
|
-
from
|
|
3
|
-
from
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from copy import copy
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, ClassVar, Dict, Iterator, Optional, Sequence, Type, Union
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from linkml.validator.report import Severity, ValidationResult
|
|
10
|
+
except ImportError:
|
|
11
|
+
ValidationResult = None
|
|
4
12
|
|
|
5
13
|
from linkml_runtime import SchemaView
|
|
14
|
+
from linkml_runtime.linkml_model import ClassDefinition, SchemaDefinition
|
|
6
15
|
|
|
7
16
|
from linkml_store.api.collection import Collection
|
|
8
|
-
from linkml_store.api.
|
|
17
|
+
from linkml_store.api.config import CollectionConfig, DatabaseConfig
|
|
9
18
|
from linkml_store.api.queries import Query, QueryResult
|
|
10
19
|
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from linkml_store.api.client import Client
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
11
25
|
|
|
12
|
-
@dataclass
|
|
13
26
|
class Database(ABC):
|
|
14
27
|
"""
|
|
15
28
|
A Database provides access to named collections of data.
|
|
@@ -27,7 +40,7 @@ class Database(ABC):
|
|
|
27
40
|
>>> db.get_collection("Person") == collection
|
|
28
41
|
True
|
|
29
42
|
>>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
|
|
30
|
-
>>> collection.
|
|
43
|
+
>>> collection.insert(objs)
|
|
31
44
|
>>> qr = collection.find()
|
|
32
45
|
>>> len(qr.rows)
|
|
33
46
|
2
|
|
@@ -43,10 +56,82 @@ class Database(ABC):
|
|
|
43
56
|
|
|
44
57
|
"""
|
|
45
58
|
|
|
46
|
-
handle: Optional[str] = None
|
|
47
|
-
recreate_if_exists: Optional[bool] = False
|
|
48
59
|
_schema_view: Optional[SchemaView] = None
|
|
49
60
|
_collections: Optional[Dict[str, Collection]] = None
|
|
61
|
+
parent: Optional["Client"] = None
|
|
62
|
+
metadata: Optional[DatabaseConfig] = None
|
|
63
|
+
collection_class: ClassVar[Optional[Type[Collection]]] = None
|
|
64
|
+
|
|
65
|
+
def __init__(self, handle: Optional[str] = None, metadata: Optional[DatabaseConfig] = None, **kwargs):
    """Initialize the database wrapper.

    :param handle: connection handle (e.g. 'duckdb:///:memory:')
    :param metadata: pre-built configuration; when omitted, one is built
        from ``handle`` and ``kwargs``
    :raises ValueError: if both ``handle`` and ``metadata.handle`` are set
        and disagree
    """
    self.metadata = metadata if metadata else DatabaseConfig(handle=handle, **kwargs)
    conflicting = (
        handle is not None
        and self.metadata.handle is not None
        and handle != self.metadata.handle
    )
    if conflicting:
        raise ValueError(f"Handle mismatch: {handle} != {self.metadata.handle}")
    self._initialize_schema()
    self._initialize_collections()
|
|
74
|
+
|
|
75
|
+
def _initialize_schema(self, **kwargs):
    """Initialize the schema view from the database configuration.

    Loads a schema from ``schema_location`` (with ``{base_dir}``
    interpolation) and/or builds one from the inline ``schema_dict``.

    :param kwargs: additional arguments (currently unused)
    """
    db_config = self.metadata
    if db_config.schema_location:
        # BUGFIX: a detached database (parent is None) previously raised
        # AttributeError here; also avoid formatting the literal string
        # "None" into the path when base_dir is unset.
        base_dir = self.parent.metadata.base_dir if self.parent is not None else None
        schema_location = db_config.schema_location.format(base_dir=base_dir or "")
        logger.info(f"Loading schema from: {schema_location}")
        self.load_schema_view(schema_location)
    if db_config.schema_dict:
        schema_dict = copy(db_config.schema_dict)
        # SchemaDefinition requires id and name; supply placeholders.
        for required_key in ("id", "name"):
            if required_key not in schema_dict:
                schema_dict[required_key] = "tmp"
        self.set_schema_view(SchemaView(SchemaDefinition(**schema_dict)))
|
|
88
|
+
|
|
89
|
+
def from_config(self, db_config: DatabaseConfig, **kwargs):
    """
    Initialize a database from a configuration.

    TODO: DEPRECATE

    Replaces the current metadata with ``db_config`` and re-runs schema
    and collection initialization, returning self for chaining.

    :param db_config: database configuration
    :param kwargs: additional arguments
    :return: this database instance
    """
    self.metadata = db_config
    self._initialize_schema()
    self._initialize_collections()
    return self
|
|
102
|
+
|
|
103
|
+
def _initialize_collections(self):
    """Create Collection objects for every collection declared in the config.

    Each configured collection gets a name (defaulting to its config key),
    is instantiated via :meth:`create_collection`, and any declared
    attributes are registered in the schema as an ad-hoc class.
    """
    # BUGFIX/cleanup: removed a dead `if False and ...` branch and the
    # commented-out alternative implementation that preceded it.
    for name, collection_config in self.metadata.collections.items():
        alias = collection_config.alias
        if not collection_config.name:
            collection_config.name = name
        _collection = self.create_collection(name, alias=alias, metadata=collection_config)
        if collection_config.attributes:
            # Register declared attributes as a class so that
            # class_definition() resolves for this collection.
            sv = self.schema_view
            cd = ClassDefinition(name, attributes=collection_config.attributes)
            sv.schema.classes[cd.name] = cd
            sv.set_modified()
|
|
127
|
+
|
|
128
|
+
@property
def recreate_if_exists(self) -> bool:
    """Whether the database should be recreated if it already exists."""
    return self.metadata.recreate_if_exists
|
|
131
|
+
|
|
132
|
+
@property
def handle(self) -> str:
    """The connection handle for this database, from its configuration."""
    return self.metadata.handle
|
|
50
135
|
|
|
51
136
|
def store(self, obj: Dict[str, str], **kwargs):
|
|
52
137
|
"""
|
|
@@ -55,13 +140,28 @@ class Database(ABC):
|
|
|
55
140
|
:param obj: object to store
|
|
56
141
|
:param kwargs: additional arguments
|
|
57
142
|
"""
|
|
143
|
+
sv = self.schema_view
|
|
144
|
+
roots = [c for c in sv.all_classes().values() if c.tree_root]
|
|
145
|
+
root = roots[0] if roots else None
|
|
58
146
|
for k, v in obj.items():
|
|
147
|
+
if root:
|
|
148
|
+
slot = sv.induced_slot(k, root.name)
|
|
149
|
+
if not slot:
|
|
150
|
+
raise ValueError(f"Cannot determine type for {k}")
|
|
151
|
+
else:
|
|
152
|
+
slot = None
|
|
153
|
+
if isinstance(v, dict):
|
|
154
|
+
logger.debug(f"Coercing dict to list: {v}")
|
|
155
|
+
v = [v]
|
|
59
156
|
if not isinstance(v, list):
|
|
60
157
|
continue
|
|
61
158
|
if not v:
|
|
62
159
|
continue
|
|
63
|
-
|
|
64
|
-
|
|
160
|
+
if slot:
|
|
161
|
+
collection = self.get_collection(slot.range, create_if_not_exists=True)
|
|
162
|
+
else:
|
|
163
|
+
collection = self.get_collection(k, create_if_not_exists=True)
|
|
164
|
+
collection.replace(v)
|
|
65
165
|
|
|
66
166
|
def commit(self, **kwargs):
|
|
67
167
|
"""
|
|
@@ -75,8 +175,17 @@ class Database(ABC):
|
|
|
75
175
|
"""
|
|
76
176
|
raise NotImplementedError()
|
|
77
177
|
|
|
178
|
+
@property
def _collection_class(self) -> Type[Collection]:
    """Concrete Collection subclass used by this adapter; subclasses override."""
    raise NotImplementedError()
|
|
181
|
+
|
|
78
182
|
def create_collection(
|
|
79
|
-
self,
|
|
183
|
+
self,
|
|
184
|
+
name: str,
|
|
185
|
+
alias: Optional[str] = None,
|
|
186
|
+
metadata: Optional[CollectionConfig] = None,
|
|
187
|
+
recreate_if_exists=False,
|
|
188
|
+
**kwargs,
|
|
80
189
|
) -> Collection:
|
|
81
190
|
"""
|
|
82
191
|
Create a new collection
|
|
@@ -91,11 +200,28 @@ class Database(ABC):
|
|
|
91
200
|
:param name: name of the collection
|
|
92
201
|
:param alias: alias for the collection
|
|
93
202
|
:param metadata: metadata for the collection
|
|
203
|
+
:param recreate_if_exists: recreate the collection if it already exists
|
|
94
204
|
:param kwargs: additional arguments
|
|
95
205
|
"""
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
206
|
+
if not name:
|
|
207
|
+
raise ValueError(f"Collection name must be provided: alias: {alias} metadata: {metadata}")
|
|
208
|
+
collection_cls = self.collection_class
|
|
209
|
+
collection = collection_cls(name=name, alias=alias, parent=self, metadata=metadata)
|
|
210
|
+
if metadata and metadata.attributes:
|
|
211
|
+
sv = self.schema_view
|
|
212
|
+
schema = sv.schema
|
|
213
|
+
cd = ClassDefinition(name=metadata.type, attributes=metadata.attributes)
|
|
214
|
+
schema.classes[cd.name] = cd
|
|
215
|
+
if not self._collections:
|
|
216
|
+
self._collections = {}
|
|
217
|
+
if not alias:
|
|
218
|
+
alias = name
|
|
219
|
+
self._collections[alias] = collection
|
|
220
|
+
if recreate_if_exists:
|
|
221
|
+
collection.delete_where({}, missing_ok=True)
|
|
222
|
+
return collection
|
|
223
|
+
|
|
224
|
+
def list_collections(self, include_internal=False) -> Sequence[Collection]:
|
|
99
225
|
"""
|
|
100
226
|
List all collections.
|
|
101
227
|
|
|
@@ -112,10 +238,32 @@ class Database(ABC):
|
|
|
112
238
|
>>> [c.name for c in collections]
|
|
113
239
|
['Person', 'Product']
|
|
114
240
|
|
|
241
|
+
:param include_internal: include internal collections
|
|
242
|
+
:return: list of collections
|
|
115
243
|
"""
|
|
116
244
|
if not self._collections:
|
|
117
245
|
self.init_collections()
|
|
118
|
-
return
|
|
246
|
+
return [c for c in self._collections.values() if include_internal or not c.is_internal]
|
|
247
|
+
|
|
248
|
+
def list_collection_names(self, **kwargs) -> Sequence[str]:
    """
    List all collection names.

    Examples
    --------
    >>> from linkml_store.api.client import Client
    >>> client = Client()
    >>> db = client.attach_database("duckdb", alias="test")
    >>> c1 = db.create_collection("Person")
    >>> c2 = db.create_collection("Product")
    >>> collection_names = db.list_collection_names()
    >>> len(collection_names)
    2
    >>> collection_names
    ['Person', 'Product']

    :param kwargs: passed through to :meth:`list_collections`
    :return: names of the (visible) collections
    """
    names = []
    for collection in self.list_collections(**kwargs):
        names.append(collection.name)
    return names
|
|
119
267
|
|
|
120
268
|
def get_collection(self, name: str, create_if_not_exists=True, **kwargs) -> "Collection":
|
|
121
269
|
"""
|
|
@@ -140,7 +288,7 @@ class Database(ABC):
|
|
|
140
288
|
"""
|
|
141
289
|
if not self._collections:
|
|
142
290
|
self.init_collections()
|
|
143
|
-
if name not in self._collections:
|
|
291
|
+
if name not in self._collections.keys():
|
|
144
292
|
if create_if_not_exists:
|
|
145
293
|
self._collections[name] = self.create_collection(name)
|
|
146
294
|
else:
|
|
@@ -167,7 +315,7 @@ class Database(ABC):
|
|
|
167
315
|
>>> client = Client()
|
|
168
316
|
>>> db = client.attach_database("duckdb", alias="test")
|
|
169
317
|
>>> collection = db.create_collection("Person")
|
|
170
|
-
>>> collection.
|
|
318
|
+
>>> collection.insert([{"id": "P1", "name": "John"}, {"id": "P2", "name": "Alice"}])
|
|
171
319
|
>>> query = Query(from_table="Person", where_clause={"name": "John"})
|
|
172
320
|
>>> result = db.query(query)
|
|
173
321
|
>>> len(result.rows)
|
|
@@ -187,12 +335,64 @@ class Database(ABC):
|
|
|
187
335
|
"""
|
|
188
336
|
Return a schema view for the named collection
|
|
189
337
|
"""
|
|
338
|
+
if not self._schema_view:
|
|
339
|
+
self._initialize_schema()
|
|
190
340
|
if not self._schema_view:
|
|
191
341
|
self._schema_view = self.induce_schema_view()
|
|
192
342
|
return self._schema_view
|
|
193
343
|
|
|
194
|
-
def set_schema_view(self, schema_view: SchemaView):
|
|
344
|
+
def set_schema_view(self, schema_view: Union[str, Path, SchemaView]):
    """
    Set the schema view for the database.

    Accepts a ready-made SchemaView, or a path (str or Path) to a schema
    file which is loaded into one. If collections already exist, their
    declared types are aligned against the schema's root class.

    :param schema_view: schema view, or location of a schema to load
    :return:
    """
    if isinstance(schema_view, (str, Path)):
        schema_view = SchemaView(str(schema_view))
    self._schema_view = schema_view
    if not self._collections:
        return
    # align with induced schema
    candidate_roots = [c for c in schema_view.all_classes().values() if c.tree_root]
    if not candidate_roots:
        # No explicit tree_root: treat classes never used as a slot range
        # (directly or via an ancestor) as candidate roots.
        referenced_ranges = set()
        for class_name in schema_view.all_classes():
            for slot in schema_view.class_induced_slots(class_name):
                if slot.range:
                    referenced_ranges.add(slot.range)
        candidate_roots = [
            c
            for c in schema_view.all_classes().values()
            if not referenced_ranges.intersection(schema_view.class_ancestors(c.name, reflexive=True))
        ]
    if len(candidate_roots) == 1:
        root = candidate_roots[0]
        for slot in schema_view.class_induced_slots(root.name):
            is_inlined = slot.inlined or slot.inlined_as_list
            if is_inlined and slot.range and slot.name in self._collections:
                coll = self._collections[slot.name]
                if not coll.metadata.type:
                    coll.metadata.type = slot.range
|
|
380
|
+
|
|
381
|
+
def load_schema_view(self, path: Union[str, Path]):
    """
    Load a schema view from a file.

    >>> from linkml_store.api.client import Client
    >>> client = Client()
    >>> db = client.attach_database("duckdb", alias="test")
    >>> db.load_schema_view("tests/input/countries/countries.linkml.yaml")

    :param path: location of the schema file (str or Path)
    :return:
    """
    # str(path) is a no-op for str input and converts a Path.
    self.set_schema_view(SchemaView(str(path)))
|
|
196
396
|
|
|
197
397
|
def induce_schema_view(self) -> SchemaView:
|
|
198
398
|
"""
|
|
@@ -203,13 +403,77 @@ class Database(ABC):
|
|
|
203
403
|
>>> client = Client()
|
|
204
404
|
>>> db = client.attach_database("duckdb", alias="test")
|
|
205
405
|
>>> collection = db.create_collection("Person")
|
|
206
|
-
>>> collection.
|
|
406
|
+
>>> collection.insert([{"id": "P1", "name": "John", "age_in_years": 25},
|
|
207
407
|
... {"id": "P2", "name": "Alice", "age_in_years": 25}])
|
|
208
408
|
>>> schema_view = db.induce_schema_view()
|
|
209
409
|
>>> cd = schema_view.get_class("Person")
|
|
210
410
|
>>> cd.attributes["id"].range
|
|
211
411
|
'string'
|
|
412
|
+
>>> cd.attributes["age_in_years"].range
|
|
413
|
+
'integer'
|
|
212
414
|
|
|
213
415
|
:return: A schema view
|
|
214
416
|
"""
|
|
215
417
|
raise NotImplementedError()
|
|
418
|
+
|
|
419
|
+
def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
    """
    Validate the contents of the database.

    Yields validation results for every collection; when the configuration
    enables it, referential-integrity results follow.

    :param kwargs: passed through to the per-collection validators
    :return: iterator over validation results
    """
    for coll in self.list_collections():
        yield from coll.iter_validate_collection(**kwargs)
    if self.metadata.ensure_referential_integrity:
        yield from self._validate_referential_integrity(**kwargs)
|
|
430
|
+
|
|
431
|
+
def _validate_referential_integrity(self, **kwargs) -> Iterator["ValidationResult"]:
    """
    Validate referential integrity of the database.

    For every slot whose range corresponds to a collection's target class,
    each scalar reference value must resolve to an object in one of those
    collections; unresolved references yield error results.

    :param kwargs: additional arguments (currently unused)
    :return: iterator over validation results
    """
    sv = self.schema_view
    collections_by_type = defaultdict(list)
    for coll in self.list_collections():
        if not coll.target_class_name:
            raise ValueError(f"Collection {coll.name} has no target class")
        collections_by_type[coll.target_class_name].append(coll)
    for coll in self.list_collections():
        cd = coll.class_definition()
        induced_slots = sv.class_induced_slots(cd.name)
        slot_map = {s.name: s for s in induced_slots}
        targets_by_slot = {s.name: collections_by_type.get(s.range, []) for s in induced_slots if s.range}
        for obj in coll.find_iter():
            for key, value in obj.items():
                if key not in targets_by_slot:
                    continue
                candidate_colls = targets_by_slot[key]
                if not candidate_colls:
                    continue
                # Only scalar identifiers are checked here.
                if not isinstance(value, (str, int)):
                    continue
                slot = slot_map[key]
                # any() short-circuits on the first collection that
                # resolves the reference, mirroring the original break.
                if any(target.get_one(value) for target in candidate_colls):
                    continue
                yield ValidationResult(
                    type="ReferentialIntegrity",
                    severity=Severity.ERROR,
                    message=f"Referential integrity error: {slot.range} not found",
                    instantiates=slot.range,
                    instance=value,
                )
|
|
474
|
+
|
|
475
|
+
def drop(self, **kwargs):
    """
    Drop the database and all collections.

    Concrete adapters must override this.
    """
    raise NotImplementedError()
|
linkml_store/api/queries.py
CHANGED
|
@@ -22,7 +22,7 @@ class Query(BaseModel):
|
|
|
22
22
|
|
|
23
23
|
from_table: Optional[str]
|
|
24
24
|
select_cols: Optional[List[str]] = None
|
|
25
|
-
where_clause: Optional[Union[str, List[str], Dict[str,
|
|
25
|
+
where_clause: Optional[Union[str, List[str], Dict[str, Any]]] = None
|
|
26
26
|
sort_by: Optional[List[str]] = None
|
|
27
27
|
limit: Optional[int] = None
|
|
28
28
|
offset: Optional[int] = None
|
|
@@ -30,12 +30,21 @@ class Query(BaseModel):
|
|
|
30
30
|
facet_slots: Optional[List[str]] = None
|
|
31
31
|
|
|
32
32
|
|
|
33
|
+
class FacetCountResult(BaseModel):
    """
    A facet count result.

    Maps each facet group to a ranked list of (value, count) pairs.
    """

    as_dict: Dict[FACET_GROUP, List[Tuple[FACET_GROUP, int]]]
|
|
39
|
+
|
|
40
|
+
|
|
33
41
|
class QueryResult(BaseModel):
|
|
34
42
|
"""
|
|
35
43
|
A query result
|
|
36
44
|
"""
|
|
37
45
|
|
|
38
46
|
query: Optional[Query] = None
|
|
47
|
+
search_term: Optional[str] = None
|
|
39
48
|
num_rows: int
|
|
40
49
|
offset: Optional[int] = 0
|
|
41
50
|
rows: Optional[List[Dict[str, Any]]] = None
|
|
@@ -45,6 +54,8 @@ class QueryResult(BaseModel):
|
|
|
45
54
|
|
|
46
55
|
@property
|
|
47
56
|
def rows_dataframe(self) -> pd.DataFrame:
|
|
57
|
+
if self.ranked_rows is not None:
|
|
58
|
+
self._rows_dataframe = pd.DataFrame([{"score": score, **row} for score, row in self.ranked_rows])
|
|
48
59
|
if self._rows_dataframe is None and self.rows:
|
|
49
60
|
self._rows_dataframe = pd.DataFrame(self.rows)
|
|
50
61
|
return self._rows_dataframe
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ChromaDB Collection
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
7
|
+
|
|
8
|
+
from chromadb.api.models.Collection import Collection as ChromaCollection
|
|
9
|
+
from linkml_runtime.linkml_model import SlotDefinition
|
|
10
|
+
|
|
11
|
+
from linkml_store.api import Collection
|
|
12
|
+
from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
|
|
13
|
+
from linkml_store.api.queries import Query, QueryResult
|
|
14
|
+
from linkml_store.index import Indexer
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ChromaDBCollection(Collection):
    """
    A wrapper for ChromaDB collections.

    Objects are stored as ChromaDB documents: an Indexer derives the
    indexed text, and the (assumed flat) object itself is stored as the
    document's metadata.
    """

    @property
    def native_collection(self) -> ChromaCollection:
        """The underlying chromadb Collection, fetched from the parent client."""
        return self.parent.client.get_collection(self.name)

    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
        """
        Insert one or more objects into the collection.

        :param objs: a single object or a list of objects
        """
        if not isinstance(objs, list):
            objs = [objs]

        documents = []
        metadatas = []
        ids = []
        indexer = Indexer()

        for obj in objs:
            ids.append(self.object_identifier(obj))
            # The serialized object text is what chromadb indexes.
            documents.append(indexer.object_to_text(obj))
            # TODO: handle nesting; chromadb metadata values must be scalar
            metadatas.append(dict(obj))

        self.native_collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids,
        )

    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
        """
        Delete the given objects, matched by their "id" field.

        :param objs: a single object or a list of objects
        :return: number of ids submitted for deletion
        """
        if not isinstance(objs, list):
            objs = [objs]
        ids = [obj["id"] for obj in objs]
        self.native_collection.delete(ids=ids)
        return len(ids)

    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
        """
        Delete all objects matching the given where clause.

        :param where: metadata filter; None or empty deletes everything
        :param missing_ok: accepted for interface compatibility
        :return: number of deleted objects
        """
        logger.info(f"Deleting from {self.target_class_name} where: {where}")
        # chromadb rejects an empty where dict; pass None to match all.
        results = self.native_collection.get(where=where or None)
        # BUGFIX: chromadb's get() returns a columnar dict (keys "ids",
        # "metadatas", ...), not a list of row dicts; iterating the result
        # directly yielded its keys.
        ids = results.get("ids", [])
        self.native_collection.delete(ids=ids)
        return len(ids)

    def query(self, query: Query, **kwargs) -> QueryResult:
        """
        Execute a query against the collection.

        Rows are the stored metadata dicts of the matching documents.

        :param query: the query to run (where_clause and limit honored)
        :return: query result with rows and row count
        """
        chroma_filter = self._build_chroma_filter(query.where_clause)
        if query.limit:
            results = self.native_collection.get(where=chroma_filter, limit=query.limit)
        else:
            results = self.native_collection.get(where=chroma_filter)

        # BUGFIX: get() returns a columnar dict; count hits via "ids" and
        # surface the stored metadata dicts as rows (len() of the raw
        # result previously counted dict keys, not matches).
        ids = results.get("ids", [])
        rows = results.get("metadatas") or []
        return QueryResult(query=query, num_rows=len(ids), rows=rows)

    def query_facets(
        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
    ) -> Dict[str, List[Tuple[Any, int]]]:
        """
        Compute (value, count) facet pairs for the given columns.

        BUGFIX: the previous implementation invoked a MongoDB-style
        ``aggregate`` pipeline, which chromadb collections do not provide
        and which raised AttributeError; facets are now computed
        client-side over the stored metadata.

        :param where: metadata filter restricting the faceted documents
        :param facet_columns: columns to facet on; defaults to all attributes
        :param facet_limit: maximum number of facet values per column
        :return: mapping of column name to ranked (value, count) pairs
        """
        cd = self.class_definition()
        if not facet_columns:
            facet_columns = list(cd.attributes.keys())

        fetched = self.native_collection.get(where=where or None)
        metadatas = fetched.get("metadatas") or []

        results: Dict[str, List[Tuple[Any, int]]] = {}
        for col in facet_columns:
            logger.debug(f"Faceting on {col}")
            if isinstance(col, tuple):
                sd = SlotDefinition(name="PLACEHOLDER")
            else:
                sd = cd.attributes[col]
            counts: Dict[Any, int] = {}
            for metadata in metadatas:
                value = metadata.get(col) if isinstance(metadata, dict) else None
                if value is None:
                    continue
                if sd.multivalued and isinstance(value, list):
                    # "unwind" multivalued slots, counting each element
                    for element in value:
                        counts[element] = counts.get(element, 0) + 1
                else:
                    counts[value] = counts.get(value, 0) + 1
            ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
            results[col] = ranked[:facet_limit]

        return results

    def _build_chroma_filter(self, where_clause: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
        """
        Translate a query where-clause into a chromadb metadata filter.

        BUGFIX: tolerates a missing where clause — returns None (match
        everything) instead of raising AttributeError on ``None.items()``.
        """
        if not where_clause:
            return None
        return dict(where_clause)
|