linkml-store 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/api/collection.py
@@ -0,0 +1,1280 @@
+"""A structure for representing collections of similar objects."""
+
+import hashlib
+import json
+import logging
+from collections import defaultdict
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    ClassVar,
+    Dict,
+    Generic,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    TextIO,
+    Tuple,
+    Type,
+    Union,
+)
+
+import numpy as np
+from linkml_runtime import SchemaView
+from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
+from linkml_runtime.linkml_model.meta import ArrayExpression
+from pydantic import BaseModel
+
+from linkml_store.api.types import DatabaseType
+from linkml_store.index import get_indexer
+from linkml_store.utils.format_utils import load_objects, load_objects_from_url
+from linkml_store.utils.object_utils import clean_empties
+from linkml_store.utils.patch_utils import PatchDict, apply_patches_to_list, patches_from_objects_lists
+
+try:
+    from linkml.validator.report import ValidationResult
+except ImportError:
+    ValidationResult = None
+
+from linkml_store.api.config import CollectionConfig
+from linkml_store.api.queries import Query, QueryResult
+from linkml_store.index.indexer import Indexer
+
+if TYPE_CHECKING:
+    from linkml_store.api.database import Database
+
+logger = logging.getLogger(__name__)
+
+OBJECT = Union[Dict[str, Any], BaseModel, Type]
+
+DEFAULT_FACET_LIMIT = 100
+IDENTIFIER = str
+FIELD_NAME = str
+
+
+class Collection(Generic[DatabaseType]):
+    """
+    A collection is an organized set of objects of the same or similar type.
+
+    - For relational databases, a collection is typically a table
+    - For document databases such as MongoDB, a collection is the native type
+    - For a file system, a collection could be a single tabular file such as Parquet or CSV.
+
+    Collection objects are typically not created directly - instead they are generated
+    from a parent :class:`.Database` object:
+
+    >>> from linkml_store import Client
+    >>> client = Client()
+    >>> db = client.attach_database("duckdb", alias="test")
+    >>> collection = db.create_collection("Person")
+    >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+    >>> collection.insert(objs)
+    """
+
+    # name: str
+    parent: Optional[DatabaseType] = None
+    _indexers: Optional[Dict[str, Indexer]] = None
+    _initialized: Optional[bool] = None
+    # hidden: Optional[bool] = False
+
+    metadata: Optional[CollectionConfig] = None
+    default_index_name: ClassVar[str] = "simple"
+
+    def __init__(
+        self, name: str, parent: Optional["Database"] = None, metadata: Optional[CollectionConfig] = None, **kwargs
+    ):
+        self.parent = parent
+        if metadata:
+            self.metadata = metadata
+        else:
+            self.metadata = CollectionConfig(type=name, **kwargs)
+        if not self.metadata.alias:
+            self.metadata.alias = name
+        if not self.metadata.type:
+            self.metadata.type = name
+        # if name is not None and self.metadata.name is not None and name != self.metadata.name:
+        #     raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")
+
+    @property
+    def hidden(self) -> bool:
+        """
+        True if the collection is hidden.
+
+        An example of a hidden collection is a collection that indexes another
+        collection.
+
+        :return: True if the collection is hidden
+        """
+        # return self.metadata.hidden
+
+    @property
+    def target_class_name(self):
+        """
+        Return the name of the class that this collection represents.
+
+        This MUST be a LinkML class name.
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person", alias="persons")
+        >>> collection.target_class_name
+        'Person'
+
+        >>> collection = db.create_collection("Organization")
+        >>> collection.target_class_name
+        'Organization'
+        >>> collection.alias
+        'Organization'
+
+        :return: name of the class which members of this collection instantiate
+        """
+        # TODO: this is a shim layer until we can normalize on this
+        if self.metadata.type:
+            return self.metadata.type
+        return self.alias
+
+    @property
+    def alias(self):
+        """
+        Return the primary name/alias used for the collection.
+
+        This MAY be the name of the LinkML class, but it may be desirable
+        to have an alias, for example "persons", which collects all instances
+        of the class Person.
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person", alias="persons")
+        >>> collection.alias
+        'persons'
+
+        If no explicit alias is provided, then the target class name is used:
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person")
+        >>> collection.alias
+        'Person'
+
+        The alias SHOULD be used for table names in SQL.
+
+        For nested data, the alias SHOULD be used as the key; e.g.
+
+        .. code-block:: json
+
+            { "persons": [ { "name": "Alice" }, { "name": "Bob" } ] }
+
+        :return:
+        """
+        # TODO: this is a shim layer until we can normalize on this
+        if self.metadata.alias:
+            return self.metadata.alias
+        return self.target_class_name
+
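
The alias/target-class split above is what lets several collections share one LinkML class. A minimal sketch based on the doctests in this file (the "employees" and "customers" aliases are illustrative, not from the package):

    from linkml_store import Client

    client = Client()
    db = client.attach_database("duckdb", alias="test")

    # Two collections backed by the same LinkML class, distinguished by alias
    employees = db.create_collection("Person", alias="employees")
    customers = db.create_collection("Person", alias="customers")

    assert employees.target_class_name == customers.target_class_name == "Person"
    assert employees.alias == "employees"
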
+    def replace(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+        """
+        Replace the entire contents of the collection with the given objects.
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person")
+        >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+        >>> collection.replace(objs)
+
+        :param objs:
+        :param kwargs:
+        :return:
+        """
+        self.delete_where({})
+        self.insert(objs, **kwargs)
+
+    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+        """
+        Add one or more objects to the collection.
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person")
+        >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+        >>> collection.insert(objs)
+
+        :param objs:
+        :param kwargs:
+        :return:
+        """
+        raise NotImplementedError
+
+    def index(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        index_name: Optional[str] = None,
+        replace: bool = False,
+        unique: bool = False,
+        **kwargs,
+    ) -> None:
+        """
+        Index objects in the collection.
+
+        :param objs:
+        :param index_name:
+        :param replace: whether to replace the existing index
+        :param unique: whether to declare the index unique
+        :param kwargs:
+        :return:
+        """
+        raise NotImplementedError
+
+    def upsert(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        filter_fields: List[str],
+        update_fields: Union[List[str], None] = None,
+        **kwargs,
+    ):
+        """
+        Add or update one or more objects in the collection.
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("mongodb", alias="test")
+        >>> collection = db.create_collection("Person")
+        >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+        >>> collection.upsert(objs, filter_fields=["id"])
+
+        :param objs:
+        :param filter_fields: List of field names to use as the filter for matching existing objects.
+        :param update_fields: List of field names to include in the update. If None, all fields are updated.
+        :param kwargs:
+
+        :return:
+        """
+        raise NotImplementedError
+
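
How filter_fields and update_fields combine is easiest to see in a sketch; this assumes the MongoDB backend from the doctest above, and the field choices are illustrative:

    from linkml_store import Client

    client = Client()
    db = client.attach_database("mongodb", alias="test")
    collection = db.create_collection("Person")
    collection.insert([{"id": "P1", "name": "John", "age_in_years": 30}])

    # Match existing objects on "id"; rewrite only "age_in_years",
    # leaving other stored fields untouched
    collection.upsert(
        [{"id": "P1", "age_in_years": 31}],
        filter_fields=["id"],
        update_fields=["age_in_years"],
    )
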
+    def _pre_query_hook(self, query: Optional[Query] = None, **kwargs):
+        """
+        Pre-query hook.
+
+        This is called before a query is executed. It is used to materialize derivations and indexes.
+
+        :param query:
+        :param kwargs:
+        :return:
+        """
+        logger.debug(f"Pre-query hook (state: {self._initialized}; Q={query})")  # at logging.info this is very noisy
+        if not self._initialized:
+            self._materialize_derivations()
+            self._initialized = True
+
+    def _pre_insert_hook(self, objs: List[OBJECT], **kwargs):
+        if self.metadata.validate_modifications:
+            errors = list(self.iter_validate_collection(objs))
+            if errors:
+                raise ValueError(f"Validation errors: {errors}")
+
+    def _post_insert_hook(self, objs: List[OBJECT], **kwargs):
+        self._initialized = True
+        patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
+        self._broadcast(patches, **kwargs)
+        self._post_modification_hook(**kwargs)
+
+    def _post_delete_hook(self, **kwargs):
+        self._post_modification_hook(**kwargs)
+
+    def _post_modification_hook(self, **kwargs):
+        for indexer in self.indexers.values():
+            ix_collection_name = self.get_index_collection_name(indexer)
+            ix_collection = self.parent.get_collection(ix_collection_name)
+            # Currently any update to the source triggers complete reindexing
+            # TODO: make this more efficient by only deleting modified objects
+            ix_collection.delete_where({})
+
+    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
+        """
+        Delete one or more objects from the collection.
+
+        First let's set up a collection:
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person")
+        >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+        >>> collection.insert(objs)
+        >>> collection.find({}).num_rows
+        2
+
+        Now let's delete an object:
+
+        >>> collection.delete(objs[0])
+        >>> collection.find({}).num_rows
+        1
+
+        Deleting the same object again should have no effect:
+
+        >>> collection.delete(objs[0])
+        >>> collection.find({}).num_rows
+        1
+
+        :param objs:
+        :param kwargs:
+        :return:
+        """
+        raise NotImplementedError
+
+    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
+        """
+        Delete objects that match a query.
+
+        First let's set up a collection:
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person")
+        >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+        >>> collection.insert(objs)
+
+        Now let's delete an object:
+
+        >>> collection.delete_where({"id": "P1"})
+        >>> collection.find({}).num_rows
+        1
+
+        Match everything:
+
+        >>> collection.delete_where({})
+        >>> collection.find({}).num_rows
+        0
+
+        :param where: where conditions
+        :param missing_ok: if True, do not raise an error if the collection does not exist
+        :param kwargs:
+        :return: number of objects deleted (or -1 if unsupported)
+        """
+        raise NotImplementedError
+
+    def update(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+        """
+        Update one or more objects in the collection.
+
+        :param objs:
+        :param kwargs:
+        :return:
+        """
+        raise NotImplementedError
+
+    def _create_query(self, **kwargs) -> Query:
+        return Query(from_table=self.alias, **kwargs)
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        """
+        Run a query against the collection.
+
+        First let's load a collection:
+
+        >>> from linkml_store import Client
+        >>> from linkml_store.utils.format_utils import load_objects
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb")
+        >>> collection = db.create_collection("Country")
+        >>> objs = load_objects("tests/input/countries/countries.jsonl")
+        >>> collection.insert(objs)
+
+        Now let's run a query, expressed as a :class:`Query` object over the
+        collection's table:
+
+        >>> query = Query(from_table="Country", where_clause={"code": "FR"})
+        >>> qr = collection.query(query)
+        >>> qr.rows[0]["name"]
+        'France'
+
+        :param query:
+        :param kwargs:
+        :return:
+        """
+        self._pre_query_hook()
+        return self.parent.query(query, **kwargs)
+
+    def query_facets(
+        self, where: Optional[Dict] = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+    ) -> Dict[str, List[Tuple[Any, int]]]:
+        """
+        Run a query to get facet counts for one or more columns.
+
+        This function takes a where clause and a list of column names.
+        It generates and executes a facet count query for each specified column and returns
+        the results as a dictionary where the keys are the column names and the values are
+        lists of (value, count) tuples.
+
+        The facet count query is generated by modifying the original query's WHERE clause
+        to exclude conditions directly related to the facet column. This allows for counting
+        the occurrences of each unique value in the facet column while still applying the
+        other filtering conditions.
+
+        :param where: where conditions for the base query
+        :param facet_columns: a list of column names to get facet counts for
+        :param facet_limit: maximum number of facet values returned per column
+        :return: a dictionary where keys are column names and values are lists of
+            (value, count) tuples for each unique value in the respective column
+        """
+        raise NotImplementedError
+
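
query_facets has no doctest; a hypothetical call against the countries collection used elsewhere in this file might look like the following, on a backend that implements it (the "continent" column and the counts shown are assumptions, not verified output):

    facets = collection.query_facets(
        where={"continent": "Europe"},
        facet_columns=["continent", "code"],
    )
    # e.g. {"continent": [("Europe", 5)], "code": [("FR", 1), ("DE", 1)]}
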
+    def get(self, ids: Optional[List[IDENTIFIER]], **kwargs) -> QueryResult:
+        """
+        Get one or more objects by ID.
+
+        :param ids:
+        :param kwargs:
+        :return:
+        """
+        id_field = self.identifier_attribute_name
+        if not id_field:
+            raise ValueError(f"No identifier for {self.name}")
+        if len(ids) == 1:
+            return self.find({id_field: ids[0]})
+        else:
+            return self.find({id_field: {"$in": ids}})
+
+    def get_one(self, id: IDENTIFIER, **kwargs) -> Optional[OBJECT]:
+        """
+        Get one object by ID.
+
+        :param id:
+        :param kwargs:
+        :return:
+        """
+        if not id:
+            raise ValueError("Must pass an ID")
+        id_field = self.identifier_attribute_name
+        if not id_field:
+            raise ValueError(f"No identifier for {self.name}")
+        w = {id_field: id}
+        qr = self.find(w)
+        if qr.num_rows == 1:
+            return qr.rows[0]
+        return None
+
+    def find(
+        self,
+        where: Optional[Any] = None,
+        select_cols: Optional[List[str]] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Find objects in the collection using a where query.
+
+        As an example, first load a collection:
+
+        >>> from linkml_store import Client
+        >>> from linkml_store.utils.format_utils import load_objects
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb")
+        >>> collection = db.create_collection("Country")
+        >>> objs = load_objects("tests/input/countries/countries.jsonl")
+        >>> collection.insert(objs)
+
+        Now let's find all objects:
+
+        >>> qr = collection.find({})
+        >>> qr.num_rows
+        20
+
+        We can do a more restrictive query:
+
+        >>> qr = collection.find({"code": "FR"})
+        >>> qr.num_rows
+        1
+        >>> qr.rows[0]["name"]
+        'France'
+
+        :param where:
+        :param select_cols:
+        :param kwargs:
+        :return:
+        """
+        query = self._create_query(
+            where_clause=where,
+            select_cols=select_cols,
+        )
+        self._pre_query_hook(query)
+        return self.query(query, **kwargs)
+
+    def find_iter(self, where: Optional[Any] = None, page_size=100, **kwargs) -> Iterator[OBJECT]:
+        """
+        Find objects in the collection using a where query, yielding results one
+        at a time while paging through the underlying queries page_size rows at a time.
+
+        :param where:
+        :param page_size: number of rows fetched per underlying query
+        :param kwargs:
+        :return:
+        """
+        total_rows = None
+        offset = 0
+        if page_size < 1:
+            raise ValueError(f"Invalid page size: {page_size}")
+        while True:
+            qr = self.find(where=where, offset=offset, limit=page_size, **kwargs)
+            if total_rows is None:
+                # num_rows is the total count of matching rows, not the page size
+                total_rows = qr.num_rows
+            if not qr.rows:
+                return
+            for row in qr.rows:
+                yield row
+            offset += page_size
+            if offset >= total_rows:
+                break
+        return
+
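
find_iter streams results without materializing the whole collection, re-querying with a sliding offset; a minimal sketch:

    # Stream matching rows 100 at a time rather than loading them all at once
    for person in collection.find_iter({"age_in_years": 30}, page_size=100):
        print(person["name"])
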
+    def search(
+        self,
+        query: str,
+        where: Optional[Any] = None,
+        index_name: Optional[str] = None,
+        limit: Optional[int] = None,
+        select_cols: Optional[List[str]] = None,
+        mmr_relevance_factor: Optional[float] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Search the collection using a text-based index.
+
+        Example:
+
+        >>> from linkml_store import Client
+        >>> from linkml_store.utils.format_utils import load_objects
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb")
+        >>> collection = db.create_collection("Country")
+        >>> objs = load_objects("tests/input/countries/countries.jsonl")
+        >>> collection.insert(objs)
+
+        Now let's index, using the simple trigram-based index:
+
+        >>> index = get_indexer("simple")
+        >>> _ = collection.attach_indexer(index)
+
+        Now let's find all objects:
+
+        >>> qr = collection.search("France")
+        >>> score, top_obj = qr.ranked_rows[0]
+        >>> assert score > 0.1
+        >>> top_obj["code"]
+        'FR'
+
+        :param query:
+        :param where:
+        :param index_name:
+        :param limit:
+        :param select_cols:
+        :param mmr_relevance_factor: passed through to the indexer's search
+        :param kwargs:
+        :return:
+        """
+        self._pre_query_hook()
+        if index_name is None:
+            if len(self.indexers) == 1:
+                index_name = list(self.indexers.keys())[0]
+            else:
+                logger.warning("Multiple indexes found. Using default index.")
+                index_name = self.default_index_name
+        ix_coll = self.parent.get_collection(self._index_collection_name(index_name))
+        if index_name not in self.indexers:
+            logger.debug(f"Indexer not found: {index_name} -- creating")
+            ix = get_indexer(index_name)
+            if not self._indexers:
+                self._indexers = {}
+            self._indexers[index_name] = ix
+        ix = self.indexers.get(index_name)
+        if not ix:
+            raise ValueError(f"No index named {index_name}")
+        logger.debug(f"Using indexer {type(ix)} with name {index_name}")
+        if ix_coll.size() == 0:
+            logger.info(f"Index {index_name} is empty; indexing all objects")
+            all_objs = self.find(limit=-1).rows
+            if all_objs:
+                # print(f"Index {index_name} is empty; indexing all objects {len(all_objs)}")
+                self.index_objects(all_objs, index_name, replace=True, **kwargs)
+                assert ix_coll.size() > 0
+        qr = ix_coll.find(where=where, limit=-1, **kwargs)
+        index_col = ix.index_field
+
+        # TODO: optimize this for large indexes
+        def row2array(row):
+            v = row[index_col]
+            if isinstance(v, str):
+                # sqlite stores arrays as strings
+                v = json.loads(v)
+            return np.array(v, dtype=float)
+
+        vector_pairs = [(row, row2array(row)) for row in qr.rows]
+        results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
+        for r in results:
+            del r[1][index_col]
+        if select_cols:
+            new_results = []
+            for r in results:
+                new_results.append((r[0], {k: v for k, v in r[1].items() if k in select_cols}))
+            results = new_results
+        new_qr = QueryResult(num_rows=len(results))
+        new_qr.ranked_rows = results
+        new_qr.rows = [r[1] for r in results]
+        return new_qr
+
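
A sketch of search with MMR re-ranking against the countries collection from the doctest; mmr_relevance_factor is forwarded to the indexer's search, and the 0.8 value is illustrative:

    index = get_indexer("simple")
    _ = collection.attach_indexer(index)

    qr = collection.search("France", limit=5, mmr_relevance_factor=0.8)
    for score, obj in qr.ranked_rows:
        print(f"{score:.3f}", obj["name"])
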
+    def group_by(
+        self,
+        group_by_fields: List[str],
+        inlined_field="objects",
+        agg_map: Optional[Dict[str, str]] = None,
+        where: Optional[Dict] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Group objects in the collection by one or more fields.
+
+        :param group_by_fields: field(s) to group by; a single field name is also accepted
+        :param inlined_field: key under which the grouped objects are nested
+        :param agg_map: optional map from aggregation type ("first", "list") to field names
+        :param where:
+        :param kwargs:
+        :return:
+        """
+        if isinstance(group_by_fields, str):
+            group_by_fields = [group_by_fields]
+        df = self.find(where=where, limit=-1).rows_dataframe
+
+        # Handle the case where agg_map is None
+        if agg_map is None:
+            agg_map = {}
+
+        pk_fields = agg_map.get("first", []) + group_by_fields
+        list_fields = agg_map.get("list", [])
+        if not list_fields:
+            list_fields = [a for a in df.columns if a not in pk_fields]
+
+        grouped_objs = defaultdict(list)
+        for _, row in df.iterrows():
+            pk = tuple(row[pk_fields])
+            grouped_objs[pk].append({k: row[k] for k in list_fields})
+        results = []
+        for pk, objs in grouped_objs.items():
+            top_obj = {k: v for k, v in zip(pk_fields, pk)}
+            top_obj[inlined_field] = objs
+            results.append(top_obj)
+        r = QueryResult(num_rows=len(results), rows=results)
+        return r
+
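
A sketch of group_by on a fresh collection (the "dept" field is illustrative): grouping fields become the top-level keys, and every remaining column is nested under inlined_field:

    collection.insert([
        {"id": "P1", "dept": "sales", "name": "John"},
        {"id": "P2", "dept": "sales", "name": "Alice"},
    ])
    qr = collection.group_by(["dept"], inlined_field="members")
    # qr.rows[0] == {"dept": "sales",
    #                "members": [{"id": "P1", "name": "John"},
    #                            {"id": "P2", "name": "Alice"}]}
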
+    @property
+    def is_internal(self) -> bool:
+        """
+        Check if the collection is internal.
+
+        Internal collections are hidden by default. Examples of internal collections
+        include shadow "index" collections.
+
+        :return:
+        """
+        if not self.alias:
+            raise ValueError(f"Collection has no alias: {self} // {self.metadata}")
+        return self.alias.startswith("internal__")
+
+    def exists(self) -> Optional[bool]:
+        """
+        Check if the collection exists.
+
+        :return:
+        """
+        cd = self.class_definition()
+        return cd is not None and bool(cd.attributes)
+
+    def load_from_source(self, load_if_exists=False):
+        """
+        Load objects from the source location.
+
+        :param load_if_exists:
+        :return:
+        """
+        if not load_if_exists and self.exists():
+            return
+        metadata = self.metadata
+        if metadata.source:
+            source = metadata.source
+            kwargs = source.arguments or {}
+            if source.local_path:
+                objects = load_objects(
+                    metadata.source.local_path,
+                    format=source.format,
+                    expected_type=source.expected_type,
+                    compression=source.compression,
+                    select_query=source.select_query,
+                    **kwargs,
+                )
+            elif metadata.source.url:
+                objects = load_objects_from_url(
+                    metadata.source.url,
+                    format=source.format,
+                    expected_type=source.expected_type,
+                    compression=source.compression,
+                    select_query=source.select_query,
+                    **kwargs,
+                )
+            else:
+                raise ValueError("No source local_path or url provided")
+            self.insert(objects)
+
+    def _check_if_initialized(self) -> bool:
+        return self._initialized
+
+    def _materialize_derivations(self, **kwargs):
+        metadata = self.metadata
+        if not metadata.derived_from:
+            logger.info(f"No metadata for {self.alias}; no derivations")
+            return
+        if self._check_if_initialized():
+            logger.info(f"Already initialized {self.alias}; no derivations")
+            return
+        parent_db = self.parent
+        client = parent_db.parent
+        # cd = self.class_definition()
+        for derivation in metadata.derived_from:
+            # TODO: optimize this; utilize underlying engine
+            logger.info(f"Deriving from {derivation}")
+            if derivation.database:
+                db = client.get_database(derivation.database)
+            else:
+                db = parent_db
+            if derivation.collection:
+                coll = db.get_collection(derivation.collection)
+            else:
+                coll = self
+            # ensure a class definition has been induced for the source collection
+            coll.class_definition()
+            source_obj_iter = coll.find_iter(derivation.where or {})
+            mappings = derivation.mappings
+            if not mappings:
+                raise ValueError(f"No mappings for {self.name}")
+            target_class_name = self.target_class_name
+            from linkml_map.session import Session
+
+            session = Session()
+            session.set_source_schema(db.schema_view.schema)
+            session.set_object_transformer(
+                {
+                    "class_derivations": {
+                        target_class_name: {
+                            "populated_from": coll.target_class_name,
+                            "slot_derivations": mappings,
+                        },
+                    }
+                },
+            )
+            logger.debug(f"Session Spec: {session.object_transformer}")
+            tr_objs = []
+            for source_obj in source_obj_iter:
+                tr_obj = session.transform(source_obj, source_type=coll.target_class_name)
+                tr_objs.append(tr_obj)
+            if not tr_objs:
+                raise ValueError(f"No objects derived from {coll.name}")
+            self.insert(tr_objs)
+            self.commit()
+
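
_materialize_derivations reads database, collection, where, and mappings from each derived_from entry and hands the mappings to linkml-map as slot_derivations. A hypothetical sketch of the config shape implied by those reads (the authoritative schema lives in linkml_store/api/config.py; every name below is illustrative):

    # Hypothetical shape of metadata.derived_from, inferred from the reads above
    derived_from = [
        {
            "database": "source_db",        # optional; defaults to this collection's database
            "collection": "persons",        # optional; defaults to this collection
            "where": {"status": "active"},  # filter applied to the source collection
            "mappings": {                   # slot_derivations handed to linkml-map
                "full_name": {"populated_from": "name"},
            },
        }
    ]
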
+    def size(self) -> int:
+        """
+        Return the number of objects in the collection.
+
+        :return: The number of objects in the collection.
+        """
+        # num_rows reports the total match count even when limit restricts the rows returned
+        return self.find({}, limit=1).num_rows
+
+    def rows_iter(self) -> Iterable[OBJECT]:
+        """
+        Return an iterator over the objects in the collection.
+
+        :return:
+        """
+        yield from self.find({}, limit=-1).rows
+
+    @property
+    def rows(self) -> List[OBJECT]:
+        """
+        Return a list of objects in the collection.
+
+        :return:
+        """
+        return list(self.rows_iter())
+
+    def ranked_rows(self) -> List[Tuple[float, OBJECT]]:
+        """
+        Return a list of objects in the collection, with scores.
+        """
+        return [(n, obj) for n, obj in enumerate(self.rows_iter())]
+
+    def attach_indexer(
+        self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs
+    ) -> Indexer:
+        """
+        Attach an index to the collection.
+
+        As an example, first let's create a collection in a database:
+
+        >>> from linkml_store import Client
+        >>> from linkml_store.utils.format_utils import load_objects
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb")
+        >>> collection = db.create_collection("Country")
+        >>> objs = load_objects("tests/input/countries/countries.jsonl")
+        >>> collection.insert(objs)
+
+        We will create two indexes - one that indexes the whole object
+        (default behavior), the other one indexes the name only:
+
+        >>> full_index = get_indexer("simple")
+        >>> full_index.name = "full"
+        >>> name_index = get_indexer("simple", text_template="{name}")
+        >>> name_index.name = "name"
+        >>> _ = collection.attach_indexer(full_index)
+        >>> _ = collection.attach_indexer(name_index)
+
+        Now let's find objects using the full index, using the string "France".
+        We expect the country France to be the top hit, but the score will
+        be well below 1.0 because we did not match all fields in the object.
+
+        >>> qr = collection.search("France", index_name="full")
+        >>> score, top_obj = qr.ranked_rows[0]
+        >>> assert score > 0.1
+        >>> assert score < 0.5
+        >>> top_obj["code"]
+        'FR'
+
+        Now using the name index:
+
+        >>> qr = collection.search("France", index_name="name")
+        >>> score, top_obj = qr.ranked_rows[0]
+        >>> assert score > 0.99
+        >>> top_obj["code"]
+        'FR'
+
+        :param index:
+        :param name:
+        :param auto_index: Automatically index all objects in the collection
+        :param kwargs:
+        :return:
+        """
+        if isinstance(index, str):
+            index = get_indexer(index)
+        if name:
+            index.name = name
+        if not index.name:
+            index.name = type(index).__name__.lower()
+        index_name = index.name
+        if not index_name:
+            raise ValueError("Index must have a name")
+        if not self._indexers:
+            self._indexers = {}
+        self._indexers[index_name] = index
+        if auto_index:
+            all_objs = self.find(limit=-1).rows
+            logger.info(f"Auto-indexing {len(all_objs)} objects")
+            self.index_objects(all_objs, index_name, replace=True, **kwargs)
+        return index
+
+    def get_index_collection_name(self, indexer: Indexer) -> str:
+        return self._index_collection_name(indexer.name)
+
+    def _index_collection_name(self, index_name: str) -> str:
+        """
+        Create a name for the special collection that holds index data.
+
+        :param index_name:
+        :return:
+        """
+        return f"internal__index__{self.alias}__{index_name}"
+
+    def index_objects(self, objs: List[OBJECT], index_name: str, replace=False, **kwargs):
+        """
+        Index a list of objects using a specified index.
+
+        By default, the indexed objects will be stored in a shadow
+        collection in the same database, with additional fields for the index vector.
+
+        TODO: Support batch_size parameter for processing large collections
+        TODO: Implement parallel indexing for multiple objects
+        TODO: Add progress reporting for long-running index operations
+        TODO: Support incremental indexing (only index new/changed items)
+
+        :param objs:
+        :param index_name: e.g. simple, llm
+        :param replace:
+        :param kwargs:
+        :return:
+        """
+        ix = self._indexers.get(index_name, None)
+        if not ix:
+            raise ValueError(f"No index named {index_name}")
+        ix_coll_name = self._index_collection_name(index_name)
+        ix_coll = self.parent.get_collection(ix_coll_name, create_if_not_exists=True)
+        if not ix_coll.metadata:
+            ix_coll.metadata = CollectionConfig()
+        if not ix_coll.metadata.additional_properties:
+            ix_coll.metadata.additional_properties = {}
+        # for k in ["name", "index_type", "index_field", "index_value_field"]:
+        #     ix_coll.metadata.additional_properties[k] = getattr(ix, k)
+        for k, v in ix.model_dump().items():
+            ix_coll.metadata.additional_properties[k] = v
+        ix_coll.store_metadata()
+        # TODO: Process vectors in batches rather than all at once
+        vectors = [list(float(e) for e in v) for v in ix.objects_to_vectors(objs)]
+        objects_with_ix = []
+        index_col = ix.index_field
+        # TODO: implement this
+        index_value_col = ix.index_value_field
+        for obj, vector in zip(objs, vectors):
+            # TODO: id field
+            objects_with_ix.append({**obj, **{index_col: vector}})
+        if replace:
+            schema = self.parent.schema_view.schema
+            logger.info(f"Checking if {ix_coll_name} is in {schema.classes.keys()}")
+            if ix_coll_name in schema.classes:
+                ix_coll.delete_where()
+
+        # TODO: Use bulk insert operations for better performance
+        logger.info(f"Inserting {len(objects_with_ix)} objects into {ix_coll_name}")
+        ix_coll.insert(objects_with_ix, **kwargs)
+        ix_coll.commit()
+
+    def list_index_names(self) -> List[str]:
+        """
+        Return a list of index names.
+
+        :return:
+        """
+        return list(self._indexers.keys())
+
+    @property
+    def indexers(self) -> Dict[str, Indexer]:
+        """
+        Return the attached indexers, keyed by name.
+
+        :return:
+        """
+        return self._indexers if self._indexers else {}
+
+    def peek(self, limit: Optional[int] = None) -> QueryResult:
+        """
+        Return the first N objects in the collection.
+
+        :param limit:
+        :return:
+        """
+        q = self._create_query()
+        return self.query(q, limit=limit)
+
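
Index vectors live in a shadow collection named internal__index__<alias>__<index_name>, so attaching the "simple" indexer to the Country collection above materializes internal__index__Country__simple. A sketch of inspecting it:

    index = get_indexer("simple")
    _ = collection.attach_indexer(index)  # auto_index=True by default

    # The shadow collection holds the original fields plus the index vector field
    ix_coll = db.get_collection("internal__index__Country__simple")
    assert ix_coll.size() > 0
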
+    def class_definition(self) -> Optional[ClassDefinition]:
+        """
+        Return the class definition for the collection.
+
+        If no schema has been explicitly set, and the native database does not
+        have a schema, then a schema will be induced from the objects in the collection.
+
+        :return:
+        """
+        sv: SchemaView = self.parent.schema_view
+        if sv:
+            cls = sv.get_class(self.target_class_name)
+            # if not cls:
+            #     logger.warning(f"{self.target_class_name} not in {sv.all_classes().keys()} ")
+            #     cls = sv.schema.classes[self.target_class_name]
+            if cls and not cls.attributes:
+                if not sv.class_induced_slots(cls.name):
+                    for att in self._induce_attributes():
+                        cls.attributes[att.name] = att
+                    sv.set_modified()
+            return cls
+        return None
+
+    def _induce_attributes(self) -> List[SlotDefinition]:
+        result = self.find({}, limit=-1)
+        cd = self.induce_class_definition_from_objects(result.rows, max_sample_size=None)
+        return list(cd.attributes.values())
+
+    @property
+    def identifier_attribute_name(self) -> Optional[str]:
+        """
+        Return the name of the identifier attribute for the collection.
+
+        AKA the primary key.
+
+        :return: The name of the identifier attribute, if one exists.
+        """
+        cd = self.class_definition()
+        if cd:
+            for att in self.parent.schema_view.class_induced_slots(cd.name):
+                if att.identifier:
+                    return att.name
+        return None
+
+    def set_identifier_attribute_name(self, name: str):
+        """
+        Set the name of the identifier attribute for the collection.
+
+        AKA the primary key.
+
+        :param name: The name of the identifier attribute.
+        """
+        cd = self.class_definition()
+        if not cd:
+            raise ValueError(f"Cannot find class definition for {self.target_class_name}")
+        id_att = None
+        candidates = []
+        sv: SchemaView = self.parent.schema_view
+        cls = sv.get_class(cd.name)
+        existing_id_slot = sv.get_identifier_slot(cls.name)
+        if existing_id_slot:
+            if existing_id_slot.name == name:
+                return
+            existing_id_slot.identifier = False
+        for att in cls.attributes.values():
+            candidates.append(att.name)
+            if att.name == name:
+                att.identifier = True
+                id_att = att
+            else:
+                att.identifier = False
+        if not id_att:
+            raise ValueError(f"No attribute found with name {name} in {candidates}")
+        sv.set_modified()
+
+    def object_identifier(self, obj: OBJECT, auto=True) -> Optional[IDENTIFIER]:
+        """
+        Return the identifier for an object.
+
+        :param obj:
+        :param auto: If True, generate an identifier if one does not exist.
+        :return:
+        """
+        pk = self.identifier_attribute_name
+        if pk in obj:
+            return obj[pk]
+        elif auto:
+            # TODO: use other unique keys if no primary key
+            as_str = str(obj)
+            md5 = hashlib.md5(as_str.encode()).hexdigest()
+            return md5
+        else:
+            return None
+
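
The auto fallback above derives a deterministic identifier from the object's string form, so identical dicts always map to the same generated ID; a standalone sketch of that logic:

    import hashlib

    def fallback_id(obj: dict) -> str:
        # Mirrors the md5-of-str(obj) fallback in object_identifier
        return hashlib.md5(str(obj).encode()).hexdigest()

    assert fallback_id({"name": "Alice"}) == fallback_id({"name": "Alice"})
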
+    def induce_class_definition_from_objects(
+        self, objs: List[OBJECT], max_sample_size: Optional[int] = None
+    ) -> ClassDefinition:
+        """
+        Induce a class definition from a list of objects.
+
+        This uses a heuristic procedure to infer the class definition from a list of objects.
+        In general it is recommended you explicitly provide a schema.
+
+        :param objs:
+        :param max_sample_size:
+        :return:
+        """
+        # TODO: use schemaview
+        if max_sample_size is None:
+            max_sample_size = 10
+        if not self.target_class_name:
+            raise ValueError(f"No target_class_name for {self.alias}")
+        cd = ClassDefinition(self.target_class_name)
+        keys = defaultdict(list)
+        for obj in objs[0:max_sample_size]:
+            if isinstance(obj, BaseModel):
+                obj = obj.model_dump()
+            if not isinstance(obj, dict):
+                logger.warning(f"Skipping non-dict object: {obj}")
+                continue
+            for k, v in obj.items():
+                keys[k].append(v)
+        for k, vs in keys.items():
+            if k == "_id":
+                continue
+            multivalueds = []
+            inlineds = []
+            rngs = []
+            exact_dimensions_list = []
+            for v in vs:
+                if v is None:
+                    continue
+                if isinstance(v, np.ndarray):
+                    rngs.append("float")
+                    exact_dimensions_list.append(v.shape)
+                    break
+                if isinstance(v, list):
+                    # sample the first item. TODO: more robust strategy
+                    v = v[0] if v else None
+                    multivalueds.append(True)
+                elif isinstance(v, dict):
+                    pass
+                    # TODO: check if this is a nested object or key-value list
+                    # v = list(v.values())[0]
+                    # multivalueds.append(True)
+                else:
+                    multivalueds.append(False)
+                if not v:
+                    continue
+                if isinstance(v, str):
+                    rng = "string"
+                elif isinstance(v, bool):
+                    rng = "boolean"
+                elif isinstance(v, int):
+                    rng = "integer"
+                elif isinstance(v, float):
+                    rng = "float"
+                elif isinstance(v, dict):
+                    rng = None
+                    inlineds.append(True)
+                else:
+                    # raise ValueError(f"No mappings for {type(v)} // v={v}")
+                    rng = None
+                    inlineds.append(False)
+                rngs.append(rng)
+            multivalued = any(multivalueds)
+            inlined = any(inlineds)
+            if multivalued and False in multivalueds:
+                logger.info(f"Mixed list non list: {vs} // inferred= {multivalueds}")
+            # if not rngs:
+            #     raise AssertionError(f"Empty rngs for {k} = {vs}")
+            rng = rngs[0] if rngs else None
+            for other_rng in rngs:
+                coercions = {
+                    ("integer", "float"): "float",
+                }
+                if rng != other_rng:
+                    if (rng, other_rng) in coercions:
+                        rng = coercions[(rng, other_rng)]
+                    elif (other_rng, rng) in coercions:
+                        rng = coercions[(other_rng, rng)]
+                    else:
+                        raise ValueError(f"Conflict: {rng} != {other_rng} for {vs}")
+            logger.debug(f"Inducing {k} as {rng} {multivalued} {inlined}")
+            inlined_as_list = inlined and multivalued
+            cd.attributes[k] = SlotDefinition(
+                k, range=rng, multivalued=multivalued, inlined=inlined, inlined_as_list=inlined_as_list
+            )
+            if exact_dimensions_list:
+                array_expr = ArrayExpression(exact_number_dimensions=len(exact_dimensions_list[0]))
+                cd.attributes[k].array = array_expr
+        sv = self.parent.schema_view
+        sv.schema.classes[self.target_class_name] = cd
+        sv.set_modified()
+        return cd
+
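
A sketch of the induction heuristic on a collection attached to a database, as in the earlier doctests: mixed integer/float values coerce to float, and list values mark a slot multivalued (the field names are illustrative):

    objs = [
        {"id": "X1", "score": 1, "tags": ["a", "b"]},
        {"id": "X2", "score": 2.5},
    ]
    cd = collection.induce_class_definition_from_objects(objs)
    assert cd.attributes["score"].range == "float"  # integer + float coerce to float
    assert cd.attributes["tags"].multivalued        # list value -> multivalued slot
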
+    def import_data(self, location: Union[Path, str, TextIO], **kwargs):
+        """
+        Import data from a file or stream.
+
+        :param location:
+        :param kwargs:
+        :return:
+        """
+        raise NotImplementedError
+
+    def export_data(self, location: Union[Path, str, TextIO], **kwargs):
+        """
+        Export data to a file or stream.
+
+        :param location:
+        :param kwargs:
+        :return:
+        """
+        raise NotImplementedError
+
+    def apply_patches(self, patches: List[PatchDict], **kwargs):
+        """
+        Apply patches to the collection.
+
+        Patches conform to the JSON Patch format.
+
+        :param patches:
+        :param kwargs:
+        :return:
+        """
+        all_objs = self.find(limit=-1).rows
+        primary_key = self.identifier_attribute_name
+        if not primary_key:
+            raise ValueError(f"No primary key for {self.target_class_name}")
+        new_objs = apply_patches_to_list(all_objs, patches, primary_key=primary_key, **kwargs)
+        self.replace(new_objs)
+
+    def diff(self, other: "Collection", **kwargs) -> List[PatchDict]:
+        """
+        Diff two collections, returning JSON Patch style patches.
+
+        :param other: The collection to diff against
+        :param kwargs:
+        :return:
+        """
+        src_objs = self.find(limit=-1).rows
+        tgt_objs = other.find(limit=-1).rows
+        primary_key = self.identifier_attribute_name
+        if not primary_key:
+            raise ValueError(f"No primary key for {self.target_class_name}")
+        return patches_from_objects_lists(src_objs, tgt_objs, primary_key=primary_key)
+
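
diff and apply_patches are designed to round-trip: the src/tgt naming above suggests the returned patches carry this collection to the other. A sketch under that assumption, for two collections sharing an identifier attribute:

    patches = collection_a.diff(collection_b)
    collection_a.apply_patches(patches)
    # collection_a should now match collection_b, so a second diff is empty
    assert collection_a.diff(collection_b) == []
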
+    def iter_validate_collection(
+        self, objects: Optional[Iterable[OBJECT]] = None, **kwargs
+    ) -> Iterator["ValidationResult"]:
+        """
+        Validate the contents of the collection.
+
+        :param objects: objects to validate; defaults to the collection contents
+        :param kwargs:
+        :return: iterator over validation results
+        """
+        from linkml.validator import JsonschemaValidationPlugin, Validator
+
+        validation_plugins = [JsonschemaValidationPlugin(closed=True)]
+        validator = Validator(self.parent.schema_view.schema, validation_plugins=validation_plugins)
+        cd = self.class_definition()
+        if not cd:
+            raise ValueError(f"Cannot find class definition for {self.target_class_name}")
+        type_designator = None
+        for att in self.parent.schema_view.class_induced_slots(cd.name):
+            if att.designates_type:
+                type_designator = att.name
+        class_name = cd.name
+        if objects is None:
+            objects = self.find_iter(**kwargs)
+        for obj in objects:
+            obj = clean_empties(obj)
+            v_class_name = class_name
+            if type_designator is not None:
+                # TODO: move type designator logic to core linkml
+                this_class_name = obj.get(type_designator)
+                if this_class_name:
+                    if ":" in this_class_name:
+                        this_class_name = this_class_name.split(":")[-1]
+                    v_class_name = this_class_name
+            yield from validator.iter_results(obj, v_class_name)
+
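
A sketch of validating explicit objects rather than the stored contents; this assumes a schema in which Person does not declare an "undeclared" slot, so closed-world JSON Schema validation reports it:

    for result in collection.iter_validate_collection([{"id": "P1", "undeclared": 5}]):
        print(result.message)
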
+    def commit(self):
+        """
+        Commit changes to the collection.
+
+        :return:
+        """
+        pass
+
+    def _broadcast(self, *args, **kwargs):
+        self.parent.broadcast(self, *args, **kwargs)
+
+    def store_metadata(self, replace=True):
+        """
+        Store the metadata for the collection.
+        """
+        if not self.metadata:
+            return
+        this_collection_name = self.alias
+        metadata_collection_name = f"{this_collection_name}__metadata"
+        metadata_collection = self.parent.get_collection(metadata_collection_name, create_if_not_exists=True)
+        metadata_dict = self.metadata.model_dump()
+        if replace:
+            metadata_collection.replace(metadata_dict)
+        else:
+            metadata_collection.insert(metadata_dict)