linkml-store 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of linkml-store might be problematic. Click here for more details.
- linkml_store/api/client.py +76 -11
- linkml_store/api/collection.py +223 -40
- linkml_store/api/config.py +59 -9
- linkml_store/api/database.py +45 -27
- linkml_store/api/stores/duckdb/duckdb_collection.py +21 -3
- linkml_store/api/stores/duckdb/duckdb_database.py +36 -3
- linkml_store/api/stores/filesystem/filesystem_collection.py +13 -4
- linkml_store/api/stores/filesystem/filesystem_database.py +10 -1
- linkml_store/api/stores/mongodb/mongodb_collection.py +80 -34
- linkml_store/api/stores/mongodb/mongodb_database.py +1 -36
- linkml_store/api/stores/solr/solr_collection.py +4 -4
- linkml_store/cli.py +44 -18
- linkml_store/index/__init__.py +21 -5
- linkml_store/index/implementations/llm_indexer.py +2 -1
- linkml_store/index/indexer.py +20 -4
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +69 -8
- linkml_store/utils/pandas_utils.py +40 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sql_utils.py +2 -1
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +46 -0
- linkml_store/webapi/main.py +572 -0
- linkml_store-0.1.11.dist-info/METADATA +171 -0
- linkml_store-0.1.11.dist-info/RECORD +60 -0
- {linkml_store-0.1.9.dist-info → linkml_store-0.1.11.dist-info}/entry_points.txt +1 -0
- linkml_store-0.1.9.dist-info/METADATA +0 -61
- linkml_store-0.1.9.dist-info/RECORD +0 -49
- {linkml_store-0.1.9.dist-info → linkml_store-0.1.11.dist-info}/LICENSE +0 -0
- {linkml_store-0.1.9.dist-info → linkml_store-0.1.11.dist-info}/WHEEL +0 -0
linkml_store/api/client.py
CHANGED
|
@@ -98,7 +98,7 @@ class Client:
|
|
|
98
98
|
"""
|
|
99
99
|
return self.metadata.base_dir
|
|
100
100
|
|
|
101
|
-
def from_config(self, config: Union[ClientConfig, str, Path], base_dir=None, **kwargs):
|
|
101
|
+
def from_config(self, config: Union[ClientConfig, dict, str, Path], base_dir=None, **kwargs):
|
|
102
102
|
"""
|
|
103
103
|
Create a client from a configuration.
|
|
104
104
|
|
|
@@ -118,11 +118,13 @@ class Client:
|
|
|
118
118
|
:return:
|
|
119
119
|
|
|
120
120
|
"""
|
|
121
|
+
if isinstance(config, dict):
|
|
122
|
+
config = ClientConfig(**config)
|
|
121
123
|
if isinstance(config, Path):
|
|
122
124
|
config = str(config)
|
|
123
125
|
if isinstance(config, str):
|
|
124
|
-
if not base_dir:
|
|
125
|
-
|
|
126
|
+
# if not base_dir:
|
|
127
|
+
# base_dir = Path(config).parent
|
|
126
128
|
parsed_obj = yaml.safe_load(open(config))
|
|
127
129
|
config = ClientConfig(**parsed_obj)
|
|
128
130
|
self.metadata = config
|
|
@@ -133,8 +135,15 @@ class Client:
|
|
|
133
135
|
|
|
134
136
|
def _initialize_databases(self, **kwargs):
|
|
135
137
|
for name, db_config in self.metadata.databases.items():
|
|
136
|
-
|
|
138
|
+
base_dir = self.base_dir
|
|
139
|
+
logger.info(f"Initializing database: {name}, base_dir: {base_dir}")
|
|
140
|
+
if not base_dir:
|
|
141
|
+
base_dir = Path.cwd()
|
|
142
|
+
logger.info(f"Using current working directory: {base_dir}")
|
|
143
|
+
handle = db_config.handle.format(base_dir=base_dir)
|
|
137
144
|
db_config.handle = handle
|
|
145
|
+
if db_config.schema_location:
|
|
146
|
+
db_config.schema_location = db_config.schema_location.format(base_dir=base_dir)
|
|
138
147
|
db = self.attach_database(handle, alias=name, **kwargs)
|
|
139
148
|
db.from_config(db_config)
|
|
140
149
|
|
|
@@ -233,7 +242,7 @@ class Client:
|
|
|
233
242
|
Return all attached databases
|
|
234
243
|
|
|
235
244
|
Examples
|
|
236
|
-
|
|
245
|
+
|
|
237
246
|
>>> client = Client()
|
|
238
247
|
>>> _ = client.attach_database("duckdb", alias="test1")
|
|
239
248
|
>>> _ = client.attach_database("duckdb", alias="test2")
|
|
@@ -259,25 +268,81 @@ class Client:
|
|
|
259
268
|
"""
|
|
260
269
|
Drop a database.
|
|
261
270
|
|
|
271
|
+
Example (in-memory):
|
|
272
|
+
|
|
273
|
+
>>> client = Client()
|
|
274
|
+
>>> db1 = client.attach_database("duckdb", alias="test1")
|
|
275
|
+
>>> db2 = client.attach_database("duckdb", alias="test2")
|
|
276
|
+
>>> len(client.databases)
|
|
277
|
+
2
|
|
278
|
+
>>> client.drop_database("test1")
|
|
279
|
+
>>> len(client.databases)
|
|
280
|
+
1
|
|
281
|
+
|
|
282
|
+
Databases that persist on disk:
|
|
283
|
+
|
|
284
|
+
>>> client = Client()
|
|
285
|
+
>>> path = Path("tmp/test.db")
|
|
286
|
+
>>> path.parent.mkdir(parents=True, exist_ok=True)
|
|
287
|
+
>>> db = client.attach_database(f"duckdb:///{path}", alias="test")
|
|
288
|
+
>>> len(client.databases)
|
|
289
|
+
1
|
|
290
|
+
>>> db.store({"persons": [{"id": "P1", "name": "John"}]})
|
|
291
|
+
>>> db.commit()
|
|
292
|
+
>>> Path("tmp/test.db").exists()
|
|
293
|
+
True
|
|
294
|
+
>>> client.drop_database("test")
|
|
295
|
+
>>> len(client.databases)
|
|
296
|
+
0
|
|
297
|
+
>>> Path("tmp/test.db").exists()
|
|
298
|
+
False
|
|
299
|
+
|
|
300
|
+
Dropping a non-existent database:
|
|
301
|
+
|
|
302
|
+
>>> client = Client()
|
|
303
|
+
>>> client.drop_database("duckdb:///tmp/made-up1", missing_ok=True)
|
|
304
|
+
>>> client.drop_database("duckdb:///tmp/made-up2", missing_ok=False)
|
|
305
|
+
Traceback (most recent call last):
|
|
306
|
+
...
|
|
307
|
+
ValueError: Database duckdb:///tmp/made-up2 not found
|
|
308
|
+
|
|
262
309
|
:param name:
|
|
263
310
|
:param missing_ok:
|
|
264
311
|
:return:
|
|
265
312
|
"""
|
|
266
|
-
if
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
313
|
+
if self._databases:
|
|
314
|
+
if name in self._databases:
|
|
315
|
+
db = self._databases[name]
|
|
316
|
+
db.drop(**kwargs)
|
|
317
|
+
del self._databases[name]
|
|
318
|
+
else:
|
|
319
|
+
if not missing_ok:
|
|
320
|
+
raise ValueError(f"Database {name} not found")
|
|
270
321
|
else:
|
|
271
|
-
|
|
272
|
-
|
|
322
|
+
db = self.get_database(name, create_if_not_exists=True)
|
|
323
|
+
db.drop(**kwargs)
|
|
273
324
|
|
|
274
325
|
def drop_all_databases(self, **kwargs):
|
|
275
326
|
"""
|
|
276
327
|
Drop all databases.
|
|
277
328
|
|
|
329
|
+
Example (in-memory):
|
|
330
|
+
|
|
331
|
+
>>> client = Client()
|
|
332
|
+
>>> db1 = client.attach_database("duckdb", alias="test1")
|
|
333
|
+
>>> assert "test1" in client.databases
|
|
334
|
+
>>> db2 = client.attach_database("duckdb", alias="test2")
|
|
335
|
+
>>> assert "test2" in client.databases
|
|
336
|
+
>>> client.drop_all_databases()
|
|
337
|
+
>>> len(client.databases)
|
|
338
|
+
0
|
|
339
|
+
|
|
340
|
+
|
|
278
341
|
:param missing_ok:
|
|
279
342
|
:return:
|
|
280
343
|
"""
|
|
344
|
+
if not self._databases:
|
|
345
|
+
return
|
|
281
346
|
for name in list(self._databases.keys()):
|
|
282
347
|
self.drop_database(name, missing_ok=False, **kwargs)
|
|
283
348
|
self._databases = {}
|
linkml_store/api/collection.py
CHANGED
|
@@ -4,7 +4,7 @@ import hashlib
|
|
|
4
4
|
import logging
|
|
5
5
|
from collections import defaultdict
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import TYPE_CHECKING, Any, Dict, Generic, Iterator, List, Optional, TextIO, Tuple, Type, Union
|
|
7
|
+
from typing import TYPE_CHECKING, Any, ClassVar, Dict, Generic, Iterator, List, Optional, TextIO, Tuple, Type, Union
|
|
8
8
|
|
|
9
9
|
import numpy as np
|
|
10
10
|
from linkml_runtime import SchemaView
|
|
@@ -14,7 +14,7 @@ from pydantic import BaseModel
|
|
|
14
14
|
|
|
15
15
|
from linkml_store.api.types import DatabaseType
|
|
16
16
|
from linkml_store.index import get_indexer
|
|
17
|
-
from linkml_store.utils.format_utils import load_objects
|
|
17
|
+
from linkml_store.utils.format_utils import load_objects, load_objects_from_url
|
|
18
18
|
from linkml_store.utils.object_utils import clean_empties
|
|
19
19
|
from linkml_store.utils.patch_utils import PatchDict, apply_patches_to_list, patches_from_objects_lists
|
|
20
20
|
|
|
@@ -61,9 +61,11 @@ class Collection(Generic[DatabaseType]):
|
|
|
61
61
|
# name: str
|
|
62
62
|
parent: Optional[DatabaseType] = None
|
|
63
63
|
_indexers: Optional[Dict[str, Indexer]] = None
|
|
64
|
+
_initialized: Optional[bool] = None
|
|
64
65
|
# hidden: Optional[bool] = False
|
|
65
66
|
|
|
66
67
|
metadata: Optional[CollectionConfig] = None
|
|
68
|
+
default_index_name: ClassVar[str] = "simple"
|
|
67
69
|
|
|
68
70
|
def __init__(
|
|
69
71
|
self, name: str, parent: Optional["Database"] = None, metadata: Optional[CollectionConfig] = None, **kwargs
|
|
@@ -72,7 +74,7 @@ class Collection(Generic[DatabaseType]):
|
|
|
72
74
|
if metadata:
|
|
73
75
|
self.metadata = metadata
|
|
74
76
|
else:
|
|
75
|
-
self.metadata = CollectionConfig(
|
|
77
|
+
self.metadata = CollectionConfig(type=name, **kwargs)
|
|
76
78
|
if not self.metadata.alias:
|
|
77
79
|
self.metadata.alias = name
|
|
78
80
|
if not self.metadata.type:
|
|
@@ -80,17 +82,6 @@ class Collection(Generic[DatabaseType]):
|
|
|
80
82
|
# if name is not None and self.metadata.name is not None and name != self.metadata.name:
|
|
81
83
|
# raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")
|
|
82
84
|
|
|
83
|
-
@property
|
|
84
|
-
def name(self) -> str:
|
|
85
|
-
"""
|
|
86
|
-
Return the name of the collection.
|
|
87
|
-
|
|
88
|
-
TODO: deprecate in favor of Type
|
|
89
|
-
|
|
90
|
-
:return: name of the collection
|
|
91
|
-
"""
|
|
92
|
-
return self.metadata.name
|
|
93
|
-
|
|
94
85
|
@property
|
|
95
86
|
def hidden(self) -> bool:
|
|
96
87
|
"""
|
|
@@ -117,12 +108,18 @@ class Collection(Generic[DatabaseType]):
|
|
|
117
108
|
>>> collection.target_class_name
|
|
118
109
|
'Person'
|
|
119
110
|
|
|
111
|
+
>>> collection = db.create_collection("Organization")
|
|
112
|
+
>>> collection.target_class_name
|
|
113
|
+
'Organization'
|
|
114
|
+
>>> collection.alias
|
|
115
|
+
'Organization'
|
|
116
|
+
|
|
120
117
|
:return: name of the class which members of this collection instantiate
|
|
121
118
|
"""
|
|
122
119
|
# TODO: this is a shim layer until we can normalize on this
|
|
123
120
|
if self.metadata.type:
|
|
124
121
|
return self.metadata.type
|
|
125
|
-
return self.
|
|
122
|
+
return self.alias
|
|
126
123
|
|
|
127
124
|
@property
|
|
128
125
|
def alias(self):
|
|
@@ -160,10 +157,9 @@ class Collection(Generic[DatabaseType]):
|
|
|
160
157
|
:return:
|
|
161
158
|
"""
|
|
162
159
|
# TODO: this is a shim layer until we can normalize on this
|
|
163
|
-
# TODO: this is a shim layer until we can normalize on this
|
|
164
160
|
if self.metadata.alias:
|
|
165
161
|
return self.metadata.alias
|
|
166
|
-
return self.
|
|
162
|
+
return self.target_class_name
|
|
167
163
|
|
|
168
164
|
def replace(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
|
|
169
165
|
"""
|
|
@@ -200,7 +196,14 @@ class Collection(Generic[DatabaseType]):
|
|
|
200
196
|
"""
|
|
201
197
|
raise NotImplementedError
|
|
202
198
|
|
|
199
|
+
def _pre_query_hook(self, query: Optional[Query] = None, **kwargs):
|
|
200
|
+
logger.info(f"Pre-query hook (state: {self._initialized}; Q= {query}")
|
|
201
|
+
if not self._initialized:
|
|
202
|
+
self._materialize_derivations()
|
|
203
|
+
self._initialized = True
|
|
204
|
+
|
|
203
205
|
def _post_insert_hook(self, objs: List[OBJECT], **kwargs):
|
|
206
|
+
self._initialized = True
|
|
204
207
|
patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
|
|
205
208
|
self._broadcast(patches, **kwargs)
|
|
206
209
|
|
|
@@ -304,6 +307,7 @@ class Collection(Generic[DatabaseType]):
|
|
|
304
307
|
:param kwargs:
|
|
305
308
|
:return:
|
|
306
309
|
"""
|
|
310
|
+
self._pre_query_hook()
|
|
307
311
|
return self.parent.query(query, **kwargs)
|
|
308
312
|
|
|
309
313
|
def query_facets(
|
|
@@ -339,7 +343,6 @@ class Collection(Generic[DatabaseType]):
|
|
|
339
343
|
:param kwargs:
|
|
340
344
|
:return:
|
|
341
345
|
"""
|
|
342
|
-
# TODO
|
|
343
346
|
id_field = self.identifier_attribute_name
|
|
344
347
|
if not id_field:
|
|
345
348
|
raise ValueError(f"No identifier for {self.name}")
|
|
@@ -398,9 +401,10 @@ class Collection(Generic[DatabaseType]):
|
|
|
398
401
|
:return:
|
|
399
402
|
"""
|
|
400
403
|
query = self._create_query(where_clause=where)
|
|
404
|
+
self._pre_query_hook(query)
|
|
401
405
|
return self.query(query, **kwargs)
|
|
402
406
|
|
|
403
|
-
def find_iter(self, where: Optional[Any] = None, **kwargs) -> Iterator[OBJECT]:
|
|
407
|
+
def find_iter(self, where: Optional[Any] = None, page_size=100, **kwargs) -> Iterator[OBJECT]:
|
|
404
408
|
"""
|
|
405
409
|
Find objects in the collection using a where query.
|
|
406
410
|
|
|
@@ -408,9 +412,22 @@ class Collection(Generic[DatabaseType]):
|
|
|
408
412
|
:param kwargs:
|
|
409
413
|
:return:
|
|
410
414
|
"""
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
415
|
+
total_rows = None
|
|
416
|
+
offset = 0
|
|
417
|
+
if page_size < 1:
|
|
418
|
+
raise ValueError(f"Invalid page size: {page_size}")
|
|
419
|
+
while True:
|
|
420
|
+
qr = self.find(where=where, offset=offset, limit=page_size, **kwargs)
|
|
421
|
+
if total_rows is None:
|
|
422
|
+
total_rows = qr.num_rows
|
|
423
|
+
if not qr.rows:
|
|
424
|
+
return
|
|
425
|
+
for row in qr.rows:
|
|
426
|
+
yield row
|
|
427
|
+
offset += page_size
|
|
428
|
+
if offset >= total_rows:
|
|
429
|
+
break
|
|
430
|
+
return
|
|
414
431
|
|
|
415
432
|
def search(
|
|
416
433
|
self,
|
|
@@ -421,7 +438,30 @@ class Collection(Generic[DatabaseType]):
|
|
|
421
438
|
**kwargs,
|
|
422
439
|
) -> QueryResult:
|
|
423
440
|
"""
|
|
424
|
-
Search the collection using a
|
|
441
|
+
Search the collection using a text-based index.
|
|
442
|
+
|
|
443
|
+
Example:
|
|
444
|
+
|
|
445
|
+
>>> from linkml_store import Client
|
|
446
|
+
>>> from linkml_store.utils.format_utils import load_objects
|
|
447
|
+
>>> client = Client()
|
|
448
|
+
>>> db = client.attach_database("duckdb")
|
|
449
|
+
>>> collection = db.create_collection("Country")
|
|
450
|
+
>>> objs = load_objects("tests/input/countries/countries.jsonl")
|
|
451
|
+
>>> collection.insert(objs)
|
|
452
|
+
|
|
453
|
+
Now let's index, using the simple trigram-based index
|
|
454
|
+
|
|
455
|
+
>>> index = get_indexer("simple")
|
|
456
|
+
>>> collection.attach_indexer(index)
|
|
457
|
+
|
|
458
|
+
Now let's find all objects:
|
|
459
|
+
|
|
460
|
+
>>> qr = collection.search("France")
|
|
461
|
+
>>> score, top_obj = qr.ranked_rows[0]
|
|
462
|
+
>>> assert score > 0.1
|
|
463
|
+
>>> top_obj["code"]
|
|
464
|
+
'FR'
|
|
425
465
|
|
|
426
466
|
:param query:
|
|
427
467
|
:param where:
|
|
@@ -430,13 +470,20 @@ class Collection(Generic[DatabaseType]):
|
|
|
430
470
|
:param kwargs:
|
|
431
471
|
:return:
|
|
432
472
|
"""
|
|
473
|
+
self._pre_query_hook()
|
|
433
474
|
if index_name is None:
|
|
434
|
-
if len(self.
|
|
435
|
-
index_name = list(self.
|
|
475
|
+
if len(self.indexers) == 1:
|
|
476
|
+
index_name = list(self.indexers.keys())[0]
|
|
436
477
|
else:
|
|
437
|
-
|
|
478
|
+
logger.warning("Multiple indexes found. Using default index.")
|
|
479
|
+
index_name = self.default_index_name
|
|
438
480
|
ix_coll = self.parent.get_collection(self._index_collection_name(index_name))
|
|
439
|
-
|
|
481
|
+
if index_name not in self.indexers:
|
|
482
|
+
ix = get_indexer(index_name)
|
|
483
|
+
if not self._indexers:
|
|
484
|
+
self._indexers = {}
|
|
485
|
+
self._indexers[index_name] = ix
|
|
486
|
+
ix = self.indexers.get(index_name)
|
|
440
487
|
if not ix:
|
|
441
488
|
raise ValueError(f"No index named {index_name}")
|
|
442
489
|
qr = ix_coll.find(where=where, limit=-1, **kwargs)
|
|
@@ -453,7 +500,10 @@ class Collection(Generic[DatabaseType]):
|
|
|
453
500
|
@property
|
|
454
501
|
def is_internal(self) -> bool:
|
|
455
502
|
"""
|
|
456
|
-
Check if the collection is internal
|
|
503
|
+
Check if the collection is internal.
|
|
504
|
+
|
|
505
|
+
Internal collections are hidden by default. Examples of internal collections
|
|
506
|
+
include shadow "index" collections
|
|
457
507
|
|
|
458
508
|
:return:
|
|
459
509
|
"""
|
|
@@ -461,14 +511,136 @@ class Collection(Generic[DatabaseType]):
|
|
|
461
511
|
raise ValueError(f"Collection has no alias: {self} // {self.metadata}")
|
|
462
512
|
return self.alias.startswith("internal__")
|
|
463
513
|
|
|
464
|
-
def
|
|
465
|
-
|
|
514
|
+
def exists(self) -> Optional[bool]:
|
|
515
|
+
"""
|
|
516
|
+
Check if the collection exists.
|
|
517
|
+
|
|
518
|
+
:return:
|
|
519
|
+
"""
|
|
520
|
+
cd = self.class_definition()
|
|
521
|
+
return cd is not None
|
|
522
|
+
|
|
523
|
+
def load_from_source(self, load_if_exists=False):
|
|
524
|
+
"""
|
|
525
|
+
Load objects from the source location.
|
|
526
|
+
|
|
527
|
+
:param load_if_exists:
|
|
528
|
+
:return:
|
|
529
|
+
"""
|
|
530
|
+
if not load_if_exists and self.exists():
|
|
531
|
+
return
|
|
532
|
+
metadata = self.metadata
|
|
533
|
+
if metadata.source:
|
|
534
|
+
source = metadata.source
|
|
535
|
+
kwargs = source.arguments or {}
|
|
536
|
+
if source.local_path:
|
|
537
|
+
objects = load_objects(
|
|
538
|
+
metadata.source.local_path, format=source.format, expected_type=source.expected_type, **kwargs
|
|
539
|
+
)
|
|
540
|
+
elif metadata.source.url:
|
|
541
|
+
objects = load_objects_from_url(
|
|
542
|
+
metadata.source.url, format=source.format, expected_type=source.expected_type, **kwargs
|
|
543
|
+
)
|
|
466
544
|
self.insert(objects)
|
|
467
545
|
|
|
546
|
+
def _check_if_initialized(self) -> bool:
|
|
547
|
+
return self._initialized
|
|
548
|
+
|
|
549
|
+
def _materialize_derivations(self, **kwargs):
|
|
550
|
+
metadata = self.metadata
|
|
551
|
+
if not metadata.derived_from:
|
|
552
|
+
logger.info(f"No metadata for {self.alias}; no derivations")
|
|
553
|
+
return
|
|
554
|
+
if self._check_if_initialized():
|
|
555
|
+
logger.info(f"Already initialized {self.alias}; no derivations")
|
|
556
|
+
return
|
|
557
|
+
parent_db = self.parent
|
|
558
|
+
client = parent_db.parent
|
|
559
|
+
# cd = self.class_definition()
|
|
560
|
+
for derivation in metadata.derived_from:
|
|
561
|
+
# TODO: optimize this; utilize underlying engine
|
|
562
|
+
logger.info(f"Deriving from {derivation}")
|
|
563
|
+
if derivation.database:
|
|
564
|
+
db = client.get_database(derivation.database)
|
|
565
|
+
else:
|
|
566
|
+
db = parent_db
|
|
567
|
+
if derivation.collection:
|
|
568
|
+
coll = db.get_collection(derivation.collection)
|
|
569
|
+
else:
|
|
570
|
+
coll = self
|
|
571
|
+
coll.class_definition()
|
|
572
|
+
source_obj_iter = coll.find_iter(derivation.where or {})
|
|
573
|
+
mappings = derivation.mappings
|
|
574
|
+
if not mappings:
|
|
575
|
+
raise ValueError(f"No mappings for {self.name}")
|
|
576
|
+
target_class_name = self.target_class_name
|
|
577
|
+
from linkml_map.session import Session
|
|
578
|
+
|
|
579
|
+
session = Session()
|
|
580
|
+
session.set_source_schema(db.schema_view.schema)
|
|
581
|
+
session.set_object_transformer(
|
|
582
|
+
{
|
|
583
|
+
"class_derivations": {
|
|
584
|
+
target_class_name: {
|
|
585
|
+
"populated_from": coll.target_class_name,
|
|
586
|
+
"slot_derivations": mappings,
|
|
587
|
+
},
|
|
588
|
+
}
|
|
589
|
+
},
|
|
590
|
+
)
|
|
591
|
+
logger.debug(f"Session Spec: {session.object_transformer}")
|
|
592
|
+
tr_objs = []
|
|
593
|
+
for source_obj in source_obj_iter:
|
|
594
|
+
tr_obj = session.transform(source_obj, source_type=coll.target_class_name)
|
|
595
|
+
tr_objs.append(tr_obj)
|
|
596
|
+
if not tr_objs:
|
|
597
|
+
raise ValueError(f"No objects derived from {coll.name}")
|
|
598
|
+
self.insert(tr_objs)
|
|
599
|
+
self.commit()
|
|
600
|
+
|
|
468
601
|
def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs):
|
|
469
602
|
"""
|
|
470
603
|
Attach an index to the collection.
|
|
471
604
|
|
|
605
|
+
As an example, first let's create a collection in a database:
|
|
606
|
+
|
|
607
|
+
>>> from linkml_store import Client
|
|
608
|
+
>>> from linkml_store.utils.format_utils import load_objects
|
|
609
|
+
>>> client = Client()
|
|
610
|
+
>>> db = client.attach_database("duckdb")
|
|
611
|
+
>>> collection = db.create_collection("Country")
|
|
612
|
+
>>> objs = load_objects("tests/input/countries/countries.jsonl")
|
|
613
|
+
>>> collection.insert(objs)
|
|
614
|
+
|
|
615
|
+
We will create two indexes - one that indexes the whole object
|
|
616
|
+
(default behavior), the other one indexes the name only
|
|
617
|
+
|
|
618
|
+
>>> full_index = get_indexer("simple")
|
|
619
|
+
>>> full_index.name = "full"
|
|
620
|
+
>>> name_index = get_indexer("simple", text_template="{name}")
|
|
621
|
+
>>> name_index.name = "name"
|
|
622
|
+
>>> collection.attach_indexer(full_index)
|
|
623
|
+
>>> collection.attach_indexer(name_index)
|
|
624
|
+
|
|
625
|
+
Now let's find objects using the full index, using the string "France".
|
|
626
|
+
We expect the country France to be the top hit, but the score will
|
|
627
|
+
be less than one because we did not match all fields in the object.
|
|
628
|
+
|
|
629
|
+
>>> qr = collection.search("France", index_name="full")
|
|
630
|
+
>>> score, top_obj = qr.ranked_rows[0]
|
|
631
|
+
>>> assert score > 0.1
|
|
632
|
+
>>> assert score < 0.5
|
|
633
|
+
>>> top_obj["code"]
|
|
634
|
+
'FR'
|
|
635
|
+
|
|
636
|
+
Now using the name index
|
|
637
|
+
|
|
638
|
+
>>> qr = collection.search("France", index_name="name")
|
|
639
|
+
>>> score, top_obj = qr.ranked_rows[0]
|
|
640
|
+
>>> assert score > 0.99
|
|
641
|
+
>>> top_obj["code"]
|
|
642
|
+
'FR'
|
|
643
|
+
|
|
472
644
|
:param index:
|
|
473
645
|
:param name:
|
|
474
646
|
:param auto_index: Automatically index all objects in the collection
|
|
@@ -500,19 +672,22 @@ class Collection(Generic[DatabaseType]):
|
|
|
500
672
|
:param indexer:
|
|
501
673
|
:return:
|
|
502
674
|
"""
|
|
503
|
-
return f"internal__index__{self.
|
|
675
|
+
return f"internal__index__{self.alias}__{index_name}"
|
|
504
676
|
|
|
505
677
|
def index_objects(self, objs: List[OBJECT], index_name: str, replace=False, **kwargs):
|
|
506
678
|
"""
|
|
507
|
-
Index a list of objects
|
|
679
|
+
Index a list of objects using a specified index.
|
|
680
|
+
|
|
681
|
+
By default, the indexed objects will be stored in a shadow
|
|
682
|
+
collection in the same database, with additional fields for the index vector
|
|
508
683
|
|
|
509
684
|
:param objs:
|
|
510
|
-
:param index_name:
|
|
685
|
+
:param index_name: e.g. simple, llm
|
|
511
686
|
:param replace:
|
|
512
687
|
:param kwargs:
|
|
513
688
|
:return:
|
|
514
689
|
"""
|
|
515
|
-
ix = self._indexers.get(index_name)
|
|
690
|
+
ix = self._indexers.get(index_name, None)
|
|
516
691
|
if not ix:
|
|
517
692
|
raise ValueError(f"No index named {index_name}")
|
|
518
693
|
ix_coll_name = self._index_collection_name(index_name)
|
|
@@ -563,6 +738,9 @@ class Collection(Generic[DatabaseType]):
|
|
|
563
738
|
"""
|
|
564
739
|
Return the class definition for the collection.
|
|
565
740
|
|
|
741
|
+
If no schema has been explicitly set, and the native database does not
|
|
742
|
+
have a schema, then a schema will be induced from the objects in the collection.
|
|
743
|
+
|
|
566
744
|
:return:
|
|
567
745
|
"""
|
|
568
746
|
sv: SchemaView = self.parent.schema_view
|
|
@@ -647,7 +825,9 @@ class Collection(Generic[DatabaseType]):
|
|
|
647
825
|
else:
|
|
648
826
|
return None
|
|
649
827
|
|
|
650
|
-
def induce_class_definition_from_objects(
|
|
828
|
+
def induce_class_definition_from_objects(
|
|
829
|
+
self, objs: List[OBJECT], max_sample_size: Optional[int] = None
|
|
830
|
+
) -> ClassDefinition:
|
|
651
831
|
"""
|
|
652
832
|
Induce a class definition from a list of objects.
|
|
653
833
|
|
|
@@ -658,6 +838,9 @@ class Collection(Generic[DatabaseType]):
|
|
|
658
838
|
:param max_sample_size:
|
|
659
839
|
:return:
|
|
660
840
|
"""
|
|
841
|
+
# TODO: use schemaview
|
|
842
|
+
if max_sample_size is None:
|
|
843
|
+
max_sample_size = 10
|
|
661
844
|
if not self.target_class_name:
|
|
662
845
|
raise ValueError(f"No target_class_name for {self.alias}")
|
|
663
846
|
cd = ClassDefinition(self.target_class_name)
|
|
@@ -720,6 +903,7 @@ class Collection(Generic[DatabaseType]):
|
|
|
720
903
|
for other_rng in rngs:
|
|
721
904
|
if rng != other_rng:
|
|
722
905
|
raise ValueError(f"Conflict: {rng} != {other_rng} for {vs}")
|
|
906
|
+
logger.debug(f"Inducing {k} as {rng} {multivalued} {inlined}")
|
|
723
907
|
cd.attributes[k] = SlotDefinition(k, range=rng, multivalued=multivalued, inlined=inlined)
|
|
724
908
|
if exact_dimensions_list:
|
|
725
909
|
array_expr = ArrayExpression(exact_number_dimensions=len(exact_dimensions_list[0]))
|
|
@@ -753,7 +937,7 @@ class Collection(Generic[DatabaseType]):
|
|
|
753
937
|
"""
|
|
754
938
|
Apply a patch to the collection.
|
|
755
939
|
|
|
756
|
-
Patches conform to the JSON Patch format
|
|
940
|
+
Patches conform to the JSON Patch format.
|
|
757
941
|
|
|
758
942
|
:param patches:
|
|
759
943
|
:param kwargs:
|
|
@@ -766,11 +950,11 @@ class Collection(Generic[DatabaseType]):
|
|
|
766
950
|
new_objs = apply_patches_to_list(all_objs, patches, primary_key=primary_key, **kwargs)
|
|
767
951
|
self.replace(new_objs)
|
|
768
952
|
|
|
769
|
-
def diff(self, other: "Collection", **kwargs):
|
|
953
|
+
def diff(self, other: "Collection", **kwargs) -> List[PatchDict]:
|
|
770
954
|
"""
|
|
771
955
|
Diff two collections.
|
|
772
956
|
|
|
773
|
-
:param other:
|
|
957
|
+
:param other: The collection to diff against
|
|
774
958
|
:param kwargs:
|
|
775
959
|
:return:
|
|
776
960
|
"""
|
|
@@ -797,8 +981,7 @@ class Collection(Generic[DatabaseType]):
|
|
|
797
981
|
if not cd:
|
|
798
982
|
raise ValueError(f"Cannot find class definition for {self.target_class_name}")
|
|
799
983
|
class_name = cd.name
|
|
800
|
-
|
|
801
|
-
for obj in result.rows:
|
|
984
|
+
for obj in self.find_iter(**kwargs):
|
|
802
985
|
obj = clean_empties(obj)
|
|
803
986
|
yield from validator.iter_results(obj, class_name)
|
|
804
987
|
|
linkml_store/api/config.py
CHANGED
|
@@ -3,11 +3,44 @@ from typing import Any, Dict, List, Optional
|
|
|
3
3
|
from pydantic import BaseModel, Field
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
class
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
6
|
+
class ConfiguredBaseModel(BaseModel, extra="forbid"):
|
|
7
|
+
"""
|
|
8
|
+
Base class for all configuration models.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
pass
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DerivationConfiguration(ConfiguredBaseModel):
|
|
15
|
+
"""
|
|
16
|
+
Configuration for a derivation
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
database: Optional[str] = None
|
|
20
|
+
collection: Optional[str] = None
|
|
21
|
+
mappings: Optional[Dict[str, Any]] = None
|
|
22
|
+
where: Optional[Dict[str, Any]] = None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class CollectionSource(ConfiguredBaseModel):
|
|
26
|
+
"""
|
|
27
|
+
Metadata about a source
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
url: Optional[str] = None
|
|
31
|
+
local_path: Optional[str] = None
|
|
32
|
+
source_location: Optional[str] = None
|
|
33
|
+
refresh_interval_days: Optional[float] = None
|
|
34
|
+
expected_type: Optional[str] = None
|
|
35
|
+
format: Optional[str] = None
|
|
36
|
+
arguments: Optional[Dict[str, Any]] = None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class CollectionConfig(ConfiguredBaseModel):
|
|
40
|
+
"""
|
|
41
|
+
Configuration for a collection
|
|
42
|
+
"""
|
|
43
|
+
|
|
11
44
|
alias: Optional[str] = Field(
|
|
12
45
|
default=None,
|
|
13
46
|
description="An optional alias for the collection",
|
|
@@ -36,13 +69,22 @@ class CollectionConfig(BaseModel):
|
|
|
36
69
|
default=False,
|
|
37
70
|
description="Whether the collection is prepopulated",
|
|
38
71
|
)
|
|
39
|
-
|
|
72
|
+
source: Optional[CollectionSource] = Field(
|
|
73
|
+
default=None,
|
|
74
|
+
description="Metadata about the source",
|
|
75
|
+
)
|
|
76
|
+
# TODO: derived_from
|
|
77
|
+
derived_from: Optional[List[DerivationConfiguration]] = Field(
|
|
40
78
|
default=None,
|
|
41
|
-
description="
|
|
79
|
+
description="LinkML-Map derivations",
|
|
42
80
|
)
|
|
43
81
|
|
|
44
82
|
|
|
45
|
-
class DatabaseConfig(
|
|
83
|
+
class DatabaseConfig(ConfiguredBaseModel):
|
|
84
|
+
"""
|
|
85
|
+
Configuration for a database
|
|
86
|
+
"""
|
|
87
|
+
|
|
46
88
|
handle: str = Field(
|
|
47
89
|
default="duckdb:///:memory:",
|
|
48
90
|
description="The database handle, e.g., 'duckdb:///:memory:' or 'mongodb://localhost:27017'",
|
|
@@ -86,7 +128,11 @@ class DatabaseConfig(BaseModel):
|
|
|
86
128
|
)
|
|
87
129
|
|
|
88
130
|
|
|
89
|
-
class ClientConfig(
|
|
131
|
+
class ClientConfig(ConfiguredBaseModel):
|
|
132
|
+
"""
|
|
133
|
+
Configuration for a client
|
|
134
|
+
"""
|
|
135
|
+
|
|
90
136
|
handle: Optional[str] = Field(
|
|
91
137
|
default=None,
|
|
92
138
|
description="The client handle",
|
|
@@ -95,6 +141,10 @@ class ClientConfig(BaseModel):
|
|
|
95
141
|
default={},
|
|
96
142
|
description="A dictionary of database configurations",
|
|
97
143
|
)
|
|
144
|
+
default_database: Optional[str] = Field(
|
|
145
|
+
default=None,
|
|
146
|
+
description="The default database",
|
|
147
|
+
)
|
|
98
148
|
schema_path: Optional[str] = Field(
|
|
99
149
|
default=None,
|
|
100
150
|
description="The path to the LinkML schema file",
|