linkml-store 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of linkml-store might be problematic; review the file-level changes listed below for details.

Files changed (35)
  1. linkml_store/api/client.py +76 -11
  2. linkml_store/api/collection.py +223 -40
  3. linkml_store/api/config.py +59 -9
  4. linkml_store/api/database.py +45 -27
  5. linkml_store/api/stores/duckdb/duckdb_collection.py +21 -3
  6. linkml_store/api/stores/duckdb/duckdb_database.py +36 -3
  7. linkml_store/api/stores/filesystem/filesystem_collection.py +13 -4
  8. linkml_store/api/stores/filesystem/filesystem_database.py +10 -1
  9. linkml_store/api/stores/mongodb/mongodb_collection.py +80 -34
  10. linkml_store/api/stores/mongodb/mongodb_database.py +1 -36
  11. linkml_store/api/stores/solr/solr_collection.py +4 -4
  12. linkml_store/cli.py +44 -18
  13. linkml_store/index/__init__.py +21 -5
  14. linkml_store/index/implementations/llm_indexer.py +2 -1
  15. linkml_store/index/indexer.py +20 -4
  16. linkml_store/utils/file_utils.py +37 -0
  17. linkml_store/utils/format_utils.py +69 -8
  18. linkml_store/utils/pandas_utils.py +40 -0
  19. linkml_store/utils/schema_utils.py +23 -0
  20. linkml_store/utils/sql_utils.py +2 -1
  21. linkml_store/webapi/__init__.py +0 -0
  22. linkml_store/webapi/html/__init__.py +3 -0
  23. linkml_store/webapi/html/base.html.j2 +24 -0
  24. linkml_store/webapi/html/collection_details.html.j2 +15 -0
  25. linkml_store/webapi/html/database_details.html.j2 +16 -0
  26. linkml_store/webapi/html/databases.html.j2 +14 -0
  27. linkml_store/webapi/html/generic.html.j2 +46 -0
  28. linkml_store/webapi/main.py +572 -0
  29. linkml_store-0.1.11.dist-info/METADATA +171 -0
  30. linkml_store-0.1.11.dist-info/RECORD +60 -0
  31. {linkml_store-0.1.9.dist-info → linkml_store-0.1.11.dist-info}/entry_points.txt +1 -0
  32. linkml_store-0.1.9.dist-info/METADATA +0 -61
  33. linkml_store-0.1.9.dist-info/RECORD +0 -49
  34. {linkml_store-0.1.9.dist-info → linkml_store-0.1.11.dist-info}/LICENSE +0 -0
  35. {linkml_store-0.1.9.dist-info → linkml_store-0.1.11.dist-info}/WHEEL +0 -0
@@ -98,7 +98,7 @@ class Client:
98
98
  """
99
99
  return self.metadata.base_dir
100
100
 
101
- def from_config(self, config: Union[ClientConfig, str, Path], base_dir=None, **kwargs):
101
+ def from_config(self, config: Union[ClientConfig, dict, str, Path], base_dir=None, **kwargs):
102
102
  """
103
103
  Create a client from a configuration.
104
104
 
@@ -118,11 +118,13 @@ class Client:
118
118
  :return:
119
119
 
120
120
  """
121
+ if isinstance(config, dict):
122
+ config = ClientConfig(**config)
121
123
  if isinstance(config, Path):
122
124
  config = str(config)
123
125
  if isinstance(config, str):
124
- if not base_dir:
125
- base_dir = Path(config).parent
126
+ # if not base_dir:
127
+ # base_dir = Path(config).parent
126
128
  parsed_obj = yaml.safe_load(open(config))
127
129
  config = ClientConfig(**parsed_obj)
128
130
  self.metadata = config
@@ -133,8 +135,15 @@ class Client:
133
135
 
134
136
  def _initialize_databases(self, **kwargs):
135
137
  for name, db_config in self.metadata.databases.items():
136
- handle = db_config.handle.format(base_dir=self.base_dir)
138
+ base_dir = self.base_dir
139
+ logger.info(f"Initializing database: {name}, base_dir: {base_dir}")
140
+ if not base_dir:
141
+ base_dir = Path.cwd()
142
+ logger.info(f"Using current working directory: {base_dir}")
143
+ handle = db_config.handle.format(base_dir=base_dir)
137
144
  db_config.handle = handle
145
+ if db_config.schema_location:
146
+ db_config.schema_location = db_config.schema_location.format(base_dir=base_dir)
138
147
  db = self.attach_database(handle, alias=name, **kwargs)
139
148
  db.from_config(db_config)
140
149
 
@@ -233,7 +242,7 @@ class Client:
233
242
  Return all attached databases
234
243
 
235
244
  Examples
236
- --------
245
+
237
246
  >>> client = Client()
238
247
  >>> _ = client.attach_database("duckdb", alias="test1")
239
248
  >>> _ = client.attach_database("duckdb", alias="test2")
@@ -259,25 +268,81 @@ class Client:
259
268
  """
260
269
  Drop a database.
261
270
 
271
+ Example (in-memory):
272
+
273
+ >>> client = Client()
274
+ >>> db1 = client.attach_database("duckdb", alias="test1")
275
+ >>> db2 = client.attach_database("duckdb", alias="test2")
276
+ >>> len(client.databases)
277
+ 2
278
+ >>> client.drop_database("test1")
279
+ >>> len(client.databases)
280
+ 1
281
+
282
+ Databases that persist on disk:
283
+
284
+ >>> client = Client()
285
+ >>> path = Path("tmp/test.db")
286
+ >>> path.parent.mkdir(parents=True, exist_ok=True)
287
+ >>> db = client.attach_database(f"duckdb:///{path}", alias="test")
288
+ >>> len(client.databases)
289
+ 1
290
+ >>> db.store({"persons": [{"id": "P1", "name": "John"}]})
291
+ >>> db.commit()
292
+ >>> Path("tmp/test.db").exists()
293
+ True
294
+ >>> client.drop_database("test")
295
+ >>> len(client.databases)
296
+ 0
297
+ >>> Path("tmp/test.db").exists()
298
+ False
299
+
300
+ Dropping a non-existent database:
301
+
302
+ >>> client = Client()
303
+ >>> client.drop_database("duckdb:///tmp/made-up1", missing_ok=True)
304
+ >>> client.drop_database("duckdb:///tmp/made-up2", missing_ok=False)
305
+ Traceback (most recent call last):
306
+ ...
307
+ ValueError: Database duckdb:///tmp/made-up2 not found
308
+
262
309
  :param name:
263
310
  :param missing_ok:
264
311
  :return:
265
312
  """
266
- if name in self._databases:
267
- db = self._databases[name]
268
- db.drop(**kwargs)
269
- del self._databases[name]
313
+ if self._databases:
314
+ if name in self._databases:
315
+ db = self._databases[name]
316
+ db.drop(**kwargs)
317
+ del self._databases[name]
318
+ else:
319
+ if not missing_ok:
320
+ raise ValueError(f"Database {name} not found")
270
321
  else:
271
- if not missing_ok:
272
- raise ValueError(f"Database {name} not found")
322
+ db = self.get_database(name, create_if_not_exists=True)
323
+ db.drop(**kwargs)
273
324
 
274
325
  def drop_all_databases(self, **kwargs):
275
326
  """
276
327
  Drop all databases.
277
328
 
329
+ Example (in-memory):
330
+
331
+ >>> client = Client()
332
+ >>> db1 = client.attach_database("duckdb", alias="test1")
333
+ >>> assert "test1" in client.databases
334
+ >>> db2 = client.attach_database("duckdb", alias="test2")
335
+ >>> assert "test2" in client.databases
336
+ >>> client.drop_all_databases()
337
+ >>> len(client.databases)
338
+ 0
339
+
340
+
278
341
  :param missing_ok:
279
342
  :return:
280
343
  """
344
+ if not self._databases:
345
+ return
281
346
  for name in list(self._databases.keys()):
282
347
  self.drop_database(name, missing_ok=False, **kwargs)
283
348
  self._databases = {}
@@ -4,7 +4,7 @@ import hashlib
4
4
  import logging
5
5
  from collections import defaultdict
6
6
  from pathlib import Path
7
- from typing import TYPE_CHECKING, Any, Dict, Generic, Iterator, List, Optional, TextIO, Tuple, Type, Union
7
+ from typing import TYPE_CHECKING, Any, ClassVar, Dict, Generic, Iterator, List, Optional, TextIO, Tuple, Type, Union
8
8
 
9
9
  import numpy as np
10
10
  from linkml_runtime import SchemaView
@@ -14,7 +14,7 @@ from pydantic import BaseModel
14
14
 
15
15
  from linkml_store.api.types import DatabaseType
16
16
  from linkml_store.index import get_indexer
17
- from linkml_store.utils.format_utils import load_objects
17
+ from linkml_store.utils.format_utils import load_objects, load_objects_from_url
18
18
  from linkml_store.utils.object_utils import clean_empties
19
19
  from linkml_store.utils.patch_utils import PatchDict, apply_patches_to_list, patches_from_objects_lists
20
20
 
@@ -61,9 +61,11 @@ class Collection(Generic[DatabaseType]):
61
61
  # name: str
62
62
  parent: Optional[DatabaseType] = None
63
63
  _indexers: Optional[Dict[str, Indexer]] = None
64
+ _initialized: Optional[bool] = None
64
65
  # hidden: Optional[bool] = False
65
66
 
66
67
  metadata: Optional[CollectionConfig] = None
68
+ default_index_name: ClassVar[str] = "simple"
67
69
 
68
70
  def __init__(
69
71
  self, name: str, parent: Optional["Database"] = None, metadata: Optional[CollectionConfig] = None, **kwargs
@@ -72,7 +74,7 @@ class Collection(Generic[DatabaseType]):
72
74
  if metadata:
73
75
  self.metadata = metadata
74
76
  else:
75
- self.metadata = CollectionConfig(name=name, **kwargs)
77
+ self.metadata = CollectionConfig(type=name, **kwargs)
76
78
  if not self.metadata.alias:
77
79
  self.metadata.alias = name
78
80
  if not self.metadata.type:
@@ -80,17 +82,6 @@ class Collection(Generic[DatabaseType]):
80
82
  # if name is not None and self.metadata.name is not None and name != self.metadata.name:
81
83
  # raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")
82
84
 
83
- @property
84
- def name(self) -> str:
85
- """
86
- Return the name of the collection.
87
-
88
- TODO: deprecate in favor of Type
89
-
90
- :return: name of the collection
91
- """
92
- return self.metadata.name
93
-
94
85
  @property
95
86
  def hidden(self) -> bool:
96
87
  """
@@ -117,12 +108,18 @@ class Collection(Generic[DatabaseType]):
117
108
  >>> collection.target_class_name
118
109
  'Person'
119
110
 
111
+ >>> collection = db.create_collection("Organization")
112
+ >>> collection.target_class_name
113
+ 'Organization'
114
+ >>> collection.alias
115
+ 'Organization'
116
+
120
117
  :return: name of the class which members of this collection instantiate
121
118
  """
122
119
  # TODO: this is a shim layer until we can normalize on this
123
120
  if self.metadata.type:
124
121
  return self.metadata.type
125
- return self.name
122
+ return self.alias
126
123
 
127
124
  @property
128
125
  def alias(self):
@@ -160,10 +157,9 @@ class Collection(Generic[DatabaseType]):
160
157
  :return:
161
158
  """
162
159
  # TODO: this is a shim layer until we can normalize on this
163
- # TODO: this is a shim layer until we can normalize on this
164
160
  if self.metadata.alias:
165
161
  return self.metadata.alias
166
- return self.name
162
+ return self.target_class_name
167
163
 
168
164
  def replace(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
169
165
  """
@@ -200,7 +196,14 @@ class Collection(Generic[DatabaseType]):
200
196
  """
201
197
  raise NotImplementedError
202
198
 
199
+ def _pre_query_hook(self, query: Optional[Query] = None, **kwargs):
200
+ logger.info(f"Pre-query hook (state: {self._initialized}; Q= {query}")
201
+ if not self._initialized:
202
+ self._materialize_derivations()
203
+ self._initialized = True
204
+
203
205
  def _post_insert_hook(self, objs: List[OBJECT], **kwargs):
206
+ self._initialized = True
204
207
  patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
205
208
  self._broadcast(patches, **kwargs)
206
209
 
@@ -304,6 +307,7 @@ class Collection(Generic[DatabaseType]):
304
307
  :param kwargs:
305
308
  :return:
306
309
  """
310
+ self._pre_query_hook()
307
311
  return self.parent.query(query, **kwargs)
308
312
 
309
313
  def query_facets(
@@ -339,7 +343,6 @@ class Collection(Generic[DatabaseType]):
339
343
  :param kwargs:
340
344
  :return:
341
345
  """
342
- # TODO
343
346
  id_field = self.identifier_attribute_name
344
347
  if not id_field:
345
348
  raise ValueError(f"No identifier for {self.name}")
@@ -398,9 +401,10 @@ class Collection(Generic[DatabaseType]):
398
401
  :return:
399
402
  """
400
403
  query = self._create_query(where_clause=where)
404
+ self._pre_query_hook(query)
401
405
  return self.query(query, **kwargs)
402
406
 
403
- def find_iter(self, where: Optional[Any] = None, **kwargs) -> Iterator[OBJECT]:
407
+ def find_iter(self, where: Optional[Any] = None, page_size=100, **kwargs) -> Iterator[OBJECT]:
404
408
  """
405
409
  Find objects in the collection using a where query.
406
410
 
@@ -408,9 +412,22 @@ class Collection(Generic[DatabaseType]):
408
412
  :param kwargs:
409
413
  :return:
410
414
  """
411
- qr = self.find(where=where, limit=-1, **kwargs)
412
- for row in qr.rows:
413
- yield row
415
+ total_rows = None
416
+ offset = 0
417
+ if page_size < 1:
418
+ raise ValueError(f"Invalid page size: {page_size}")
419
+ while True:
420
+ qr = self.find(where=where, offset=offset, limit=page_size, **kwargs)
421
+ if total_rows is None:
422
+ total_rows = qr.num_rows
423
+ if not qr.rows:
424
+ return
425
+ for row in qr.rows:
426
+ yield row
427
+ offset += page_size
428
+ if offset >= total_rows:
429
+ break
430
+ return
414
431
 
415
432
  def search(
416
433
  self,
@@ -421,7 +438,30 @@ class Collection(Generic[DatabaseType]):
421
438
  **kwargs,
422
439
  ) -> QueryResult:
423
440
  """
424
- Search the collection using a full-text search index.
441
+ Search the collection using a text-based index index.
442
+
443
+ Example:
444
+
445
+ >>> from linkml_store import Client
446
+ >>> from linkml_store.utils.format_utils import load_objects
447
+ >>> client = Client()
448
+ >>> db = client.attach_database("duckdb")
449
+ >>> collection = db.create_collection("Country")
450
+ >>> objs = load_objects("tests/input/countries/countries.jsonl")
451
+ >>> collection.insert(objs)
452
+
453
+ Now let's index, using the simple trigram-based index
454
+
455
+ >>> index = get_indexer("simple")
456
+ >>> collection.attach_indexer(index)
457
+
458
+ Now let's find all objects:
459
+
460
+ >>> qr = collection.search("France")
461
+ >>> score, top_obj = qr.ranked_rows[0]
462
+ >>> assert score > 0.1
463
+ >>> top_obj["code"]
464
+ 'FR'
425
465
 
426
466
  :param query:
427
467
  :param where:
@@ -430,13 +470,20 @@ class Collection(Generic[DatabaseType]):
430
470
  :param kwargs:
431
471
  :return:
432
472
  """
473
+ self._pre_query_hook()
433
474
  if index_name is None:
434
- if len(self._indexers) == 1:
435
- index_name = list(self._indexers.keys())[0]
475
+ if len(self.indexers) == 1:
476
+ index_name = list(self.indexers.keys())[0]
436
477
  else:
437
- raise ValueError("Multiple indexes found. Please specify an index name.")
478
+ logger.warning("Multiple indexes found. Using default index.")
479
+ index_name = self.default_index_name
438
480
  ix_coll = self.parent.get_collection(self._index_collection_name(index_name))
439
- ix = self._indexers.get(index_name)
481
+ if index_name not in self.indexers:
482
+ ix = get_indexer(index_name)
483
+ if not self._indexers:
484
+ self._indexers = {}
485
+ self._indexers[index_name] = ix
486
+ ix = self.indexers.get(index_name)
440
487
  if not ix:
441
488
  raise ValueError(f"No index named {index_name}")
442
489
  qr = ix_coll.find(where=where, limit=-1, **kwargs)
@@ -453,7 +500,10 @@ class Collection(Generic[DatabaseType]):
453
500
  @property
454
501
  def is_internal(self) -> bool:
455
502
  """
456
- Check if the collection is internal
503
+ Check if the collection is internal.
504
+
505
+ Internal collections are hidden by default. Examples of internal collections
506
+ include shadow "index" collections
457
507
 
458
508
  :return:
459
509
  """
@@ -461,14 +511,136 @@ class Collection(Generic[DatabaseType]):
461
511
  raise ValueError(f"Collection has no alias: {self} // {self.metadata}")
462
512
  return self.alias.startswith("internal__")
463
513
 
464
- def load_from_source(self):
465
- objects = load_objects(self.metadata.source_location)
514
+ def exists(self) -> Optional[bool]:
515
+ """
516
+ Check if the collection exists.
517
+
518
+ :return:
519
+ """
520
+ cd = self.class_definition()
521
+ return cd is not None
522
+
523
+ def load_from_source(self, load_if_exists=False):
524
+ """
525
+ Load objects from the source location.
526
+
527
+ :param load_if_exists:
528
+ :return:
529
+ """
530
+ if not load_if_exists and self.exists():
531
+ return
532
+ metadata = self.metadata
533
+ if metadata.source:
534
+ source = metadata.source
535
+ kwargs = source.arguments or {}
536
+ if source.local_path:
537
+ objects = load_objects(
538
+ metadata.source.local_path, format=source.format, expected_type=source.expected_type, **kwargs
539
+ )
540
+ elif metadata.source.url:
541
+ objects = load_objects_from_url(
542
+ metadata.source.url, format=source.format, expected_type=source.expected_type, **kwargs
543
+ )
466
544
  self.insert(objects)
467
545
 
546
+ def _check_if_initialized(self) -> bool:
547
+ return self._initialized
548
+
549
+ def _materialize_derivations(self, **kwargs):
550
+ metadata = self.metadata
551
+ if not metadata.derived_from:
552
+ logger.info(f"No metadata for {self.alias}; no derivations")
553
+ return
554
+ if self._check_if_initialized():
555
+ logger.info(f"Already initialized {self.alias}; no derivations")
556
+ return
557
+ parent_db = self.parent
558
+ client = parent_db.parent
559
+ # cd = self.class_definition()
560
+ for derivation in metadata.derived_from:
561
+ # TODO: optimize this; utilize underlying engine
562
+ logger.info(f"Deriving from {derivation}")
563
+ if derivation.database:
564
+ db = client.get_database(derivation.database)
565
+ else:
566
+ db = parent_db
567
+ if derivation.collection:
568
+ coll = db.get_collection(derivation.collection)
569
+ else:
570
+ coll = self
571
+ coll.class_definition()
572
+ source_obj_iter = coll.find_iter(derivation.where or {})
573
+ mappings = derivation.mappings
574
+ if not mappings:
575
+ raise ValueError(f"No mappings for {self.name}")
576
+ target_class_name = self.target_class_name
577
+ from linkml_map.session import Session
578
+
579
+ session = Session()
580
+ session.set_source_schema(db.schema_view.schema)
581
+ session.set_object_transformer(
582
+ {
583
+ "class_derivations": {
584
+ target_class_name: {
585
+ "populated_from": coll.target_class_name,
586
+ "slot_derivations": mappings,
587
+ },
588
+ }
589
+ },
590
+ )
591
+ logger.debug(f"Session Spec: {session.object_transformer}")
592
+ tr_objs = []
593
+ for source_obj in source_obj_iter:
594
+ tr_obj = session.transform(source_obj, source_type=coll.target_class_name)
595
+ tr_objs.append(tr_obj)
596
+ if not tr_objs:
597
+ raise ValueError(f"No objects derived from {coll.name}")
598
+ self.insert(tr_objs)
599
+ self.commit()
600
+
468
601
  def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs):
469
602
  """
470
603
  Attach an index to the collection.
471
604
 
605
+ As an example, first let's create a collection in a database:
606
+
607
+ >>> from linkml_store import Client
608
+ >>> from linkml_store.utils.format_utils import load_objects
609
+ >>> client = Client()
610
+ >>> db = client.attach_database("duckdb")
611
+ >>> collection = db.create_collection("Country")
612
+ >>> objs = load_objects("tests/input/countries/countries.jsonl")
613
+ >>> collection.insert(objs)
614
+
615
+ We will create two indexes - one that indexes the whole object
616
+ (default behavior), the other one indexes the name only
617
+
618
+ >>> full_index = get_indexer("simple")
619
+ >>> full_index.name = "full"
620
+ >>> name_index = get_indexer("simple", text_template="{name}")
621
+ >>> name_index.name = "name"
622
+ >>> collection.attach_indexer(full_index)
623
+ >>> collection.attach_indexer(name_index)
624
+
625
+ Now let's find objects using the full index, using the string "France".
626
+ We expect the country France to be the top hit, but the score will
627
+ be less than zero because we did not match all fields in the object.
628
+
629
+ >>> qr = collection.search("France", index_name="full")
630
+ >>> score, top_obj = qr.ranked_rows[0]
631
+ >>> assert score > 0.1
632
+ >>> assert score < 0.5
633
+ >>> top_obj["code"]
634
+ 'FR'
635
+
636
+ Now using the name index
637
+
638
+ >>> qr = collection.search("France", index_name="name")
639
+ >>> score, top_obj = qr.ranked_rows[0]
640
+ >>> assert score > 0.99
641
+ >>> top_obj["code"]
642
+ 'FR'
643
+
472
644
  :param index:
473
645
  :param name:
474
646
  :param auto_index: Automatically index all objects in the collection
@@ -500,19 +672,22 @@ class Collection(Generic[DatabaseType]):
500
672
  :param indexer:
501
673
  :return:
502
674
  """
503
- return f"internal__index__{self.name}__{index_name}"
675
+ return f"internal__index__{self.alias}__{index_name}"
504
676
 
505
677
  def index_objects(self, objs: List[OBJECT], index_name: str, replace=False, **kwargs):
506
678
  """
507
- Index a list of objects
679
+ Index a list of objects using a specified index.
680
+
681
+ By default, the indexed objects will be stored in a shadow
682
+ collection in the same database, with additional fields for the index vector
508
683
 
509
684
  :param objs:
510
- :param index_name:
685
+ :param index_name: e.g. simple, llm
511
686
  :param replace:
512
687
  :param kwargs:
513
688
  :return:
514
689
  """
515
- ix = self._indexers.get(index_name)
690
+ ix = self._indexers.get(index_name, None)
516
691
  if not ix:
517
692
  raise ValueError(f"No index named {index_name}")
518
693
  ix_coll_name = self._index_collection_name(index_name)
@@ -563,6 +738,9 @@ class Collection(Generic[DatabaseType]):
563
738
  """
564
739
  Return the class definition for the collection.
565
740
 
741
+ If no schema has been explicitly set, and the native database does not
742
+ have a schema, then a schema will be induced from the objects in the collection.
743
+
566
744
  :return:
567
745
  """
568
746
  sv: SchemaView = self.parent.schema_view
@@ -647,7 +825,9 @@ class Collection(Generic[DatabaseType]):
647
825
  else:
648
826
  return None
649
827
 
650
- def induce_class_definition_from_objects(self, objs: List[OBJECT], max_sample_size=10) -> ClassDefinition:
828
+ def induce_class_definition_from_objects(
829
+ self, objs: List[OBJECT], max_sample_size: Optional[int] = None
830
+ ) -> ClassDefinition:
651
831
  """
652
832
  Induce a class definition from a list of objects.
653
833
 
@@ -658,6 +838,9 @@ class Collection(Generic[DatabaseType]):
658
838
  :param max_sample_size:
659
839
  :return:
660
840
  """
841
+ # TODO: use schemaview
842
+ if max_sample_size is None:
843
+ max_sample_size = 10
661
844
  if not self.target_class_name:
662
845
  raise ValueError(f"No target_class_name for {self.alias}")
663
846
  cd = ClassDefinition(self.target_class_name)
@@ -720,6 +903,7 @@ class Collection(Generic[DatabaseType]):
720
903
  for other_rng in rngs:
721
904
  if rng != other_rng:
722
905
  raise ValueError(f"Conflict: {rng} != {other_rng} for {vs}")
906
+ logger.debug(f"Inducing {k} as {rng} {multivalued} {inlined}")
723
907
  cd.attributes[k] = SlotDefinition(k, range=rng, multivalued=multivalued, inlined=inlined)
724
908
  if exact_dimensions_list:
725
909
  array_expr = ArrayExpression(exact_number_dimensions=len(exact_dimensions_list[0]))
@@ -753,7 +937,7 @@ class Collection(Generic[DatabaseType]):
753
937
  """
754
938
  Apply a patch to the collection.
755
939
 
756
- Patches conform to the JSON Patch format,
940
+ Patches conform to the JSON Patch format.
757
941
 
758
942
  :param patches:
759
943
  :param kwargs:
@@ -766,11 +950,11 @@ class Collection(Generic[DatabaseType]):
766
950
  new_objs = apply_patches_to_list(all_objs, patches, primary_key=primary_key, **kwargs)
767
951
  self.replace(new_objs)
768
952
 
769
- def diff(self, other: "Collection", **kwargs):
953
+ def diff(self, other: "Collection", **kwargs) -> List[PatchDict]:
770
954
  """
771
955
  Diff two collections.
772
956
 
773
- :param other:
957
+ :param other: The collection to diff against
774
958
  :param kwargs:
775
959
  :return:
776
960
  """
@@ -797,8 +981,7 @@ class Collection(Generic[DatabaseType]):
797
981
  if not cd:
798
982
  raise ValueError(f"Cannot find class definition for {self.target_class_name}")
799
983
  class_name = cd.name
800
- result = self.find(**kwargs)
801
- for obj in result.rows:
984
+ for obj in self.find_iter(**kwargs):
802
985
  obj = clean_empties(obj)
803
986
  yield from validator.iter_results(obj, class_name)
804
987
 
@@ -3,11 +3,44 @@ from typing import Any, Dict, List, Optional
3
3
  from pydantic import BaseModel, Field
4
4
 
5
5
 
6
- class CollectionConfig(BaseModel):
7
- name: Optional[str] = Field(
8
- default=None,
9
- description="An optional name for the collection",
10
- )
6
+ class ConfiguredBaseModel(BaseModel, extra="forbid"):
7
+ """
8
+ Base class for all configuration models.
9
+ """
10
+
11
+ pass
12
+
13
+
14
+ class DerivationConfiguration(ConfiguredBaseModel):
15
+ """
16
+ Configuration for a derivation
17
+ """
18
+
19
+ database: Optional[str] = None
20
+ collection: Optional[str] = None
21
+ mappings: Optional[Dict[str, Any]] = None
22
+ where: Optional[Dict[str, Any]] = None
23
+
24
+
25
+ class CollectionSource(ConfiguredBaseModel):
26
+ """
27
+ Metadata about a source
28
+ """
29
+
30
+ url: Optional[str] = None
31
+ local_path: Optional[str] = None
32
+ source_location: Optional[str] = None
33
+ refresh_interval_days: Optional[float] = None
34
+ expected_type: Optional[str] = None
35
+ format: Optional[str] = None
36
+ arguments: Optional[Dict[str, Any]] = None
37
+
38
+
39
+ class CollectionConfig(ConfiguredBaseModel):
40
+ """
41
+ Configuration for a collection
42
+ """
43
+
11
44
  alias: Optional[str] = Field(
12
45
  default=None,
13
46
  description="An optional alias for the collection",
@@ -36,13 +69,22 @@ class CollectionConfig(BaseModel):
36
69
  default=False,
37
70
  description="Whether the collection is prepopulated",
38
71
  )
39
- source_location: Optional[str] = Field(
72
+ source: Optional[CollectionSource] = Field(
73
+ default=None,
74
+ description="Metadata about the source",
75
+ )
76
+ # TODO: derived_from
77
+ derived_from: Optional[List[DerivationConfiguration]] = Field(
40
78
  default=None,
41
- description="Filesystem or remote URL that stores the data",
79
+ description="LinkML-Map derivations",
42
80
  )
43
81
 
44
82
 
45
- class DatabaseConfig(BaseModel):
83
+ class DatabaseConfig(ConfiguredBaseModel):
84
+ """
85
+ Configuration for a database
86
+ """
87
+
46
88
  handle: str = Field(
47
89
  default="duckdb:///:memory:",
48
90
  description="The database handle, e.g., 'duckdb:///:memory:' or 'mongodb://localhost:27017'",
@@ -86,7 +128,11 @@ class DatabaseConfig(BaseModel):
86
128
  )
87
129
 
88
130
 
89
- class ClientConfig(BaseModel):
131
+ class ClientConfig(ConfiguredBaseModel):
132
+ """
133
+ Configuration for a client
134
+ """
135
+
90
136
  handle: Optional[str] = Field(
91
137
  default=None,
92
138
  description="The client handle",
@@ -95,6 +141,10 @@ class ClientConfig(BaseModel):
95
141
  default={},
96
142
  description="A dictionary of database configurations",
97
143
  )
144
+ default_database: Optional[str] = Field(
145
+ default=None,
146
+ description="The default database",
147
+ )
98
148
  schema_path: Optional[str] = Field(
99
149
  default=None,
100
150
  description="The path to the LinkML schema file",