linkml-store: linkml_store-0.1.7-py3-none-any.whl → linkml_store-0.1.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of linkml-store might be problematic. More details were linked from the original page.

Files changed (32):
  1. linkml_store/api/client.py +32 -5
  2. linkml_store/api/collection.py +276 -27
  3. linkml_store/api/config.py +6 -2
  4. linkml_store/api/database.py +264 -21
  5. linkml_store/api/stores/chromadb/__init__.py +5 -1
  6. linkml_store/api/stores/duckdb/__init__.py +9 -0
  7. linkml_store/api/stores/duckdb/duckdb_collection.py +7 -4
  8. linkml_store/api/stores/duckdb/duckdb_database.py +19 -5
  9. linkml_store/api/stores/duckdb/mappings.py +1 -0
  10. linkml_store/api/stores/filesystem/__init__.py +15 -0
  11. linkml_store/api/stores/filesystem/filesystem_collection.py +177 -0
  12. linkml_store/api/stores/filesystem/filesystem_database.py +72 -0
  13. linkml_store/api/stores/hdf5/__init__.py +7 -0
  14. linkml_store/api/stores/mongodb/__init__.py +25 -0
  15. linkml_store/api/stores/mongodb/mongodb_collection.py +31 -10
  16. linkml_store/api/stores/mongodb/mongodb_database.py +13 -2
  17. linkml_store/api/types.py +4 -0
  18. linkml_store/cli.py +150 -15
  19. linkml_store/index/__init__.py +6 -2
  20. linkml_store/index/implementations/llm_indexer.py +83 -5
  21. linkml_store/index/implementations/simple_indexer.py +2 -2
  22. linkml_store/index/indexer.py +32 -8
  23. linkml_store/utils/change_utils.py +17 -0
  24. linkml_store/utils/format_utils.py +139 -8
  25. linkml_store/utils/patch_utils.py +126 -0
  26. linkml_store/utils/query_utils.py +89 -0
  27. {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/METADATA +7 -1
  28. linkml_store-0.1.9.dist-info/RECORD +49 -0
  29. linkml_store-0.1.7.dist-info/RECORD +0 -42
  30. {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/LICENSE +0 -0
  31. {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/WHEEL +0 -0
  32. {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/entry_points.txt +0 -0
@@ -1,3 +1,4 @@
1
+ import logging
1
2
  from pathlib import Path
2
3
  from typing import Dict, Optional, Union
3
4
 
@@ -8,14 +9,19 @@ from linkml_store.api import Database
8
9
  from linkml_store.api.config import ClientConfig
9
10
  from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase
10
11
  from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
12
+ from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
11
13
  from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
12
14
  from linkml_store.api.stores.solr.solr_database import SolrDatabase
13
15
 
16
+ logger = logging.getLogger(__name__)
17
+
18
+
14
19
  HANDLE_MAP = {
15
20
  "duckdb": DuckDBDatabase,
16
21
  "solr": SolrDatabase,
17
22
  "mongodb": MongoDBDatabase,
18
23
  "chromadb": ChromaDBDatabase,
24
+ "file": FileSystemDatabase,
19
25
  }
20
26
 
21
27
 
@@ -23,14 +29,27 @@ class Client:
23
29
  """
24
30
  A client is the top-level object for interacting with databases.
25
31
 
26
- A client has access to one or more :class:`Database` objects.
27
-
28
- Each database consists of a number of :class:`.Collection` objects.
32
+ * A client has access to one or more :class:`.Database` objects.
33
+ * Each database consists of a number of :class:`.Collection` objects.
29
34
 
30
- Examples
31
- --------
35
+ Creating a client
36
+ -----------------
32
37
  >>> client = Client()
38
+
39
+ Attaching a database
40
+ --------------------
33
41
  >>> db = client.attach_database("duckdb", alias="test")
42
+
43
+ Note that normally a handle would be specified by a locator such as ``duckdb:///<PATH>``, but
44
+ for convenience, an in-memory duckdb object can be specified without a full locator
45
+
46
+ We can check the actual handle:
47
+
48
+ >>> db.handle
49
+ 'duckdb:///:memory:'
50
+
51
+ Creating a new collection
52
+ -------------------------
34
53
  >>> collection = db.create_collection("Person")
35
54
  >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
36
55
  >>> collection.insert(objs)
@@ -151,6 +170,8 @@ class Client:
151
170
  if ":" not in handle:
152
171
  scheme = handle
153
172
  handle = None
173
+ if alias is None:
174
+ alias = scheme
154
175
  else:
155
176
  scheme, _ = handle.split(":", 1)
156
177
  if scheme not in HANDLE_MAP:
@@ -165,6 +186,11 @@ class Client:
165
186
  self._databases = {}
166
187
  self._databases[alias] = db
167
188
  db.parent = self
189
+ if db.alias:
190
+ if db.alias != alias:
191
+ raise AssertionError(f"Inconsistent alias: {db.alias} != {alias}")
192
+ else:
193
+ db.metadata.alias = alias
168
194
  return db
169
195
 
170
196
  def get_database(self, name: Optional[str] = None, create_if_not_exists=True, **kwargs) -> Database:
@@ -195,6 +221,7 @@ class Client:
195
221
  self._databases = {}
196
222
  if name not in self._databases:
197
223
  if create_if_not_exists:
224
+ logger.info(f"Creating database: {name}")
198
225
  self.attach_database(name, **kwargs)
199
226
  else:
200
227
  raise ValueError(f"Database {name} does not exist")
@@ -1,16 +1,22 @@
1
+ """A structure for representing collections of similar objects."""
2
+
1
3
  import hashlib
2
4
  import logging
3
5
  from collections import defaultdict
4
6
  from pathlib import Path
5
- from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, TextIO, Type, Union
7
+ from typing import TYPE_CHECKING, Any, Dict, Generic, Iterator, List, Optional, TextIO, Tuple, Type, Union
6
8
 
7
9
  import numpy as np
10
+ from linkml_runtime import SchemaView
8
11
  from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
9
12
  from linkml_runtime.linkml_model.meta import ArrayExpression
10
13
  from pydantic import BaseModel
11
14
 
15
+ from linkml_store.api.types import DatabaseType
12
16
  from linkml_store.index import get_indexer
17
+ from linkml_store.utils.format_utils import load_objects
13
18
  from linkml_store.utils.object_utils import clean_empties
19
+ from linkml_store.utils.patch_utils import PatchDict, apply_patches_to_list, patches_from_objects_lists
14
20
 
15
21
  try:
16
22
  from linkml.validator.report import ValidationResult
@@ -33,17 +39,27 @@ IDENTIFIER = str
33
39
  FIELD_NAME = str
34
40
 
35
41
 
36
- class Collection:
42
+ class Collection(Generic[DatabaseType]):
37
43
  """
38
44
  A collection is an organized set of objects of the same or similar type.
39
45
 
40
46
  - For relational databases, a collection is typically a table
41
47
  - For document databases such as MongoDB, a collection is the native type
42
- - For a file system, a collection could be a single tabular file such as Parquet or CSV
48
+ - For a file system, a collection could be a single tabular file such as Parquet or CSV.
49
+
50
+ Collection objects are typically not created directly - instead they are generated
51
+ from a parent :class:`.Database` object:
52
+
53
+ >>> from linkml_store import Client
54
+ >>> client = Client()
55
+ >>> db = client.attach_database("duckdb", alias="test")
56
+ >>> collection = db.create_collection("Person")
57
+ >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
58
+ >>> collection.insert(objs)
43
59
  """
44
60
 
45
61
  # name: str
46
- parent: Optional["Database"] = None
62
+ parent: Optional[DatabaseType] = None
47
63
  _indexers: Optional[Dict[str, Indexer]] = None
48
64
  # hidden: Optional[bool] = False
49
65
 
@@ -57,15 +73,21 @@ class Collection:
57
73
  self.metadata = metadata
58
74
  else:
59
75
  self.metadata = CollectionConfig(name=name, **kwargs)
60
- if name is not None and self.metadata.name is not None and name != self.metadata.name:
61
- raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")
76
+ if not self.metadata.alias:
77
+ self.metadata.alias = name
78
+ if not self.metadata.type:
79
+ self.metadata.type = name
80
+ # if name is not None and self.metadata.name is not None and name != self.metadata.name:
81
+ # raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")
62
82
 
63
83
  @property
64
84
  def name(self) -> str:
65
85
  """
66
- Return the name of the collection
86
+ Return the name of the collection.
67
87
 
68
- :return:
88
+ TODO: deprecate in favor of Type
89
+
90
+ :return: name of the collection
69
91
  """
70
92
  return self.metadata.name
71
93
 
@@ -79,7 +101,7 @@ class Collection:
79
101
 
80
102
  :return: True if the collection is hidden
81
103
  """
82
- return self.metadata.hidden
104
+ # return self.metadata.hidden
83
105
 
84
106
  @property
85
107
  def target_class_name(self):
@@ -88,7 +110,14 @@ class Collection:
88
110
 
89
111
  This MUST be a LinkML class name
90
112
 
91
- :return:
113
+ >>> from linkml_store import Client
114
+ >>> client = Client()
115
+ >>> db = client.attach_database("duckdb", alias="test")
116
+ >>> collection = db.create_collection("Person", alias="persons")
117
+ >>> collection.target_class_name
118
+ 'Person'
119
+
120
+ :return: name of the class which members of this collection instantiate
92
121
  """
93
122
  # TODO: this is a shim layer until we can normalize on this
94
123
  if self.metadata.type:
@@ -104,15 +133,34 @@ class Collection:
104
133
  to have an alias, for example "persons" which collects all instances
105
134
  of class Person.
106
135
 
107
- The _alias SHOULD be used for Table names in SQL.
136
+ >>> from linkml_store import Client
137
+ >>> client = Client()
138
+ >>> db = client.attach_database("duckdb", alias="test")
139
+ >>> collection = db.create_collection("Person", alias="persons")
140
+ >>> collection.alias
141
+ 'persons'
142
+
143
+ If no explicit alias is provided, then the target class name is used:
144
+
145
+ >>> from linkml_store import Client
146
+ >>> client = Client()
147
+ >>> db = client.attach_database("duckdb", alias="test")
148
+ >>> collection = db.create_collection("Person")
149
+ >>> collection.alias
150
+ 'Person'
151
+
152
+ The alias SHOULD be used for Table names in SQL.
108
153
 
109
154
  For nested data, the alias SHOULD be used as the key; e.g
110
155
 
111
- ``{ "persons": [ { "name": "Alice" }, { "name": "Bob" } ] }``
156
+ .. code-block:: json
157
+
158
+ { "persons": [ { "name": "Alice" }, { "name": "Bob" } ] }
112
159
 
113
160
  :return:
114
161
  """
115
162
  # TODO: this is a shim layer until we can normalize on this
163
+ # TODO: this is a shim layer until we can normalize on this
116
164
  if self.metadata.alias:
117
165
  return self.metadata.alias
118
166
  return self.name
@@ -121,6 +169,13 @@ class Collection:
121
169
  """
122
170
  Replace entire collection with objects.
123
171
 
172
+ >>> from linkml_store import Client
173
+ >>> client = Client()
174
+ >>> db = client.attach_database("duckdb", alias="test")
175
+ >>> collection = db.create_collection("Person")
176
+ >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
177
+ >>> collection.insert(objs)
178
+
124
179
  :param objs:
125
180
  :param kwargs:
126
181
  :return:
@@ -130,7 +185,14 @@ class Collection:
130
185
 
131
186
  def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
132
187
  """
133
- Add one or more objects to the collection
188
+ Add one or more objects to the collection.
189
+
190
+ >>> from linkml_store import Client
191
+ >>> client = Client()
192
+ >>> db = client.attach_database("duckdb", alias="test")
193
+ >>> collection = db.create_collection("Person")
194
+ >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
195
+ >>> collection.insert(objs)
134
196
 
135
197
  :param objs:
136
198
  :param kwargs:
@@ -138,9 +200,36 @@ class Collection:
138
200
  """
139
201
  raise NotImplementedError
140
202
 
141
- def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
203
+ def _post_insert_hook(self, objs: List[OBJECT], **kwargs):
204
+ patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
205
+ self._broadcast(patches, **kwargs)
206
+
207
+ def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
142
208
  """
143
- Delete one or more objects from the collection
209
+ Delete one or more objects from the collection.
210
+
211
+ First let's set up a collection:
212
+
213
+ >>> from linkml_store import Client
214
+ >>> client = Client()
215
+ >>> db = client.attach_database("duckdb", alias="test")
216
+ >>> collection = db.create_collection("Person")
217
+ >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
218
+ >>> collection.insert(objs)
219
+ >>> collection.find({}).num_rows
220
+ 2
221
+
222
+ Now let's delete an object:
223
+
224
+ >>> collection.delete(objs[0])
225
+ >>> collection.find({}).num_rows
226
+ 1
227
+
228
+ Deleting the same object again should have no effect:
229
+
230
+ >>> collection.delete(objs[0])
231
+ >>> collection.find({}).num_rows
232
+ 1
144
233
 
145
234
  :param objs:
146
235
  :param kwargs:
@@ -148,9 +237,30 @@ class Collection:
148
237
  """
149
238
  raise NotImplementedError
150
239
 
151
- def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
240
+ def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
152
241
  """
153
- Delete objects that match a query
242
+ Delete objects that match a query.
243
+
244
+ First let's set up a collection:
245
+
246
+ >>> from linkml_store import Client
247
+ >>> client = Client()
248
+ >>> db = client.attach_database("duckdb", alias="test")
249
+ >>> collection = db.create_collection("Person")
250
+ >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
251
+ >>> collection.insert(objs)
252
+
253
+ Now let's delete an object:
254
+
255
+ >>> collection.delete_where({"id": "P1"})
256
+ >>> collection.find({}).num_rows
257
+ 1
258
+
259
+ Match everything:
260
+
261
+ >>> collection.delete_where({})
262
+ >>> collection.find({}).num_rows
263
+ 0
154
264
 
155
265
  :param where: where conditions
156
266
  :param missing_ok: if True, do not raise an error if the collection does not exist
@@ -161,7 +271,7 @@ class Collection:
161
271
 
162
272
  def update(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
163
273
  """
164
- Update one or more objects in the collection
274
+ Update one or more objects in the collection.
165
275
 
166
276
  :param objs:
167
277
  :param kwargs:
@@ -174,7 +284,21 @@ class Collection:
174
284
 
175
285
  def query(self, query: Query, **kwargs) -> QueryResult:
176
286
  """
177
- Run a query against the collection
287
+ Run a query against the collection.
288
+
289
+ First let's load a collection:
290
+
291
+ >>> from linkml_store import Client
292
+ >>> from linkml_store.utils.format_utils import load_objects
293
+ >>> client = Client()
294
+ >>> db = client.attach_database("duckdb")
295
+ >>> collection = db.create_collection("Country")
296
+ >>> objs = load_objects("tests/input/countries/countries.jsonl")
297
+ >>> collection.insert(objs)
298
+
299
+ Now let's run a query:
300
+
301
+ TODO
178
302
 
179
303
  :param query:
180
304
  :param kwargs:
@@ -184,7 +308,7 @@ class Collection:
184
308
 
185
309
  def query_facets(
186
310
  self, where: Optional[Dict] = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
187
- ) -> Dict[str, Dict[str, int]]:
311
+ ) -> Dict[str, List[Tuple[Any, int]]]:
188
312
  """
189
313
  Run a query to get facet counts for one or more columns.
190
314
 
@@ -202,12 +326,12 @@ class Collection:
202
326
  :param query: A Query object representing the base query.
203
327
  :param facet_columns: A list of column names to get facet counts for.
204
328
  :param facet_limit:
205
- :return: A dictionary where keys are column names and values are pandas DataFrames
329
+ :return: A dictionary where keys are column names and values are tuples
206
330
  containing the facet counts for each unique value in the respective column.
207
331
  """
208
332
  raise NotImplementedError
209
333
 
210
- def get(self, ids: Optional[IDENTIFIER], **kwargs) -> QueryResult:
334
+ def get(self, ids: Optional[List[IDENTIFIER]], **kwargs) -> QueryResult:
211
335
  """
212
336
  Get one or more objects by ID.
213
337
 
@@ -217,6 +341,8 @@ class Collection:
217
341
  """
218
342
  # TODO
219
343
  id_field = self.identifier_attribute_name
344
+ if not id_field:
345
+ raise ValueError(f"No identifier for {self.name}")
220
346
  return self.find({id_field: ids})
221
347
 
222
348
  def get_one(self, id: IDENTIFIER, **kwargs) -> Optional[OBJECT]:
@@ -242,6 +368,31 @@ class Collection:
242
368
  """
243
369
  Find objects in the collection using a where query.
244
370
 
371
+ As an example, first load a collection:
372
+
373
+ >>> from linkml_store import Client
374
+ >>> from linkml_store.utils.format_utils import load_objects
375
+ >>> client = Client()
376
+ >>> db = client.attach_database("duckdb")
377
+ >>> collection = db.create_collection("Country")
378
+ >>> objs = load_objects("tests/input/countries/countries.jsonl")
379
+ >>> collection.insert(objs)
380
+
381
+ Now let's find all objects:
382
+
383
+ >>> qr = collection.find({})
384
+ >>> qr.num_rows
385
+ 20
386
+
387
+ We can do a more restrictive query:
388
+
389
+ >>> qr = collection.find({"code": "FR"})
390
+ >>> qr.num_rows
391
+ 1
392
+ >>> qr.rows[0]["name"]
393
+ 'France'
394
+
395
+
245
396
  :param where:
246
397
  :param kwargs:
247
398
  :return:
@@ -290,6 +441,7 @@ class Collection:
290
441
  raise ValueError(f"No index named {index_name}")
291
442
  qr = ix_coll.find(where=where, limit=-1, **kwargs)
292
443
  index_col = ix.index_field
444
+ # TODO: optimize this for large indexes
293
445
  vector_pairs = [(row, np.array(row[index_col], dtype=float)) for row in qr.rows]
294
446
  results = ix.search(query, vector_pairs, limit=limit)
295
447
  for r in results:
@@ -305,11 +457,15 @@ class Collection:
305
457
 
306
458
  :return:
307
459
  """
308
- if not self.name:
309
- raise ValueError(f"Collection has no name: {self} // {self.metadata}")
310
- return self.name.startswith("internal__")
460
+ if not self.alias:
461
+ raise ValueError(f"Collection has no alias: {self} // {self.metadata}")
462
+ return self.alias.startswith("internal__")
463
+
464
+ def load_from_source(self):
465
+ objects = load_objects(self.metadata.source_location)
466
+ self.insert(objects)
311
467
 
312
- def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = True, auto_index=True, **kwargs):
468
+ def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs):
313
469
  """
314
470
  Attach an index to the collection.
315
471
 
@@ -333,6 +489,7 @@ class Collection:
333
489
  self._indexers[index_name] = index
334
490
  if auto_index:
335
491
  all_objs = self.find(limit=-1).rows
492
+ logger.info(f"Auto-indexing {len(all_objs)} objects")
336
493
  self.index_objects(all_objs, index_name, replace=True, **kwargs)
337
494
 
338
495
  def _index_collection_name(self, index_name: str) -> str:
@@ -340,6 +497,7 @@ class Collection:
340
497
  Create a name for a special collection that holds index data
341
498
 
342
499
  :param index_name:
500
+ :param indexer:
343
501
  :return:
344
502
  """
345
503
  return f"internal__index__{self.name}__{index_name}"
@@ -370,7 +528,9 @@ class Collection:
370
528
  logger.info(f"Checking if {ix_coll_name} is in {schema.classes.keys()}")
371
529
  if ix_coll_name in schema.classes:
372
530
  ix_coll.delete_where()
531
+
373
532
  ix_coll.insert(objects_with_ix, **kwargs)
533
+ ix_coll.commit()
374
534
 
375
535
  def list_index_names(self) -> List[str]:
376
536
  """
@@ -405,12 +565,22 @@ class Collection:
405
565
 
406
566
  :return:
407
567
  """
408
- sv = self.parent.schema_view
568
+ sv: SchemaView = self.parent.schema_view
409
569
  if sv:
410
570
  cls = sv.get_class(self.target_class_name)
571
+ if cls and not cls.attributes:
572
+ if not sv.class_induced_slots(cls.name):
573
+ for att in self._induce_attributes():
574
+ cls.attributes[att.name] = att
575
+ sv.set_modified()
411
576
  return cls
412
577
  return None
413
578
 
579
+ def _induce_attributes(self) -> List[SlotDefinition]:
580
+ result = self.find({}, limit=-1)
581
+ cd = self.induce_class_definition_from_objects(result.rows, max_sample_size=None)
582
+ return list(cd.attributes.values())
583
+
414
584
  @property
415
585
  def identifier_attribute_name(self) -> Optional[str]:
416
586
  """
@@ -427,6 +597,37 @@ class Collection:
427
597
  return att.name
428
598
  return None
429
599
 
600
+ def set_identifier_attribute_name(self, name: str):
601
+ """
602
+ Set the name of the identifier attribute for the collection.
603
+
604
+ AKA the primary key.
605
+
606
+ :param name: The name of the identifier attribute.
607
+ """
608
+ cd = self.class_definition()
609
+ if not cd:
610
+ raise ValueError(f"Cannot find class definition for {self.target_class_name}")
611
+ id_att = None
612
+ candidates = []
613
+ sv: SchemaView = self.parent.schema_view
614
+ cls = sv.get_class(cd.name)
615
+ existing_id_slot = sv.get_identifier_slot(cls.name)
616
+ if existing_id_slot:
617
+ if existing_id_slot.name == name:
618
+ return
619
+ existing_id_slot.identifier = False
620
+ for att in cls.attributes.values():
621
+ candidates.append(att.name)
622
+ if att.name == name:
623
+ att.identifier = True
624
+ id_att = att
625
+ else:
626
+ att.identifier = False
627
+ if not id_att:
628
+ raise ValueError(f"No attribute found with name {name} in {candidates}")
629
+ sv.set_modified()
630
+
430
631
  def object_identifier(self, obj: OBJECT, auto=True) -> Optional[IDENTIFIER]:
431
632
  """
432
633
  Return the identifier for an object.
@@ -457,6 +658,8 @@ class Collection:
457
658
  :param max_sample_size:
458
659
  :return:
459
660
  """
661
+ if not self.target_class_name:
662
+ raise ValueError(f"No target_class_name for {self.alias}")
460
663
  cd = ClassDefinition(self.target_class_name)
461
664
  keys = defaultdict(list)
462
665
  for obj in objs[0:max_sample_size]:
@@ -468,6 +671,8 @@ class Collection:
468
671
  for k, v in obj.items():
469
672
  keys[k].append(v)
470
673
  for k, vs in keys.items():
674
+ if k == "_id":
675
+ continue
471
676
  multivalueds = []
472
677
  inlineds = []
473
678
  rngs = []
@@ -544,6 +749,39 @@ class Collection:
544
749
  """
545
750
  raise NotImplementedError
546
751
 
752
+ def apply_patches(self, patches: List[PatchDict], **kwargs):
753
+ """
754
+ Apply a patch to the collection.
755
+
756
+ Patches conform to the JSON Patch format,
757
+
758
+ :param patches:
759
+ :param kwargs:
760
+ :return:
761
+ """
762
+ all_objs = self.find(limit=-1).rows
763
+ primary_key = self.identifier_attribute_name
764
+ if not primary_key:
765
+ raise ValueError(f"No primary key for {self.target_class_name}")
766
+ new_objs = apply_patches_to_list(all_objs, patches, primary_key=primary_key, **kwargs)
767
+ self.replace(new_objs)
768
+
769
+ def diff(self, other: "Collection", **kwargs):
770
+ """
771
+ Diff two collections.
772
+
773
+ :param other:
774
+ :param kwargs:
775
+ :return:
776
+ """
777
+ src_objs = self.find(limit=-1).rows
778
+ tgt_objs = other.find(limit=-1).rows
779
+ primary_key = self.identifier_attribute_name
780
+ if not primary_key:
781
+ raise ValueError(f"No primary key for {self.target_class_name}")
782
+ patches_from_objects_lists(src_objs, tgt_objs, primary_key=primary_key)
783
+ return patches_from_objects_lists(src_objs, tgt_objs, primary_key=primary_key)
784
+
547
785
  def iter_validate_collection(self, **kwargs) -> Iterator["ValidationResult"]:
548
786
  """
549
787
  Validate the contents of the collection
@@ -563,3 +801,14 @@ class Collection:
563
801
  for obj in result.rows:
564
802
  obj = clean_empties(obj)
565
803
  yield from validator.iter_results(obj, class_name)
804
+
805
+ def commit(self):
806
+ """
807
+ Commit changes to the collection.
808
+
809
+ :return:
810
+ """
811
+ pass
812
+
813
+ def _broadcast(self, *args, **kwargs):
814
+ self.parent.broadcast(self, *args, **kwargs)
@@ -16,7 +16,7 @@ class CollectionConfig(BaseModel):
16
16
  default=None,
17
17
  description="The type of object in the collection. TODO; use this instead of name",
18
18
  )
19
- metadata: Optional[Dict] = Field(
19
+ additional_properties: Optional[Dict] = Field(
20
20
  default=None,
21
21
  description="Optional metadata for the collection",
22
22
  )
@@ -36,6 +36,10 @@ class CollectionConfig(BaseModel):
36
36
  default=False,
37
37
  description="Whether the collection is prepopulated",
38
38
  )
39
+ source_location: Optional[str] = Field(
40
+ default=None,
41
+ description="Filesystem or remote URL that stores the data",
42
+ )
39
43
 
40
44
 
41
45
  class DatabaseConfig(BaseModel):
@@ -55,7 +59,7 @@ class DatabaseConfig(BaseModel):
55
59
  default=None,
56
60
  description="The LinkML schema as a dictionary",
57
61
  )
58
- collections: Dict[str, CollectionConfig] = Field(
62
+ collections: Optional[Dict[str, CollectionConfig]] = Field(
59
63
  default={},
60
64
  description="A dictionary of collection configurations",
61
65
  )