linkml-store 0.0.0__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of linkml-store might be problematic. Click here for more details.

Files changed (37)
  1. linkml_store/api/__init__.py +2 -2
  2. linkml_store/api/client.py +113 -8
  3. linkml_store/api/collection.py +272 -34
  4. linkml_store/api/config.py +101 -0
  5. linkml_store/api/database.py +282 -18
  6. linkml_store/api/queries.py +12 -1
  7. linkml_store/api/stores/chromadb/__init__.py +3 -0
  8. linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
  9. linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
  10. linkml_store/api/stores/duckdb/__init__.py +7 -0
  11. linkml_store/api/stores/duckdb/duckdb_collection.py +47 -14
  12. linkml_store/api/stores/duckdb/duckdb_database.py +38 -47
  13. linkml_store/api/stores/hdf5/__init__.py +0 -0
  14. linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
  15. linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
  16. linkml_store/api/stores/mongodb/mongodb_collection.py +92 -40
  17. linkml_store/api/stores/mongodb/mongodb_database.py +58 -67
  18. linkml_store/api/stores/solr/__init__.py +3 -0
  19. linkml_store/api/stores/solr/solr_collection.py +133 -0
  20. linkml_store/api/stores/solr/solr_database.py +83 -0
  21. linkml_store/api/stores/solr/solr_utils.py +0 -0
  22. linkml_store/cli.py +369 -0
  23. linkml_store/index/__init__.py +33 -0
  24. linkml_store/index/implementations/{llm_index.py → llm_indexer.py} +2 -2
  25. linkml_store/index/implementations/{simple_index.py → simple_indexer.py} +6 -3
  26. linkml_store/index/{index.py → indexer.py} +7 -4
  27. linkml_store/utils/format_utils.py +93 -0
  28. linkml_store/utils/object_utils.py +81 -0
  29. linkml_store/utils/sql_utils.py +46 -7
  30. {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/METADATA +17 -6
  31. linkml_store-0.1.7.dist-info/RECORD +42 -0
  32. linkml_store-0.1.7.dist-info/entry_points.txt +3 -0
  33. linkml_store/api/metadata.py +0 -5
  34. linkml_store-0.0.0.dist-info/RECORD +0 -29
  35. linkml_store-0.0.0.dist-info/entry_points.txt +0 -3
  36. {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/LICENSE +0 -0
  37. {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/WHEEL +0 -0
@@ -1,8 +1,8 @@
1
1
  # flake8: noqa: E402
2
2
  from linkml_store.api.collection import Collection
3
3
  from linkml_store.api.database import Database
4
- from linkml_store.api.metadata import MetaData
5
4
  from linkml_store.api.client import Client
5
+
6
6
  # flake8: noqa
7
7
 
8
- __all__ = ["Client", "Database", "MetaData", "Collection"]
8
+ __all__ = ["Client", "Database", "Collection"]
@@ -1,20 +1,31 @@
1
- from dataclasses import dataclass
2
- from typing import Dict, Optional
1
+ from pathlib import Path
2
+ from typing import Dict, Optional, Union
3
3
 
4
+ import yaml
4
5
  from linkml_runtime import SchemaView
5
6
 
6
7
  from linkml_store.api import Database
8
+ from linkml_store.api.config import ClientConfig
9
+ from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase
7
10
  from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
11
+ from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
12
+ from linkml_store.api.stores.solr.solr_database import SolrDatabase
8
13
 
9
14
  HANDLE_MAP = {
10
15
  "duckdb": DuckDBDatabase,
16
+ "solr": SolrDatabase,
17
+ "mongodb": MongoDBDatabase,
18
+ "chromadb": ChromaDBDatabase,
11
19
  }
12
20
 
13
21
 
14
- @dataclass
15
22
  class Client:
16
23
  """
17
- A client provides access to named collections.
24
+ A client is the top-level object for interacting with databases.
25
+
26
+ A client has access to one or more :class:`Database` objects.
27
+
28
+ Each database consists of a number of :class:`.Collection` objects.
18
29
 
19
30
  Examples
20
31
  --------
@@ -22,7 +33,7 @@ class Client:
22
33
  >>> db = client.attach_database("duckdb", alias="test")
23
34
  >>> collection = db.create_collection("Person")
24
35
  >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
25
- >>> collection.add(objs)
36
+ >>> collection.insert(objs)
26
37
  >>> qr = collection.find()
27
38
  >>> len(qr.rows)
28
39
  2
@@ -38,9 +49,76 @@ class Client:
38
49
 
39
50
  """
40
51
 
41
- handle: Optional[str] = None
52
+ metadata: Optional[ClientConfig] = None
42
53
  _databases: Optional[Dict[str, Database]] = None
43
54
 
55
+ def __init__(self, handle: Optional[str] = None, metadata: Optional[ClientConfig] = None):
56
+ """
57
+ Initialize a client.
58
+
59
+ :param handle:
60
+ :param metadata:
61
+ """
62
+ self.metadata = metadata
63
+ if not self.metadata:
64
+ self.metadata = ClientConfig()
65
+ self.metadata.handle = handle
66
+
67
+ @property
68
+ def handle(self) -> Optional[str]:
69
+ return self.metadata.handle
70
+
71
+ @property
72
+ def base_dir(self) -> Optional[str]:
73
+ """
74
+ Get the base directory for the client.
75
+
76
+ Wraps metadata.base_dir.
77
+
78
+ :return:
79
+ """
80
+ return self.metadata.base_dir
81
+
82
+ def from_config(self, config: Union[ClientConfig, str, Path], base_dir=None, **kwargs):
83
+ """
84
+ Create a client from a configuration.
85
+
86
+ Examples
87
+ --------
88
+ >>> from linkml_store.api.config import ClientConfig
89
+ >>> client = Client().from_config(ClientConfig(databases={"test": {"handle": "duckdb:///:memory:"}}))
90
+ >>> len(client.databases)
91
+ 1
92
+ >>> "test" in client.databases
93
+ True
94
+ >>> client.databases["test"].handle
95
+ 'duckdb:///:memory:'
96
+
97
+ :param config:
98
+ :param kwargs:
99
+ :return:
100
+
101
+ """
102
+ if isinstance(config, Path):
103
+ config = str(config)
104
+ if isinstance(config, str):
105
+ if not base_dir:
106
+ base_dir = Path(config).parent
107
+ parsed_obj = yaml.safe_load(open(config))
108
+ config = ClientConfig(**parsed_obj)
109
+ self.metadata = config
110
+ if base_dir:
111
+ self.metadata.base_dir = base_dir
112
+ self._initialize_databases(**kwargs)
113
+ return self
114
+
115
+ def _initialize_databases(self, **kwargs):
116
+ for name, db_config in self.metadata.databases.items():
117
+ handle = db_config.handle.format(base_dir=self.base_dir)
118
+ db_config.handle = handle
119
+ db = self.attach_database(handle, alias=name, **kwargs)
120
+ db.from_config(db_config)
121
+
44
122
  def attach_database(
45
123
  self,
46
124
  handle: str,
@@ -69,7 +147,6 @@ class Client:
69
147
  :param schema_view: schema view to associate with the database
70
148
  :param kwargs:
71
149
  :return:
72
-
73
150
  """
74
151
  if ":" not in handle:
75
152
  scheme = handle
@@ -87,6 +164,7 @@ class Client:
87
164
  if not self._databases:
88
165
  self._databases = {}
89
166
  self._databases[alias] = db
167
+ db.parent = self
90
168
  return db
91
169
 
92
170
  def get_database(self, name: Optional[str] = None, create_if_not_exists=True, **kwargs) -> Database:
@@ -101,7 +179,7 @@ class Client:
101
179
  >>> db == retrieved_db
102
180
  True
103
181
 
104
- :param name:
182
+ :param name: if None, there must be a single database attached
105
183
  :param create_if_not_exists:
106
184
  :param kwargs:
107
185
  :return:
@@ -149,3 +227,30 @@ class Client:
149
227
  if not self._databases:
150
228
  self._databases = {}
151
229
  return self._databases
230
+
231
+ def drop_database(self, name: str, missing_ok=False, **kwargs):
232
+ """
233
+ Drop a database.
234
+
235
+ :param name:
236
+ :param missing_ok:
237
+ :return:
238
+ """
239
+ if name in self._databases:
240
+ db = self._databases[name]
241
+ db.drop(**kwargs)
242
+ del self._databases[name]
243
+ else:
244
+ if not missing_ok:
245
+ raise ValueError(f"Database {name} not found")
246
+
247
+ def drop_all_databases(self, **kwargs):
248
+ """
249
+ Drop all databases.
250
+
251
+ :param missing_ok:
252
+ :return:
253
+ """
254
+ for name in list(self._databases.keys()):
255
+ self.drop_database(name, missing_ok=False, **kwargs)
256
+ self._databases = {}
@@ -1,16 +1,25 @@
1
+ import hashlib
1
2
  import logging
2
3
  from collections import defaultdict
3
- from dataclasses import dataclass
4
4
  from pathlib import Path
5
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, TextIO, Type, Union
5
+ from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, TextIO, Type, Union
6
6
 
7
7
  import numpy as np
8
8
  from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
9
9
  from linkml_runtime.linkml_model.meta import ArrayExpression
10
10
  from pydantic import BaseModel
11
11
 
12
+ from linkml_store.index import get_indexer
13
+ from linkml_store.utils.object_utils import clean_empties
14
+
15
+ try:
16
+ from linkml.validator.report import ValidationResult
17
+ except ImportError:
18
+ ValidationResult = None
19
+
20
+ from linkml_store.api.config import CollectionConfig
12
21
  from linkml_store.api.queries import Query, QueryResult
13
- from linkml_store.index.index import Index
22
+ from linkml_store.index.indexer import Indexer
14
23
 
15
24
  if TYPE_CHECKING:
16
25
  from linkml_store.api.database import Database
@@ -19,11 +28,11 @@ logger = logging.getLogger(__name__)
19
28
 
20
29
  OBJECT = Union[Dict[str, Any], BaseModel, Type]
21
30
 
31
+ DEFAULT_FACET_LIMIT = 100
22
32
  IDENTIFIER = str
23
33
  FIELD_NAME = str
24
34
 
25
35
 
26
- @dataclass
27
36
  class Collection:
28
37
  """
29
38
  A collection is an organized set of objects of the same or similar type.
@@ -33,12 +42,93 @@ class Collection:
33
42
  - For a file system, a collection could be a single tabular file such as Parquet or CSV
34
43
  """
35
44
 
36
- name: str
45
+ # name: str
37
46
  parent: Optional["Database"] = None
38
- _indexes: Optional[Dict[str, Index]] = None
39
- hidden: Optional[bool] = False
47
+ _indexers: Optional[Dict[str, Indexer]] = None
48
+ # hidden: Optional[bool] = False
49
+
50
+ metadata: Optional[CollectionConfig] = None
51
+
52
+ def __init__(
53
+ self, name: str, parent: Optional["Database"] = None, metadata: Optional[CollectionConfig] = None, **kwargs
54
+ ):
55
+ self.parent = parent
56
+ if metadata:
57
+ self.metadata = metadata
58
+ else:
59
+ self.metadata = CollectionConfig(name=name, **kwargs)
60
+ if name is not None and self.metadata.name is not None and name != self.metadata.name:
61
+ raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")
62
+
63
+ @property
64
+ def name(self) -> str:
65
+ """
66
+ Return the name of the collection
40
67
 
41
- def add(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
68
+ :return:
69
+ """
70
+ return self.metadata.name
71
+
72
+ @property
73
+ def hidden(self) -> bool:
74
+ """
75
+ True if the collection is hidden.
76
+
77
+ An example of a hidden collection is a collection that indexes another
78
+ collection
79
+
80
+ :return: True if the collection is hidden
81
+ """
82
+ return self.metadata.hidden
83
+
84
+ @property
85
+ def target_class_name(self):
86
+ """
87
+ Return the name of the class that this collection represents
88
+
89
+ This MUST be a LinkML class name
90
+
91
+ :return:
92
+ """
93
+ # TODO: this is a shim layer until we can normalize on this
94
+ if self.metadata.type:
95
+ return self.metadata.type
96
+ return self.name
97
+
98
+ @property
99
+ def alias(self):
100
+ """
101
+ Return the primary name/alias used for the collection.
102
+
103
+ This MAY be the name of the LinkML class, but it may be desirable
104
+ to have an alias, for example "persons" which collects all instances
105
+ of class Person.
106
+
107
+ The _alias SHOULD be used for Table names in SQL.
108
+
109
+ For nested data, the alias SHOULD be used as the key; e.g
110
+
111
+ ``{ "persons": [ { "name": "Alice" }, { "name": "Bob" } ] }``
112
+
113
+ :return:
114
+ """
115
+ # TODO: this is a shim layer until we can normalize on this
116
+ if self.metadata.alias:
117
+ return self.metadata.alias
118
+ return self.name
119
+
120
+ def replace(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
121
+ """
122
+ Replace entire collection with objects.
123
+
124
+ :param objs:
125
+ :param kwargs:
126
+ :return:
127
+ """
128
+ self.delete_where({})
129
+ self.insert(objs, **kwargs)
130
+
131
+ def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
42
132
  """
43
133
  Add one or more objects to the collection
44
134
 
@@ -58,13 +148,14 @@ class Collection:
58
148
  """
59
149
  raise NotImplementedError
60
150
 
61
- def delete_where(self, where: Optional[Dict[str, Any]] = None, **kwargs) -> int:
151
+ def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
62
152
  """
63
153
  Delete objects that match a query
64
154
 
65
- :param where:
155
+ :param where: where conditions
156
+ :param missing_ok: if True, do not raise an error if the collection does not exist
66
157
  :param kwargs:
67
- :return:
158
+ :return: number of objects deleted (or -1 if unsupported)
68
159
  """
69
160
  raise NotImplementedError
70
161
 
@@ -79,7 +170,7 @@ class Collection:
79
170
  raise NotImplementedError
80
171
 
81
172
  def _create_query(self, **kwargs) -> Query:
82
- return Query(from_table=self.name, **kwargs)
173
+ return Query(from_table=self.alias, **kwargs)
83
174
 
84
175
  def query(self, query: Query, **kwargs) -> QueryResult:
85
176
  """
@@ -91,7 +182,9 @@ class Collection:
91
182
  """
92
183
  return self.parent.query(query, **kwargs)
93
184
 
94
- def query_facets(self, where: Optional[Dict] = None, facet_columns: List[str] = None) -> Dict[str, Dict[str, int]]:
185
+ def query_facets(
186
+ self, where: Optional[Dict] = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
187
+ ) -> Dict[str, Dict[str, int]]:
95
188
  """
96
189
  Run a query to get facet counts for one or more columns.
97
190
 
@@ -108,20 +201,66 @@ class Collection:
108
201
  :param con: A DuckDB database connection.
109
202
  :param query: A Query object representing the base query.
110
203
  :param facet_columns: A list of column names to get facet counts for.
204
+ :param facet_limit:
111
205
  :return: A dictionary where keys are column names and values are pandas DataFrames
112
206
  containing the facet counts for each unique value in the respective column.
113
207
  """
114
208
  raise NotImplementedError
115
209
 
116
210
  def get(self, ids: Optional[IDENTIFIER], **kwargs) -> QueryResult:
117
- id_field = self.identifier_field
118
- q = self._create_query(where_clause={id_field: ids})
119
- return self.query(q, **kwargs)
211
+ """
212
+ Get one or more objects by ID.
213
+
214
+ :param ids:
215
+ :param kwargs:
216
+ :return:
217
+ """
218
+ # TODO
219
+ id_field = self.identifier_attribute_name
220
+ return self.find({id_field: ids})
221
+
222
+ def get_one(self, id: IDENTIFIER, **kwargs) -> Optional[OBJECT]:
223
+ """
224
+ Get one object by ID.
225
+
226
+ :param id:
227
+ :param kwargs:
228
+ :return:
229
+ """
230
+ if not id:
231
+ raise ValueError("Must pass an ID")
232
+ id_field = self.identifier_attribute_name
233
+ if not id_field:
234
+ raise ValueError(f"No identifier for {self.name}")
235
+ w = {id_field: id}
236
+ qr = self.find(w)
237
+ if qr.num_rows == 1:
238
+ return qr.rows[0]
239
+ return None
120
240
 
121
241
  def find(self, where: Optional[Any] = None, **kwargs) -> QueryResult:
242
+ """
243
+ Find objects in the collection using a where query.
244
+
245
+ :param where:
246
+ :param kwargs:
247
+ :return:
248
+ """
122
249
  query = self._create_query(where_clause=where)
123
250
  return self.query(query, **kwargs)
124
251
 
252
+ def find_iter(self, where: Optional[Any] = None, **kwargs) -> Iterator[OBJECT]:
253
+ """
254
+ Find objects in the collection using a where query.
255
+
256
+ :param where:
257
+ :param kwargs:
258
+ :return:
259
+ """
260
+ qr = self.find(where=where, limit=-1, **kwargs)
261
+ for row in qr.rows:
262
+ yield row
263
+
125
264
  def search(
126
265
  self,
127
266
  query: str,
@@ -141,66 +280,122 @@ class Collection:
141
280
  :return:
142
281
  """
143
282
  if index_name is None:
144
- if len(self._indexes) == 1:
145
- index_name = list(self._indexes.keys())[0]
283
+ if len(self._indexers) == 1:
284
+ index_name = list(self._indexers.keys())[0]
146
285
  else:
147
286
  raise ValueError("Multiple indexes found. Please specify an index name.")
148
287
  ix_coll = self.parent.get_collection(self._index_collection_name(index_name))
149
- ix = self._indexes.get(index_name)
288
+ ix = self._indexers.get(index_name)
150
289
  if not ix:
151
290
  raise ValueError(f"No index named {index_name}")
152
291
  qr = ix_coll.find(where=where, limit=-1, **kwargs)
153
292
  index_col = ix.index_field
154
293
  vector_pairs = [(row, np.array(row[index_col], dtype=float)) for row in qr.rows]
155
294
  results = ix.search(query, vector_pairs, limit=limit)
295
+ for r in results:
296
+ del r[1][index_col]
156
297
  new_qr = QueryResult(num_rows=len(results))
157
298
  new_qr.ranked_rows = results
158
299
  return new_qr
159
300
 
160
- def attach_index(self, index: Index, auto_index=True, **kwargs):
301
+ @property
302
+ def is_internal(self) -> bool:
303
+ """
304
+ Check if the collection is internal
305
+
306
+ :return:
307
+ """
308
+ if not self.name:
309
+ raise ValueError(f"Collection has no name: {self} // {self.metadata}")
310
+ return self.name.startswith("internal__")
311
+
312
+ def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = True, auto_index=True, **kwargs):
161
313
  """
162
314
  Attach an index to the collection.
163
315
 
164
316
  :param index:
165
- :param auto_index:
317
+ :param name:
318
+ :param auto_index: Automatically index all objects in the collection
166
319
  :param kwargs:
167
320
  :return:
168
321
  """
322
+ if isinstance(index, str):
323
+ index = get_indexer(index)
324
+ if name:
325
+ index.name = name
326
+ if not index.name:
327
+ index.name = type(index).__name__.lower()
169
328
  index_name = index.name
170
329
  if not index_name:
171
330
  raise ValueError("Index must have a name")
172
- if not self._indexes:
173
- self._indexes = {}
174
- self._indexes[index_name] = index
331
+ if not self._indexers:
332
+ self._indexers = {}
333
+ self._indexers[index_name] = index
175
334
  if auto_index:
176
335
  all_objs = self.find(limit=-1).rows
177
- self.index_objects(all_objs, index_name, **kwargs)
336
+ self.index_objects(all_objs, index_name, replace=True, **kwargs)
178
337
 
179
338
  def _index_collection_name(self, index_name: str) -> str:
180
- return f"index__{self.name}_{index_name}"
339
+ """
340
+ Create a name for a special collection that holds index data
341
+
342
+ :param index_name:
343
+ :return:
344
+ """
345
+ return f"internal__index__{self.name}__{index_name}"
181
346
 
182
- def index_objects(self, objs: List[OBJECT], index_name: str, **kwargs):
347
+ def index_objects(self, objs: List[OBJECT], index_name: str, replace=False, **kwargs):
183
348
  """
184
349
  Index a list of objects
185
350
 
186
351
  :param objs:
187
352
  :param index_name:
353
+ :param replace:
188
354
  :param kwargs:
189
355
  :return:
190
356
  """
191
- ix = self._indexes.get(index_name)
357
+ ix = self._indexers.get(index_name)
192
358
  if not ix:
193
359
  raise ValueError(f"No index named {index_name}")
194
- ix_coll = self.parent.get_collection(self._index_collection_name(index_name), create_if_not_exists=True)
360
+ ix_coll_name = self._index_collection_name(index_name)
361
+ ix_coll = self.parent.get_collection(ix_coll_name, create_if_not_exists=True)
195
362
  vectors = [list(float(e) for e in v) for v in ix.objects_to_vectors(objs)]
196
363
  objects_with_ix = []
197
364
  index_col = ix.index_field
198
365
  for obj, vector in zip(objs, vectors):
199
366
  # TODO: id field
200
367
  objects_with_ix.append({**obj, **{index_col: vector}})
201
- ix_coll.add(objects_with_ix, **kwargs)
368
+ if replace:
369
+ schema = self.parent.schema_view.schema
370
+ logger.info(f"Checking if {ix_coll_name} is in {schema.classes.keys()}")
371
+ if ix_coll_name in schema.classes:
372
+ ix_coll.delete_where()
373
+ ix_coll.insert(objects_with_ix, **kwargs)
374
+
375
+ def list_index_names(self) -> List[str]:
376
+ """
377
+ Return a list of index names
378
+
379
+ :return:
380
+ """
381
+ return list(self._indexers.keys())
382
+
383
+ @property
384
+ def indexers(self) -> Dict[str, Indexer]:
385
+ """
386
+ Return a list of indexers
387
+
388
+ :return:
389
+ """
390
+ return self._indexers if self._indexers else {}
202
391
 
203
392
  def peek(self, limit: Optional[int] = None) -> QueryResult:
393
+ """
394
+ Return the first N objects in the collection
395
+
396
+ :param limit:
397
+ :return:
398
+ """
204
399
  q = self._create_query()
205
400
  return self.query(q, limit=limit)
206
401
 
@@ -212,22 +407,45 @@ class Collection:
212
407
  """
213
408
  sv = self.parent.schema_view
214
409
  if sv:
215
- return sv.get_class(self.name)
410
+ cls = sv.get_class(self.target_class_name)
411
+ return cls
216
412
  return None
217
413
 
414
+ @property
218
415
  def identifier_attribute_name(self) -> Optional[str]:
219
416
  """
220
417
  Return the name of the identifier attribute for the collection.
221
418
 
419
+ AKA the primary key.
420
+
222
421
  :return: The name of the identifier attribute, if one exists.
223
422
  """
224
423
  cd = self.class_definition()
225
424
  if cd:
226
- for att in cd.attributes.values():
425
+ for att in self.parent.schema_view.class_induced_slots(cd.name):
227
426
  if att.identifier:
228
427
  return att.name
229
428
  return None
230
429
 
430
+ def object_identifier(self, obj: OBJECT, auto=True) -> Optional[IDENTIFIER]:
431
+ """
432
+ Return the identifier for an object.
433
+
434
+ :param obj:
435
+ :param auto: If True, generate an identifier if one does not exist.
436
+ :return:
437
+ """
438
+ pk = self.identifier_attribute_name
439
+ if pk in obj:
440
+ return obj[pk]
441
+ elif auto:
442
+ # TODO: use other unique keys if no primary key
443
+ as_str = str(obj)
444
+ md5 = hashlib.md5(as_str.encode()).hexdigest()
445
+ return md5
446
+ else:
447
+ return None
448
+
231
449
  def induce_class_definition_from_objects(self, objs: List[OBJECT], max_sample_size=10) -> ClassDefinition:
232
450
  """
233
451
  Induce a class definition from a list of objects.
@@ -239,7 +457,7 @@ class Collection:
239
457
  :param max_sample_size:
240
458
  :return:
241
459
  """
242
- cd = ClassDefinition(self.name)
460
+ cd = ClassDefinition(self.target_class_name)
243
461
  keys = defaultdict(list)
244
462
  for obj in objs[0:max_sample_size]:
245
463
  if isinstance(obj, BaseModel):
@@ -302,7 +520,7 @@ class Collection:
302
520
  array_expr = ArrayExpression(exact_number_dimensions=len(exact_dimensions_list[0]))
303
521
  cd.attributes[k].array = array_expr
304
522
  sv = self.parent.schema_view
305
- sv.schema.classes[self.name] = cd
523
+ sv.schema.classes[self.target_class_name] = cd
306
524
  sv.set_modified()
307
525
  return cd
308
526
 
@@ -325,3 +543,23 @@ class Collection:
325
543
  :return:
326
544
  """
327
545
  raise NotImplementedError
546
+
547
+ def iter_validate_collection(self, **kwargs) -> Iterator["ValidationResult"]:
548
+ """
549
+ Validate the contents of the collection
550
+
551
+ :param kwargs:
552
+ :return: iterator over validation results
553
+ """
554
+ from linkml.validator import JsonschemaValidationPlugin, Validator
555
+
556
+ validation_plugins = [JsonschemaValidationPlugin(closed=True)]
557
+ validator = Validator(self.parent.schema_view.schema, validation_plugins=validation_plugins)
558
+ cd = self.class_definition()
559
+ if not cd:
560
+ raise ValueError(f"Cannot find class definition for {self.target_class_name}")
561
+ class_name = cd.name
562
+ result = self.find(**kwargs)
563
+ for obj in result.rows:
564
+ obj = clean_empties(obj)
565
+ yield from validator.iter_results(obj, class_name)