linkml-store 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. linkml_store/__init__.py +7 -0
  2. linkml_store/api/__init__.py +8 -0
  3. linkml_store/api/client.py +414 -0
  4. linkml_store/api/collection.py +1280 -0
  5. linkml_store/api/config.py +187 -0
  6. linkml_store/api/database.py +862 -0
  7. linkml_store/api/queries.py +69 -0
  8. linkml_store/api/stores/__init__.py +0 -0
  9. linkml_store/api/stores/chromadb/__init__.py +7 -0
  10. linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
  11. linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
  12. linkml_store/api/stores/dremio/__init__.py +10 -0
  13. linkml_store/api/stores/dremio/dremio_collection.py +555 -0
  14. linkml_store/api/stores/dremio/dremio_database.py +1052 -0
  15. linkml_store/api/stores/dremio/mappings.py +105 -0
  16. linkml_store/api/stores/dremio_rest/__init__.py +11 -0
  17. linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
  18. linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
  19. linkml_store/api/stores/duckdb/__init__.py +16 -0
  20. linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
  21. linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
  22. linkml_store/api/stores/duckdb/mappings.py +8 -0
  23. linkml_store/api/stores/filesystem/__init__.py +15 -0
  24. linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
  25. linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
  26. linkml_store/api/stores/hdf5/__init__.py +7 -0
  27. linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
  28. linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
  29. linkml_store/api/stores/ibis/__init__.py +5 -0
  30. linkml_store/api/stores/ibis/ibis_collection.py +488 -0
  31. linkml_store/api/stores/ibis/ibis_database.py +328 -0
  32. linkml_store/api/stores/mongodb/__init__.py +25 -0
  33. linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
  34. linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
  35. linkml_store/api/stores/neo4j/__init__.py +0 -0
  36. linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
  37. linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
  38. linkml_store/api/stores/solr/__init__.py +3 -0
  39. linkml_store/api/stores/solr/solr_collection.py +224 -0
  40. linkml_store/api/stores/solr/solr_database.py +83 -0
  41. linkml_store/api/stores/solr/solr_utils.py +0 -0
  42. linkml_store/api/types.py +4 -0
  43. linkml_store/cli.py +1147 -0
  44. linkml_store/constants.py +7 -0
  45. linkml_store/graphs/__init__.py +0 -0
  46. linkml_store/graphs/graph_map.py +24 -0
  47. linkml_store/index/__init__.py +53 -0
  48. linkml_store/index/implementations/__init__.py +0 -0
  49. linkml_store/index/implementations/llm_indexer.py +174 -0
  50. linkml_store/index/implementations/simple_indexer.py +43 -0
  51. linkml_store/index/indexer.py +211 -0
  52. linkml_store/inference/__init__.py +13 -0
  53. linkml_store/inference/evaluation.py +195 -0
  54. linkml_store/inference/implementations/__init__.py +0 -0
  55. linkml_store/inference/implementations/llm_inference_engine.py +154 -0
  56. linkml_store/inference/implementations/rag_inference_engine.py +276 -0
  57. linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
  58. linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
  59. linkml_store/inference/inference_config.py +66 -0
  60. linkml_store/inference/inference_engine.py +209 -0
  61. linkml_store/inference/inference_engine_registry.py +74 -0
  62. linkml_store/plotting/__init__.py +5 -0
  63. linkml_store/plotting/cli.py +826 -0
  64. linkml_store/plotting/dimensionality_reduction.py +453 -0
  65. linkml_store/plotting/embedding_plot.py +489 -0
  66. linkml_store/plotting/facet_chart.py +73 -0
  67. linkml_store/plotting/heatmap.py +383 -0
  68. linkml_store/utils/__init__.py +0 -0
  69. linkml_store/utils/change_utils.py +17 -0
  70. linkml_store/utils/dat_parser.py +95 -0
  71. linkml_store/utils/embedding_matcher.py +424 -0
  72. linkml_store/utils/embedding_utils.py +299 -0
  73. linkml_store/utils/enrichment_analyzer.py +217 -0
  74. linkml_store/utils/file_utils.py +37 -0
  75. linkml_store/utils/format_utils.py +550 -0
  76. linkml_store/utils/io.py +38 -0
  77. linkml_store/utils/llm_utils.py +122 -0
  78. linkml_store/utils/mongodb_utils.py +145 -0
  79. linkml_store/utils/neo4j_utils.py +42 -0
  80. linkml_store/utils/object_utils.py +190 -0
  81. linkml_store/utils/pandas_utils.py +93 -0
  82. linkml_store/utils/patch_utils.py +126 -0
  83. linkml_store/utils/query_utils.py +89 -0
  84. linkml_store/utils/schema_utils.py +23 -0
  85. linkml_store/utils/sklearn_utils.py +193 -0
  86. linkml_store/utils/sql_utils.py +177 -0
  87. linkml_store/utils/stats_utils.py +53 -0
  88. linkml_store/utils/vector_utils.py +158 -0
  89. linkml_store/webapi/__init__.py +0 -0
  90. linkml_store/webapi/html/__init__.py +3 -0
  91. linkml_store/webapi/html/base.html.j2 +24 -0
  92. linkml_store/webapi/html/collection_details.html.j2 +15 -0
  93. linkml_store/webapi/html/database_details.html.j2 +16 -0
  94. linkml_store/webapi/html/databases.html.j2 +14 -0
  95. linkml_store/webapi/html/generic.html.j2 +43 -0
  96. linkml_store/webapi/main.py +855 -0
  97. linkml_store-0.3.0.dist-info/METADATA +226 -0
  98. linkml_store-0.3.0.dist-info/RECORD +101 -0
  99. linkml_store-0.3.0.dist-info/WHEEL +4 -0
  100. linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
  101. linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
@@ -0,0 +1,1280 @@
1
+ """A structure for representing collections of similar objects."""
2
+
3
+ import hashlib
4
+ import json
5
+ import logging
6
+ from collections import defaultdict
7
+ from pathlib import Path
8
+ from typing import (
9
+ TYPE_CHECKING,
10
+ Any,
11
+ ClassVar,
12
+ Dict,
13
+ Generic,
14
+ Iterable,
15
+ Iterator,
16
+ List,
17
+ Optional,
18
+ TextIO,
19
+ Tuple,
20
+ Type,
21
+ Union,
22
+ )
23
+
24
+ import numpy as np
25
+ from linkml_runtime import SchemaView
26
+ from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
27
+ from linkml_runtime.linkml_model.meta import ArrayExpression
28
+ from pydantic import BaseModel
29
+
30
+ from linkml_store.api.types import DatabaseType
31
+ from linkml_store.index import get_indexer
32
+ from linkml_store.utils.format_utils import load_objects, load_objects_from_url
33
+ from linkml_store.utils.object_utils import clean_empties
34
+ from linkml_store.utils.patch_utils import PatchDict, apply_patches_to_list, patches_from_objects_lists
35
+
36
+ try:
37
+ from linkml.validator.report import ValidationResult
38
+ except ImportError:
39
+ ValidationResult = None
40
+
41
+ from linkml_store.api.config import CollectionConfig
42
+ from linkml_store.api.queries import Query, QueryResult
43
+ from linkml_store.index.indexer import Indexer
44
+
45
+ if TYPE_CHECKING:
46
+ from linkml_store.api.database import Database
47
+
48
+ logger = logging.getLogger(__name__)
49
+
50
+ OBJECT = Union[Dict[str, Any], BaseModel, Type]
51
+
52
+ DEFAULT_FACET_LIMIT = 100
53
+ IDENTIFIER = str
54
+ FIELD_NAME = str
55
+
56
+
57
class Collection(Generic[DatabaseType]):
    """
    A collection is an organized set of objects of the same or similar type.

    - For relational databases, a collection is typically a table
    - For document databases such as MongoDB, a collection is the native type
    - For a file system, a collection could be a single tabular file such as Parquet or CSV.

    Collection objects are typically not created directly - instead they are generated
    from a parent :class:`.Database` object:

    >>> from linkml_store import Client
    >>> client = Client()
    >>> db = client.attach_database("duckdb", alias="test")
    >>> collection = db.create_collection("Person")
    >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
    >>> collection.insert(objs)
    """

    # name: str
    # Parent Database that owns this collection; set in __init__.
    parent: Optional[DatabaseType] = None
    # Lazily-populated mapping of index name -> Indexer attached to this collection.
    _indexers: Optional[Dict[str, Indexer]] = None
    # Tri-state init flag: None/False until derivations are materialized, then True.
    _initialized: Optional[bool] = None
    # hidden: Optional[bool] = False

    # Configuration (alias, type, source, derivations, ...) for this collection.
    metadata: Optional[CollectionConfig] = None
    # Name of the indexer used when no explicit index is attached.
    default_index_name: ClassVar[str] = "simple"
84
+
85
+ def __init__(
86
+ self, name: str, parent: Optional["Database"] = None, metadata: Optional[CollectionConfig] = None, **kwargs
87
+ ):
88
+ self.parent = parent
89
+ if metadata:
90
+ self.metadata = metadata
91
+ else:
92
+ self.metadata = CollectionConfig(type=name, **kwargs)
93
+ if not self.metadata.alias:
94
+ self.metadata.alias = name
95
+ if not self.metadata.type:
96
+ self.metadata.type = name
97
+ # if name is not None and self.metadata.name is not None and name != self.metadata.name:
98
+ # raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")
99
+
100
+ @property
101
+ def hidden(self) -> bool:
102
+ """
103
+ True if the collection is hidden.
104
+
105
+ An example of a hidden collection is a collection that indexes another
106
+ collection
107
+
108
+ :return: True if the collection is hidden
109
+ """
110
+ # return self.metadata.hidden
111
+
112
+ @property
113
+ def target_class_name(self):
114
+ """
115
+ Return the name of the class that this collection represents
116
+
117
+ This MUST be a LinkML class name
118
+
119
+ >>> from linkml_store import Client
120
+ >>> client = Client()
121
+ >>> db = client.attach_database("duckdb", alias="test")
122
+ >>> collection = db.create_collection("Person", alias="persons")
123
+ >>> collection.target_class_name
124
+ 'Person'
125
+
126
+ >>> collection = db.create_collection("Organization")
127
+ >>> collection.target_class_name
128
+ 'Organization'
129
+ >>> collection.alias
130
+ 'Organization'
131
+
132
+ :return: name of the class which members of this collection instantiate
133
+ """
134
+ # TODO: this is a shim layer until we can normalize on this
135
+ if self.metadata.type:
136
+ return self.metadata.type
137
+ return self.alias
138
+
139
+ @property
140
+ def alias(self):
141
+ """
142
+ Return the primary name/alias used for the collection.
143
+
144
+ This MAY be the name of the LinkML class, but it may be desirable
145
+ to have an alias, for example "persons" which collects all instances
146
+ of class Person.
147
+
148
+ >>> from linkml_store import Client
149
+ >>> client = Client()
150
+ >>> db = client.attach_database("duckdb", alias="test")
151
+ >>> collection = db.create_collection("Person", alias="persons")
152
+ >>> collection.alias
153
+ 'persons'
154
+
155
+ If no explicit alias is provided, then the target class name is used:
156
+
157
+ >>> from linkml_store import Client
158
+ >>> client = Client()
159
+ >>> db = client.attach_database("duckdb", alias="test")
160
+ >>> collection = db.create_collection("Person")
161
+ >>> collection.alias
162
+ 'Person'
163
+
164
+ The alias SHOULD be used for Table names in SQL.
165
+
166
+ For nested data, the alias SHOULD be used as the key; e.g
167
+
168
+ .. code-block:: json
169
+
170
+ { "persons": [ { "name": "Alice" }, { "name": "Bob" } ] }
171
+
172
+ :return:
173
+ """
174
+ # TODO: this is a shim layer until we can normalize on this
175
+ if self.metadata.alias:
176
+ return self.metadata.alias
177
+ return self.target_class_name
178
+
179
+ def replace(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
180
+ """
181
+ Replace entire collection with objects.
182
+
183
+ >>> from linkml_store import Client
184
+ >>> client = Client()
185
+ >>> db = client.attach_database("duckdb", alias="test")
186
+ >>> collection = db.create_collection("Person")
187
+ >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
188
+ >>> collection.insert(objs)
189
+
190
+ :param objs:
191
+ :param kwargs:
192
+ :return:
193
+ """
194
+ self.delete_where({})
195
+ self.insert(objs, **kwargs)
196
+
197
+ def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
198
+ """
199
+ Add one or more objects to the collection.
200
+
201
+ >>> from linkml_store import Client
202
+ >>> client = Client()
203
+ >>> db = client.attach_database("duckdb", alias="test")
204
+ >>> collection = db.create_collection("Person")
205
+ >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
206
+ >>> collection.insert(objs)
207
+
208
+ :param objs:
209
+ :param kwargs:
210
+ :return:
211
+ """
212
+ raise NotImplementedError
213
+
214
+ def index(
215
+ self,
216
+ objs: Union[OBJECT, List[OBJECT]],
217
+ index_name: Optional[str] = None,
218
+ replace: bool = False,
219
+ unique: bool = False,
220
+ **kwargs,
221
+ ) -> None:
222
+ """
223
+ Index objects in the collection.
224
+
225
+ :param objs:
226
+ :param index_name:
227
+ :param replace: replace the index, or not
228
+ :param unique: boolean used to declare the index unique or not
229
+ :param kwargs:
230
+ :return:
231
+ """
232
+ raise NotImplementedError
233
+
234
+ def upsert(
235
+ self,
236
+ objs: Union[OBJECT, List[OBJECT]],
237
+ filter_fields: List[str],
238
+ update_fields: Union[List[str], None] = None,
239
+ **kwargs,
240
+ ):
241
+ """
242
+ Add one or more objects to the collection.
243
+
244
+ >>> from linkml_store import Client
245
+ >>> client = Client()
246
+ >>> db = client.attach_database("mongodb", alias="test")
247
+ >>> collection = db.create_collection("Person")
248
+ >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
249
+ >>> collection.upsert(objs)
250
+
251
+ :param objs:
252
+ :param filter_fields: List of field names to use as the filter for matching existing collections.
253
+ :param update_fields: List of field names to include in the update. If None, all fields are updated.
254
+ :param kwargs:
255
+
256
+ :return:
257
+ """
258
+ raise NotImplementedError
259
+
260
+ def _pre_query_hook(self, query: Optional[Query] = None, **kwargs):
261
+ """
262
+ Pre-query hook.
263
+
264
+ This is called before a query is executed. It is used to materialize derivations and indexes.
265
+ :param query:
266
+ :param kwargs:
267
+ :return:
268
+ """
269
+ logger.debug(f"Pre-query hook (state: {self._initialized}; Q= {query}") # if logging.info, this is very noisy.
270
+ if not self._initialized:
271
+ self._materialize_derivations()
272
+ self._initialized = True
273
+
274
+ def _pre_insert_hook(self, objs: List[OBJECT], **kwargs):
275
+ if self.metadata.validate_modifications:
276
+ errors = list(self.iter_validate_collection(objs))
277
+ if errors:
278
+ raise ValueError(f"Validation errors: {errors}")
279
+
280
+ def _post_insert_hook(self, objs: List[OBJECT], **kwargs):
281
+ self._initialized = True
282
+ patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
283
+ self._broadcast(patches, **kwargs)
284
+ self._post_modification_hook(**kwargs)
285
+
286
+ def _post_delete_hook(self, **kwargs):
287
+ self._post_modification_hook(**kwargs)
288
+
289
+ def _post_modification_hook(self, **kwargs):
290
+ for indexer in self.indexers.values():
291
+ ix_collection_name = self.get_index_collection_name(indexer)
292
+ ix_collection = self.parent.get_collection(ix_collection_name)
293
+ # Currently updating the source triggers complete reindexing
294
+ # TODO: make this more efficient by only deleting modified
295
+ ix_collection.delete_where({})
296
+
297
+ def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
298
+ """
299
+ Delete one or more objects from the collection.
300
+
301
+ First let's set up a collection:
302
+
303
+ >>> from linkml_store import Client
304
+ >>> client = Client()
305
+ >>> db = client.attach_database("duckdb", alias="test")
306
+ >>> collection = db.create_collection("Person")
307
+ >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
308
+ >>> collection.insert(objs)
309
+ >>> collection.find({}).num_rows
310
+ 2
311
+
312
+ Now let's delete an object:
313
+
314
+ >>> collection.delete(objs[0])
315
+ >>> collection.find({}).num_rows
316
+ 1
317
+
318
+ Deleting the same object again should have no effect:
319
+
320
+ >>> collection.delete(objs[0])
321
+ >>> collection.find({}).num_rows
322
+ 1
323
+
324
+ :param objs:
325
+ :param kwargs:
326
+ :return:
327
+ """
328
+ raise NotImplementedError
329
+
330
+ def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
331
+ """
332
+ Delete objects that match a query.
333
+
334
+ First let's set up a collection:
335
+
336
+ >>> from linkml_store import Client
337
+ >>> client = Client()
338
+ >>> db = client.attach_database("duckdb", alias="test")
339
+ >>> collection = db.create_collection("Person")
340
+ >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
341
+ >>> collection.insert(objs)
342
+
343
+ Now let's delete an object:
344
+
345
+ >>> collection.delete_where({"id": "P1"})
346
+ >>> collection.find({}).num_rows
347
+ 1
348
+
349
+ Match everything:
350
+
351
+ >>> collection.delete_where({})
352
+ >>> collection.find({}).num_rows
353
+ 0
354
+
355
+ :param where: where conditions
356
+ :param missing_ok: if True, do not raise an error if the collection does not exist
357
+ :param kwargs:
358
+ :return: number of objects deleted (or -1 if unsupported)
359
+ """
360
+ raise NotImplementedError
361
+
362
+ def update(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
363
+ """
364
+ Update one or more objects in the collection.
365
+
366
+ :param objs:
367
+ :param kwargs:
368
+ :return:
369
+ """
370
+ raise NotImplementedError
371
+
372
+ def _create_query(self, **kwargs) -> Query:
373
+ return Query(from_table=self.alias, **kwargs)
374
+
375
+ def query(self, query: Query, **kwargs) -> QueryResult:
376
+ """
377
+ Run a query against the collection.
378
+
379
+ First let's load a collection:
380
+
381
+ >>> from linkml_store import Client
382
+ >>> from linkml_store.utils.format_utils import load_objects
383
+ >>> client = Client()
384
+ >>> db = client.attach_database("duckdb")
385
+ >>> collection = db.create_collection("Country")
386
+ >>> objs = load_objects("tests/input/countries/countries.jsonl")
387
+ >>> collection.insert(objs)
388
+
389
+ Now let's run a query:
390
+
391
+ TODO
392
+
393
+ :param query:
394
+ :param kwargs:
395
+ :return:
396
+ """
397
+ self._pre_query_hook()
398
+ return self.parent.query(query, **kwargs)
399
+
400
    def query_facets(
        self, where: Optional[Dict] = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
    ) -> Dict[str, List[Tuple[Any, int]]]:
        """
        Run a query to get facet counts for one or more columns.

        For each specified column, a facet count query is generated and executed,
        and the results are returned as a dictionary keyed by column name.

        The facet count query is generated by modifying the original query's WHERE clause
        to exclude conditions directly related to the facet column. This allows for counting
        the occurrences of each unique value in the facet column while still applying the
        other filtering conditions.

        :param where: where conditions applied before faceting
        :param facet_columns: A list of column names to get facet counts for.
        :param facet_limit: maximum number of facet values returned per column
        :param kwargs: adapter-specific options
        :return: A dictionary where keys are column names and values are lists of
            (value, count) tuples for each unique value in the respective column.
        :raises NotImplementedError: always; concrete store adapters must override
        """
        raise NotImplementedError
424
+
425
+ def get(self, ids: Optional[List[IDENTIFIER]], **kwargs) -> QueryResult:
426
+ """
427
+ Get one or more objects by ID.
428
+
429
+ :param ids:
430
+ :param kwargs:
431
+ :return:
432
+ """
433
+ id_field = self.identifier_attribute_name
434
+ if not id_field:
435
+ raise ValueError(f"No identifier for {self.name}")
436
+ if len(ids) == 1:
437
+ return self.find({id_field: ids[0]})
438
+ else:
439
+ return self.find({id_field: {"$in": ids}})
440
+
441
+ def get_one(self, id: IDENTIFIER, **kwargs) -> Optional[OBJECT]:
442
+ """
443
+ Get one object by ID.
444
+
445
+ :param id:
446
+ :param kwargs:
447
+ :return:
448
+ """
449
+ if not id:
450
+ raise ValueError("Must pass an ID")
451
+ id_field = self.identifier_attribute_name
452
+ if not id_field:
453
+ raise ValueError(f"No identifier for {self.name}")
454
+ w = {id_field: id}
455
+ qr = self.find(w)
456
+ if qr.num_rows == 1:
457
+ return qr.rows[0]
458
+ return None
459
+
460
+ def find(
461
+ self,
462
+ where: Optional[Any] = None,
463
+ select_cols: Optional[List[str]] = None,
464
+ **kwargs,
465
+ ) -> QueryResult:
466
+ """
467
+ Find objects in the collection using a where query.
468
+
469
+ As an example, first load a collection:
470
+
471
+ >>> from linkml_store import Client
472
+ >>> from linkml_store.utils.format_utils import load_objects
473
+ >>> client = Client()
474
+ >>> db = client.attach_database("duckdb")
475
+ >>> collection = db.create_collection("Country")
476
+ >>> objs = load_objects("tests/input/countries/countries.jsonl")
477
+ >>> collection.insert(objs)
478
+
479
+ Now let's find all objects:
480
+
481
+ >>> qr = collection.find({})
482
+ >>> qr.num_rows
483
+ 20
484
+
485
+ We can do a more restrictive query:
486
+
487
+ >>> qr = collection.find({"code": "FR"})
488
+ >>> qr.num_rows
489
+ 1
490
+ >>> qr.rows[0]["name"]
491
+ 'France'
492
+
493
+
494
+ :param where:
495
+ :param select_cols:
496
+ :param kwargs:
497
+ :return:
498
+ """
499
+ query = self._create_query(
500
+ where_clause=where,
501
+ select_cols=select_cols,
502
+ )
503
+ self._pre_query_hook(query)
504
+ return self.query(query, **kwargs)
505
+
506
+ def find_iter(self, where: Optional[Any] = None, page_size=100, **kwargs) -> Iterator[OBJECT]:
507
+ """
508
+ Find objects in the collection using a where query.
509
+
510
+ :param where:
511
+ :param kwargs:
512
+ :return:
513
+ """
514
+ total_rows = None
515
+ offset = 0
516
+ if page_size < 1:
517
+ raise ValueError(f"Invalid page size: {page_size}")
518
+ while True:
519
+ qr = self.find(where=where, offset=offset, limit=page_size, **kwargs)
520
+ if total_rows is None:
521
+ total_rows = qr.num_rows
522
+ if not qr.rows:
523
+ return
524
+ for row in qr.rows:
525
+ yield row
526
+ offset += page_size
527
+ if offset >= total_rows:
528
+ break
529
+ return
530
+
531
    def search(
        self,
        query: str,
        where: Optional[Any] = None,
        index_name: Optional[str] = None,
        limit: Optional[int] = None,
        select_cols: Optional[List[str]] = None,
        mmr_relevance_factor: Optional[float] = None,
        **kwargs,
    ) -> QueryResult:
        """
        Search the collection using a text-based index.

        Example:

        >>> from linkml_store import Client
        >>> from linkml_store.utils.format_utils import load_objects
        >>> client = Client()
        >>> db = client.attach_database("duckdb")
        >>> collection = db.create_collection("Country")
        >>> objs = load_objects("tests/input/countries/countries.jsonl")
        >>> collection.insert(objs)

        Now let's index, using the simple trigram-based index

        >>> index = get_indexer("simple")
        >>> _ = collection.attach_indexer(index)

        Now let's find all objects:

        >>> qr = collection.search("France")
        >>> score, top_obj = qr.ranked_rows[0]
        >>> assert score > 0.1
        >>> top_obj["code"]
        'FR'

        :param query: search string
        :param where: optional filter applied to the index collection
        :param index_name: index to use; defaults to the sole attached index,
            or to :attr:`default_index_name` when several are attached
        :param limit: maximum number of hits
        :param select_cols: restrict returned objects to these keys
        :param mmr_relevance_factor: passed to the indexer's search
        :param kwargs: adapter-specific options
        :return: result whose ranked_rows are (score, object) pairs
        """
        self._pre_query_hook()
        # Resolve which index to use: an unambiguous single index wins,
        # otherwise fall back to the class default.
        if index_name is None:
            if len(self.indexers) == 1:
                index_name = list(self.indexers.keys())[0]
            else:
                logger.warning("Multiple indexes found. Using default index.")
                index_name = self.default_index_name
        # The index is stored in a shadow collection alongside this one.
        ix_coll = self.parent.get_collection(self._index_collection_name(index_name))
        # Lazily create the indexer if it was never attached explicitly.
        if index_name not in self.indexers:
            logger.debug(f"Indexer not found: {index_name} -- creating")
            ix = get_indexer(index_name)
            if not self._indexers:
                self._indexers = {}
            self._indexers[index_name] = ix
        ix = self.indexers.get(index_name)
        if not ix:
            raise ValueError(f"No index named {index_name}")
        logger.debug(f"Using indexer {type(ix)} with name {index_name}")
        # An empty shadow collection means nothing has been indexed yet
        # (e.g. after _post_modification_hook wiped it) -- rebuild in full.
        if ix_coll.size() == 0:
            logger.info(f"Index {index_name} is empty; indexing all objects")
            all_objs = self.find(limit=-1).rows
            if all_objs:
                # print(f"Index {index_name} is empty; indexing all objects {len(all_objs)}")
                self.index_objects(all_objs, index_name, replace=True, **kwargs)
                assert ix_coll.size() > 0
        qr = ix_coll.find(where=where, limit=-1, **kwargs)
        index_col = ix.index_field

        # TODO: optimize this for large indexes
        def row2array(row):
            # Decode the stored vector for one indexed row.
            v = row[index_col]
            if isinstance(v, str):
                # sqlite stores arrays as strings
                v = json.loads(v)
            return np.array(v, dtype=float)

        # Score every indexed row against the query vector.
        vector_pairs = [(row, row2array(row)) for row in qr.rows]
        results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
        # Strip the internal vector column from returned objects.
        for r in results:
            del r[1][index_col]
        if select_cols:
            new_results = []
            for r in results:
                new_results.append((r[0], {k: v for k, v in r[1].items() if k in select_cols}))
            results = new_results
        new_qr = QueryResult(num_rows=len(results))
        new_qr.ranked_rows = results
        new_qr.rows = [r[1] for r in results]
        return new_qr
624
+
625
    def group_by(
        self,
        group_by_fields: List[str],
        inlined_field="objects",
        agg_map: Optional[Dict[str, str]] = None,
        where: Optional[Dict] = None,
        **kwargs,
    ) -> QueryResult:
        """
        Group objects in the collection by one or more fields.

        Each result row carries the group-key fields plus a list of member
        objects under ``inlined_field``.

        :param group_by_fields: field name(s) to group on; a bare string is accepted
        :param inlined_field: key under which grouped members are nested
        :param agg_map: optional mapping; "first" lists fields treated as part of
            the group key, "list" lists fields collected per member (defaults to
            all non-key columns)
        :param where: filter applied before grouping
        :param kwargs: unused; reserved
        :return: one result row per distinct group key
        """
        if isinstance(group_by_fields, str):
            group_by_fields = [group_by_fields]
        df = self.find(where=where, limit=-1).rows_dataframe

        # Handle the case where agg_map is None
        if agg_map is None:
            agg_map = {}

        # Fields that identify a group: explicit "first" fields plus the key fields.
        pk_fields = agg_map.get("first", []) + group_by_fields
        list_fields = agg_map.get("list", [])
        if not list_fields:
            list_fields = [a for a in df.columns if a not in pk_fields]

        # Bucket rows by their group-key tuple.
        grouped_objs = defaultdict(list)
        for _, row in df.iterrows():
            pk = tuple(row[pk_fields])
            grouped_objs[pk].append({k: row[k] for k in list_fields})
        results = []
        for pk, objs in grouped_objs.items():
            top_obj = {k: v for k, v in zip(pk_fields, pk)}
            top_obj[inlined_field] = objs
            results.append(top_obj)
        r = QueryResult(num_rows=len(results), rows=results)
        return r
665
+
666
+ @property
667
+ def is_internal(self) -> bool:
668
+ """
669
+ Check if the collection is internal.
670
+
671
+ Internal collections are hidden by default. Examples of internal collections
672
+ include shadow "index" collections
673
+
674
+ :return:
675
+ """
676
+ if not self.alias:
677
+ raise ValueError(f"Collection has no alias: {self} // {self.metadata}")
678
+ return self.alias.startswith("internal__")
679
+
680
+ def exists(self) -> Optional[bool]:
681
+ """
682
+ Check if the collection exists.
683
+
684
+ :return:
685
+ """
686
+ cd = self.class_definition()
687
+ return cd is not None and cd.attributes
688
+
689
+ def load_from_source(self, load_if_exists=False):
690
+ """
691
+ Load objects from the source location.
692
+
693
+ :param load_if_exists:
694
+ :return:
695
+ """
696
+ if not load_if_exists and self.exists():
697
+ return
698
+ metadata = self.metadata
699
+ if metadata.source:
700
+ source = metadata.source
701
+ kwargs = source.arguments or {}
702
+ if source.local_path:
703
+ objects = load_objects(
704
+ metadata.source.local_path,
705
+ format=source.format,
706
+ expected_type=source.expected_type,
707
+ compression=source.compression,
708
+ select_query=source.select_query,
709
+ **kwargs,
710
+ )
711
+ elif metadata.source.url:
712
+ objects = load_objects_from_url(
713
+ metadata.source.url,
714
+ format=source.format,
715
+ expected_type=source.expected_type,
716
+ compression=source.compression,
717
+ select_query=source.select_query,
718
+ **kwargs,
719
+ )
720
+ else:
721
+ raise ValueError("No source local_path or url provided")
722
+ self.insert(objects)
723
+
724
    def _check_if_initialized(self) -> bool:
        """Return the cached initialization flag (None/False until derivations run)."""
        return self._initialized
726
+
727
+ def _materialize_derivations(self, **kwargs):
728
+ metadata = self.metadata
729
+ if not metadata.derived_from:
730
+ logger.info(f"No metadata for {self.alias}; no derivations")
731
+ return
732
+ if self._check_if_initialized():
733
+ logger.info(f"Already initialized {self.alias}; no derivations")
734
+ return
735
+ parent_db = self.parent
736
+ client = parent_db.parent
737
+ # cd = self.class_definition()
738
+ for derivation in metadata.derived_from:
739
+ # TODO: optimize this; utilize underlying engine
740
+ logger.info(f"Deriving from {derivation}")
741
+ if derivation.database:
742
+ db = client.get_database(derivation.database)
743
+ else:
744
+ db = parent_db
745
+ if derivation.collection:
746
+ coll = db.get_collection(derivation.collection)
747
+ else:
748
+ coll = self
749
+ coll.class_definition()
750
+ source_obj_iter = coll.find_iter(derivation.where or {})
751
+ mappings = derivation.mappings
752
+ if not mappings:
753
+ raise ValueError(f"No mappings for {self.name}")
754
+ target_class_name = self.target_class_name
755
+ from linkml_map.session import Session
756
+
757
+ session = Session()
758
+ session.set_source_schema(db.schema_view.schema)
759
+ session.set_object_transformer(
760
+ {
761
+ "class_derivations": {
762
+ target_class_name: {
763
+ "populated_from": coll.target_class_name,
764
+ "slot_derivations": mappings,
765
+ },
766
+ }
767
+ },
768
+ )
769
+ logger.debug(f"Session Spec: {session.object_transformer}")
770
+ tr_objs = []
771
+ for source_obj in source_obj_iter:
772
+ tr_obj = session.transform(source_obj, source_type=coll.target_class_name)
773
+ tr_objs.append(tr_obj)
774
+ if not tr_objs:
775
+ raise ValueError(f"No objects derived from {coll.name}")
776
+ self.insert(tr_objs)
777
+ self.commit()
778
+
779
+ def size(self) -> int:
780
+ """
781
+ Return the number of objects in the collection.
782
+
783
+ :return: The number of objects in the collection.
784
+ """
785
+ return self.find({}, limit=1).num_rows
786
+
787
+ def rows_iter(self) -> Iterable[OBJECT]:
788
+ """
789
+ Return an iterator over the objects in the collection.
790
+
791
+ :return:
792
+ """
793
+ yield from self.find({}, limit=-1).rows
794
+
795
+ @property
796
+ def rows(self) -> List[OBJECT]:
797
+ """
798
+ Return a list of objects in the collection.
799
+
800
+ :return:
801
+ """
802
+ return list(self.rows_iter())
803
+
804
+ def ranked_rows(self) -> List[Tuple[float, OBJECT]]:
805
+ """
806
+ Return a list of objects in the collection, with scores.
807
+ """
808
+ return [(n, obj) for n, obj in enumerate(self.rows_iter())]
809
+
810
    def attach_indexer(
        self, index: Union[Indexer, str], name: Optional[str] = None, auto_index: bool = True, **kwargs
    ) -> Indexer:
        """
        Attach an index to the collection.

        As an example, first let's create a collection in a database:

        >>> from linkml_store import Client
        >>> from linkml_store.utils.format_utils import load_objects
        >>> client = Client()
        >>> db = client.attach_database("duckdb")
        >>> collection = db.create_collection("Country")
        >>> objs = load_objects("tests/input/countries/countries.jsonl")
        >>> collection.insert(objs)

        We will create two indexes - one that indexes the whole object
        (default behavior), the other one indexes the name only

        >>> full_index = get_indexer("simple")
        >>> full_index.name = "full"
        >>> name_index = get_indexer("simple", text_template="{name}")
        >>> name_index.name = "name"
        >>> _ = collection.attach_indexer(full_index)
        >>> _ = collection.attach_indexer(name_index)

        Now let's find objects using the full index, using the string "France".
        We expect the country France to be the top hit, but the score will
        be less than 1.0 because we did not match all fields in the object.

        >>> qr = collection.search("France", index_name="full")
        >>> score, top_obj = qr.ranked_rows[0]
        >>> assert score > 0.1
        >>> assert score < 0.5
        >>> top_obj["code"]
        'FR'

        Now using the name index

        >>> qr = collection.search("France", index_name="name")
        >>> score, top_obj = qr.ranked_rows[0]
        >>> assert score > 0.99
        >>> top_obj["code"]
        'FR'

        :param index: an Indexer instance, or the name of an indexer type to instantiate
        :param name: optional name for the index; overrides the indexer's own name
        :param auto_index: Automatically index all objects in the collection
        :param kwargs: passed through to :meth:`index_objects`
        :return: the attached Indexer instance
        """
        # Resolve a string spec (e.g. "simple") to a concrete Indexer
        if isinstance(index, str):
            index = get_indexer(index)
        if name:
            index.name = name
        # Fall back to the lowercased class name when no name was supplied
        if not index.name:
            index.name = type(index).__name__.lower()
        index_name = index.name
        if not index_name:
            raise ValueError("Index must have a name")
        # _indexers may still be None on a fresh collection
        if not self._indexers:
            self._indexers = {}
        self._indexers[index_name] = index
        if auto_index:
            # Eagerly index everything currently in the collection
            all_objs = self.find(limit=-1).rows
            logger.info(f"Auto-indexing {len(all_objs)} objects")
            self.index_objects(all_objs, index_name, replace=True, **kwargs)
        return index
+
879
+ def get_index_collection_name(self, indexer: Indexer) -> str:
880
+ return self._index_collection_name(indexer.name)
881
+
882
+ def _index_collection_name(self, index_name: str) -> str:
883
+ """
884
+ Create a name for a special collection that holds index data
885
+
886
+ :param index_name:
887
+ :param indexer:
888
+ :return:
889
+ """
890
+ return f"internal__index__{self.alias}__{index_name}"
891
+
892
    def index_objects(self, objs: List[OBJECT], index_name: str, replace: bool = False, **kwargs):
        """
        Index a list of objects using a specified index.

        By default, the indexed objects will be stored in a shadow
        collection in the same database, with additional fields for the index vector

        TODO: Support batch_size parameter for processing large collections
        TODO: Implement parallel indexing for multiple objects
        TODO: Add progress reporting for long-running index operations
        TODO: Support incremental indexing (only index new/changed items)

        :param objs: the objects to index
        :param index_name: name of an attached indexer, e.g. simple, llm
        :param replace: if True, clear the shadow collection before inserting
        :param kwargs: passed through to the shadow collection's insert
        :return:
        :raises ValueError: if no indexer with the given name is attached
        """
        ix = self._indexers.get(index_name, None)
        if not ix:
            raise ValueError(f"No index named {index_name}")
        # The shadow collection lives alongside this one in the same database
        ix_coll_name = self._index_collection_name(index_name)
        ix_coll = self.parent.get_collection(ix_coll_name, create_if_not_exists=True)
        if not ix_coll.metadata:
            ix_coll.metadata = CollectionConfig()
        if not ix_coll.metadata.additional_properties:
            ix_coll.metadata.additional_properties = {}
        # Persist the full indexer configuration with the shadow collection
        for k, v in ix.model_dump().items():
            ix_coll.metadata.additional_properties[k] = v
        ix_coll.store_metadata()
        # TODO: Process vectors in batches rather than all at once
        # Coerce each vector element to a plain float for storage
        vectors = [list(float(e) for e in v) for v in ix.objects_to_vectors(objs)]
        objects_with_ix = []
        index_col = ix.index_field
        # TODO: implement this
        index_value_col = ix.index_value_field
        for obj, vector in zip(objs, vectors):
            # TODO: id field
            # Each stored row is the original object plus its vector column
            objects_with_ix.append({**obj, **{index_col: vector}})
        if replace:
            schema = self.parent.schema_view.schema
            logger.info(f"Checking if {ix_coll_name} is in {schema.classes.keys()}")
            # Only wipe the shadow collection if it already exists in the schema
            if ix_coll_name in schema.classes:
                ix_coll.delete_where()

        # TODO: Use bulk insert operations for better performance
        logger.info(f"Inserting {len(objects_with_ix)} objects into {ix_coll_name}")
        ix_coll.insert(objects_with_ix, **kwargs)
        ix_coll.commit()
+
944
+ def list_index_names(self) -> List[str]:
945
+ """
946
+ Return a list of index names
947
+
948
+ :return:
949
+ """
950
+ return list(self._indexers.keys())
951
+
952
+ @property
953
+ def indexers(self) -> Dict[str, Indexer]:
954
+ """
955
+ Return a list of indexers
956
+
957
+ :return:
958
+ """
959
+ return self._indexers if self._indexers else {}
960
+
961
+ def peek(self, limit: Optional[int] = None) -> QueryResult:
962
+ """
963
+ Return the first N objects in the collection
964
+
965
+ :param limit:
966
+ :return:
967
+ """
968
+ q = self._create_query()
969
+ return self.query(q, limit=limit)
970
+
971
    def class_definition(self) -> Optional[ClassDefinition]:
        """
        Return the class definition for the collection.

        If no schema has been explicitly set, and the native database does not
        have a schema, then a schema will be induced from the objects in the collection.

        :return: the class definition, or None if the database has no schema view
        """
        sv: SchemaView = self.parent.schema_view
        if sv:
            cls = sv.get_class(self.target_class_name)
            # A class with no attributes and no induced slots is treated as
            # schemaless: induce attributes from the stored objects in place.
            if cls and not cls.attributes:
                if not sv.class_induced_slots(cls.name):
                    for att in self._induce_attributes():
                        cls.attributes[att.name] = att
                    # Invalidate SchemaView caches after mutating the schema
                    sv.set_modified()
            return cls
        return None
+
994
+ def _induce_attributes(self) -> List[SlotDefinition]:
995
+ result = self.find({}, limit=-1)
996
+ cd = self.induce_class_definition_from_objects(result.rows, max_sample_size=None)
997
+ return list(cd.attributes.values())
998
+
999
+ @property
1000
+ def identifier_attribute_name(self) -> Optional[str]:
1001
+ """
1002
+ Return the name of the identifier attribute for the collection.
1003
+
1004
+ AKA the primary key.
1005
+
1006
+ :return: The name of the identifier attribute, if one exists.
1007
+ """
1008
+ cd = self.class_definition()
1009
+ if cd:
1010
+ for att in self.parent.schema_view.class_induced_slots(cd.name):
1011
+ if att.identifier:
1012
+ return att.name
1013
+ return None
1014
+
1015
+ def set_identifier_attribute_name(self, name: str):
1016
+ """
1017
+ Set the name of the identifier attribute for the collection.
1018
+
1019
+ AKA the primary key.
1020
+
1021
+ :param name: The name of the identifier attribute.
1022
+ """
1023
+ cd = self.class_definition()
1024
+ if not cd:
1025
+ raise ValueError(f"Cannot find class definition for {self.target_class_name}")
1026
+ id_att = None
1027
+ candidates = []
1028
+ sv: SchemaView = self.parent.schema_view
1029
+ cls = sv.get_class(cd.name)
1030
+ existing_id_slot = sv.get_identifier_slot(cls.name)
1031
+ if existing_id_slot:
1032
+ if existing_id_slot.name == name:
1033
+ return
1034
+ existing_id_slot.identifier = False
1035
+ for att in cls.attributes.values():
1036
+ candidates.append(att.name)
1037
+ if att.name == name:
1038
+ att.identifier = True
1039
+ id_att = att
1040
+ else:
1041
+ att.identifier = False
1042
+ if not id_att:
1043
+ raise ValueError(f"No attribute found with name {name} in {candidates}")
1044
+ sv.set_modified()
1045
+
1046
+ def object_identifier(self, obj: OBJECT, auto=True) -> Optional[IDENTIFIER]:
1047
+ """
1048
+ Return the identifier for an object.
1049
+
1050
+ :param obj:
1051
+ :param auto: If True, generate an identifier if one does not exist.
1052
+ :return:
1053
+ """
1054
+ pk = self.identifier_attribute_name
1055
+ if pk in obj:
1056
+ return obj[pk]
1057
+ elif auto:
1058
+ # TODO: use other unique keys if no primary key
1059
+ as_str = str(obj)
1060
+ md5 = hashlib.md5(as_str.encode()).hexdigest()
1061
+ return md5
1062
+ else:
1063
+ return None
1064
+
1065
    def induce_class_definition_from_objects(
        self, objs: List[OBJECT], max_sample_size: Optional[int] = None
    ) -> ClassDefinition:
        """
        Induce a class definition from a list of objects.

        This uses a heuristic procedure to infer the class definition from a list of objects.
        In general it is recommended you explicitly provide a schema.

        The induced class is written back into the parent schema view's schema
        as a side effect.

        :param objs: sample objects (dicts or pydantic models) to inspect
        :param max_sample_size: number of objects to sample; defaults to 10
        :return: the induced class definition
        :raises ValueError: if the collection has no target class name, or if
            two sampled values for a key have irreconcilable types
        """
        # TODO: use schemaview
        if max_sample_size is None:
            max_sample_size = 10
        if not self.target_class_name:
            raise ValueError(f"No target_class_name for {self.alias}")
        cd = ClassDefinition(self.target_class_name)
        # Collect observed values per key across the sample
        keys = defaultdict(list)
        for obj in objs[0:max_sample_size]:
            if isinstance(obj, BaseModel):
                obj = obj.model_dump()
            if not isinstance(obj, dict):
                logger.warning(f"Skipping non-dict object: {obj}")
                continue
            for k, v in obj.items():
                keys[k].append(v)
        for k, vs in keys.items():
            # Skip storage-internal id columns (e.g. MongoDB's _id)
            if k == "_id":
                continue
            multivalueds = []
            inlineds = []
            rngs = []
            exact_dimensions_list = []
            for v in vs:
                if v is None:
                    continue
                # numpy arrays map to a float slot with array dimensions;
                # first array sighting ends the scan for this key
                if isinstance(v, np.ndarray):
                    rngs.append("float")
                    exact_dimensions_list.append(v.shape)
                    break
                if isinstance(v, list):
                    # sample first item. TODO: more robust strategy
                    v = v[0] if v else None
                    multivalueds.append(True)
                elif isinstance(v, dict):
                    pass
                    # TODO: check if this is a nested object or key-value list
                else:
                    multivalueds.append(False)
                if not v:
                    continue
                # NOTE: bool is checked after str but before int matters here —
                # isinstance(True, int) is True, so the bool branch must come
                # before the int branch, as it does.
                if isinstance(v, str):
                    rng = "string"
                elif isinstance(v, bool):
                    rng = "boolean"
                elif isinstance(v, int):
                    rng = "integer"
                elif isinstance(v, float):
                    rng = "float"
                elif isinstance(v, dict):
                    # nested object: no scalar range, mark as inlined
                    rng = None
                    inlineds.append(True)
                else:
                    # unknown type: leave range unset rather than failing
                    rng = None
                    inlineds.append(False)
                rngs.append(rng)
            multivalued = any(multivalueds)
            inlined = any(inlineds)
            if multivalued and False in multivalueds:
                logger.info(f"Mixed list non list: {vs} // inferred= {multivalueds}")
            rng = rngs[0] if rngs else None
            # Reconcile conflicting ranges; only integer<->float is coercible
            for other_rng in rngs:
                coercions = {
                    ("integer", "float"): "float",
                }
                if rng != other_rng:
                    if (rng, other_rng) in coercions:
                        rng = coercions[(rng, other_rng)]
                    elif (other_rng, rng) in coercions:
                        rng = coercions[(other_rng, rng)]
                    else:
                        raise ValueError(f"Conflict: {rng} != {other_rng} for {vs}")
            logger.debug(f"Inducing {k} as {rng} {multivalued} {inlined}")
            inlined_as_list = inlined and multivalued
            cd.attributes[k] = SlotDefinition(
                k, range=rng, multivalued=multivalued, inlined=inlined, inlined_as_list=inlined_as_list
            )
            if exact_dimensions_list:
                array_expr = ArrayExpression(exact_number_dimensions=len(exact_dimensions_list[0]))
                cd.attributes[k].array = array_expr
        # Register the induced class on the shared schema
        sv = self.parent.schema_view
        sv.schema.classes[self.target_class_name] = cd
        sv.set_modified()
        return cd
+
1167
    def import_data(self, location: Union[Path, str, TextIO], **kwargs):
        """
        Import data from a file or stream.

        Not implemented on the base class; storage adapters are expected to
        provide this — TODO confirm which adapters override it.

        :param location: path or open stream to read from
        :param kwargs: adapter-specific options
        :raises NotImplementedError: always, on the base class
        """
        raise NotImplementedError
+
1177
    def export_data(self, location: Union[Path, str, TextIO], **kwargs):
        """
        Export data to a file or stream.

        Not implemented on the base class; storage adapters are expected to
        provide this — TODO confirm which adapters override it.

        :param location: path or open stream to write to
        :param kwargs: adapter-specific options
        :raises NotImplementedError: always, on the base class
        """
        raise NotImplementedError
+
1187
+ def apply_patches(self, patches: List[PatchDict], **kwargs):
1188
+ """
1189
+ Apply a patch to the collection.
1190
+
1191
+ Patches conform to the JSON Patch format.
1192
+
1193
+ :param patches:
1194
+ :param kwargs:
1195
+ :return:
1196
+ """
1197
+ all_objs = self.find(limit=-1).rows
1198
+ primary_key = self.identifier_attribute_name
1199
+ if not primary_key:
1200
+ raise ValueError(f"No primary key for {self.target_class_name}")
1201
+ new_objs = apply_patches_to_list(all_objs, patches, primary_key=primary_key, **kwargs)
1202
+ self.replace(new_objs)
1203
+
1204
+ def diff(self, other: "Collection", **kwargs) -> List[PatchDict]:
1205
+ """
1206
+ Diff two collections.
1207
+
1208
+ :param other: The collection to diff against
1209
+ :param kwargs:
1210
+ :return:
1211
+ """
1212
+ src_objs = self.find(limit=-1).rows
1213
+ tgt_objs = other.find(limit=-1).rows
1214
+ primary_key = self.identifier_attribute_name
1215
+ if not primary_key:
1216
+ raise ValueError(f"No primary key for {self.target_class_name}")
1217
+ patches_from_objects_lists(src_objs, tgt_objs, primary_key=primary_key)
1218
+ return patches_from_objects_lists(src_objs, tgt_objs, primary_key=primary_key)
1219
+
1220
    def iter_validate_collection(
        self, objects: Optional[Iterable[OBJECT]] = None, **kwargs
    ) -> Iterator["ValidationResult"]:
        """
        Validate the contents of the collection.

        Uses a closed-world JSON Schema validator built from the parent
        database's schema.

        :param objects: objects to validate; defaults to every object in the collection
        :param kwargs: passed through to find_iter when objects is None
        :return: iterator over validation results
        :raises ValueError: if no class definition can be found for the collection
        """
        # Imported lazily to avoid a hard dependency at module load time
        from linkml.validator import JsonschemaValidationPlugin, Validator

        validation_plugins = [JsonschemaValidationPlugin(closed=True)]
        validator = Validator(self.parent.schema_view.schema, validation_plugins=validation_plugins)
        cd = self.class_definition()
        if not cd:
            raise ValueError(f"Cannot find class definition for {self.target_class_name}")
        # A type-designator slot lets each object declare its own class
        type_designator = None
        for att in self.parent.schema_view.class_induced_slots(cd.name):
            if att.designates_type:
                type_designator = att.name
        class_name = cd.name
        if objects is None:
            objects = self.find_iter(**kwargs)
        for obj in objects:
            obj = clean_empties(obj)
            v_class_name = class_name
            if type_designator is not None:
                # TODO: move type designator logic to core linkml
                this_class_name = obj.get(type_designator)
                if this_class_name:
                    # Strip any CURIE prefix, e.g. "schema:Person" -> "Person"
                    if ":" in this_class_name:
                        this_class_name = this_class_name.split(":")[-1]
                    v_class_name = this_class_name
            yield from validator.iter_results(obj, v_class_name)
+
1256
    def commit(self):
        """
        Commit changes to the collection.

        No-op on the base class; backends that buffer writes are presumably
        expected to override this — TODO confirm.

        :return:
        """
        pass
+
1264
    def _broadcast(self, *args, **kwargs):
        # Forward an event to the parent database's broadcast mechanism,
        # passing this collection as the source.
        self.parent.broadcast(self, *args, **kwargs)
+
1267
+ def store_metadata(self, replace=True):
1268
+ """
1269
+ Store the metadata for the collection.
1270
+ """
1271
+ if not self.metadata:
1272
+ return
1273
+ this_collection_name = self.alias
1274
+ metadata_collection_name = f"{this_collection_name}__metadata"
1275
+ metadata_collection = self.parent.get_collection(metadata_collection_name, create_if_not_exists=True)
1276
+ metadata_dict = self.metadata.model_dump()
1277
+ if replace:
1278
+ metadata_collection.replace(metadata_dict)
1279
+ else:
1280
+ metadata_collection.insert(metadata_dict)