linkml-store 0.1.6__tar.gz → 0.1.8__tar.gz

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Potentially problematic release: this version of linkml-store has been flagged as possibly problematic.

Files changed (49)
  1. {linkml_store-0.1.6 → linkml_store-0.1.8}/PKG-INFO +4 -1
  2. {linkml_store-0.1.6 → linkml_store-0.1.8}/README.md +2 -0
  3. {linkml_store-0.1.6 → linkml_store-0.1.8}/pyproject.toml +4 -1
  4. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/client.py +32 -3
  5. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/collection.py +231 -30
  6. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/config.py +10 -2
  7. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/database.py +305 -19
  8. linkml_store-0.1.8/src/linkml_store/api/stores/chromadb/__init__.py +7 -0
  9. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +8 -1
  10. linkml_store-0.1.8/src/linkml_store/api/stores/duckdb/__init__.py +16 -0
  11. linkml_store-0.1.8/src/linkml_store/api/stores/duckdb/duckdb_collection.py +144 -0
  12. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/stores/duckdb/duckdb_database.py +22 -8
  13. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/stores/duckdb/mappings.py +1 -0
  14. linkml_store-0.1.8/src/linkml_store/api/stores/filesystem/__init__.py +16 -0
  15. linkml_store-0.1.6/src/linkml_store/api/stores/duckdb/duckdb_collection.py → linkml_store-0.1.8/src/linkml_store/api/stores/filesystem/filesystem_collection.py +10 -10
  16. linkml_store-0.1.8/src/linkml_store/api/stores/filesystem/filesystem_database.py +36 -0
  17. linkml_store-0.1.8/src/linkml_store/api/stores/hdf5/__init__.py +7 -0
  18. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +1 -1
  19. linkml_store-0.1.8/src/linkml_store/api/stores/mongodb/__init__.py +25 -0
  20. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +29 -8
  21. linkml_store-0.1.8/src/linkml_store/api/stores/solr/__init__.py +3 -0
  22. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/stores/solr/solr_collection.py +2 -1
  23. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/stores/solr/solr_database.py +1 -0
  24. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/cli.py +64 -10
  25. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/index/__init__.py +6 -2
  26. linkml_store-0.1.8/src/linkml_store/index/implementations/llm_indexer.py +122 -0
  27. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/index/implementations/simple_indexer.py +2 -2
  28. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/index/indexer.py +32 -8
  29. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/utils/format_utils.py +52 -2
  30. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/utils/object_utils.py +9 -1
  31. linkml_store-0.1.6/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
  32. linkml_store-0.1.6/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
  33. linkml_store-0.1.6/src/linkml_store/index/implementations/__init__.py +0 -0
  34. linkml_store-0.1.6/src/linkml_store/index/implementations/llm_indexer.py +0 -44
  35. linkml_store-0.1.6/src/linkml_store/utils/__init__.py +0 -0
  36. {linkml_store-0.1.6 → linkml_store-0.1.8}/LICENSE +0 -0
  37. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/__init__.py +0 -0
  38. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/__init__.py +0 -0
  39. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/queries.py +0 -0
  40. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/stores/__init__.py +0 -0
  41. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
  42. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
  43. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/stores/mongodb/mongodb_database.py +0 -0
  44. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
  45. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/constants.py +0 -0
  46. {linkml_store-0.1.6/src/linkml_store/api/stores/chromadb → linkml_store-0.1.8/src/linkml_store/index/implementations}/__init__.py +0 -0
  47. {linkml_store-0.1.6/src/linkml_store/api/stores/duckdb → linkml_store-0.1.8/src/linkml_store/utils}/__init__.py +0 -0
  48. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/utils/io.py +0 -0
  49. {linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/utils/sql_utils.py +0 -0
{linkml_store-0.1.6 → linkml_store-0.1.8}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: linkml-store
-Version: 0.1.6
+Version: 0.1.8
 Summary: linkml-store
 License: MIT
 Author: Author 1
@@ -27,6 +27,7 @@ Requires-Dist: click
 Requires-Dist: duckdb (>=0.10.1,<0.11.0)
 Requires-Dist: duckdb-engine (>=0.11.2)
 Requires-Dist: h5py ; extra == "h5py"
+Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: linkml ; extra == "validation"
 Requires-Dist: linkml-runtime (>=1.7.5,<2.0.0)
 Requires-Dist: linkml_map ; extra == "map"
@@ -53,3 +54,5 @@ There is also experimental support for vector-based indexing using OpenAI test e
 The goals of this project are to provide high level access to data stored in heterogeneous databases,
 with optional schema management using LinkML.

+See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for more details
+
{linkml_store-0.1.6 → linkml_store-0.1.8}/README.md

@@ -8,3 +8,5 @@ There is also experimental support for vector-based indexing using OpenAI test e

 The goals of this project are to provide high level access to data stored in heterogeneous databases,
 with optional schema management using LinkML.
+
+See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for more details
{linkml_store-0.1.6 → linkml_store-0.1.8}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "linkml-store"
-version = "0.1.6"
+version = "0.1.8"
 description = "linkml-store"
 authors = ["Author 1 <author@org.org>"]
 license = "MIT"
@@ -27,6 +27,7 @@ h5py = { version="*", optional = true }
 linkml = { version="*", optional = true }
 linkml_map = { version="*", optional = true }
 pandas = ">=2.2.1"
+jinja2 = "^3.1.4"

 [tool.poetry.group.dev.dependencies]
 pytest = {version = ">=7.1.2"}
@@ -36,10 +37,12 @@ sphinx = {version = ">=6.1.3"}
 sphinx-rtd-theme = {version = ">=1.0.0"}
 sphinx-autodoc-typehints = {version = "<2.0.0"}
 sphinx-click = {version = ">=4.3.0"}
+sphinx-automodapi = "*"
 myst-parser = {version = ">=0.18.1"}
 furo = {version = "*"}
 nbsphinx = "*"
 jupyter = "*"
+jupysql = "*"

 [tool.poetry.group.tests.dependencies]
 pytest = "^7.4.0"
{linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/client.py

@@ -1,3 +1,4 @@
+import logging
 from pathlib import Path
 from typing import Dict, Optional, Union

@@ -11,6 +12,9 @@ from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
 from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
 from linkml_store.api.stores.solr.solr_database import SolrDatabase

+logger = logging.getLogger(__name__)
+
+
 HANDLE_MAP = {
     "duckdb": DuckDBDatabase,
     "solr": SolrDatabase,
@@ -21,12 +25,29 @@ HANDLE_MAP = {

 class Client:
     """
-    A client provides access to named collections.
+    A client is the top-level object for interacting with databases.
+
+    * A client has access to one or more :class:`.Database` objects.
+    * Each database consists of a number of :class:`.Collection` objects.

-    Examples
-    --------
+    Creating a client
+    -----------------
     >>> client = Client()
+
+    Attaching a database
+    --------------------
     >>> db = client.attach_database("duckdb", alias="test")
+
+    Note that normally a handle would be specified by a locator such as ``duckdb:///<PATH>``, but
+    for convenience, an in-memory duckdb object can be specified without a full locator
+
+    We can check the actual handle:
+
+    >>> db.handle
+    'duckdb:///:memory:'
+
+    Creating a new collection
+    -------------------------
     >>> collection = db.create_collection("Person")
     >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
     >>> collection.insert(objs)
@@ -147,6 +168,8 @@ class Client:
         if ":" not in handle:
             scheme = handle
             handle = None
+            if alias is None:
+                alias = scheme
         else:
             scheme, _ = handle.split(":", 1)
         if scheme not in HANDLE_MAP:
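The alias-defaulting branch added above means a bare scheme such as ``duckdb`` now also serves as the database alias when none is supplied. A minimal sketch of the expected behaviour (not taken from the package's own tests, and assuming the database exposes the alias it was registered under as ``db.alias``, as the consistency check in the next hunk implies):

>>> from linkml_store import Client
>>> client = Client()
>>> db = client.attach_database("duckdb")  # bare scheme, no alias, no full locator
>>> db.handle
'duckdb:///:memory:'
>>> db.alias
'duckdb'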
@@ -161,6 +184,11 @@ class Client:
            self._databases = {}
        self._databases[alias] = db
        db.parent = self
+       if db.alias:
+           if db.alias != alias:
+               raise AssertionError(f"Inconsistent alias: {db.alias} != {alias}")
+       else:
+           db.metadata.alias = alias
        return db

    def get_database(self, name: Optional[str] = None, create_if_not_exists=True, **kwargs) -> Database:
@@ -191,6 +219,7 @@ class Client:
            self._databases = {}
        if name not in self._databases:
            if create_if_not_exists:
+               logger.info(f"Creating database: {name}")
                self.attach_database(name, **kwargs)
            else:
                raise ValueError(f"Database {name} does not exist")
{linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/collection.py

@@ -1,3 +1,5 @@
+"""A structure for representing collections of similar objects."""
+
 import hashlib
 import logging
 from collections import defaultdict
@@ -10,6 +12,8 @@ from linkml_runtime.linkml_model.meta import ArrayExpression
 from pydantic import BaseModel

 from linkml_store.index import get_indexer
+from linkml_store.utils.format_utils import load_objects
+from linkml_store.utils.object_utils import clean_empties

 try:
     from linkml.validator.report import ValidationResult
@@ -38,7 +42,17 @@ class Collection:

     - For relational databases, a collection is typically a table
     - For document databases such as MongoDB, a collection is the native type
-    - For a file system, a collection could be a single tabular file such as Parquet or CSV
+    - For a file system, a collection could be a single tabular file such as Parquet or CSV.
+
+    Collection objects are typically not created directly - instead they are generated
+    from a parent :class:`.Database` object:
+
+    >>> from linkml_store import Client
+    >>> client = Client()
+    >>> db = client.attach_database("duckdb", alias="test")
+    >>> collection = db.create_collection("Person")
+    >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+    >>> collection.insert(objs)
     """

     # name: str
@@ -56,25 +70,51 @@
             self.metadata = metadata
         else:
             self.metadata = CollectionConfig(name=name, **kwargs)
-        if name is not None and self.metadata.name is not None and name != self.metadata.name:
-            raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")
+        if not self.metadata.alias:
+            self.metadata.alias = name
+        if not self.metadata.type:
+            self.metadata.type = name
+        # if name is not None and self.metadata.name is not None and name != self.metadata.name:
+        #     raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")

     @property
     def name(self) -> str:
+        """
+        Return the name of the collection.
+
+        TODO: deprecate in favor of Type
+
+        :return: name of the collection
+        """
         return self.metadata.name

     @property
     def hidden(self) -> bool:
-        return self.metadata.hidden
+        """
+        True if the collection is hidden.
+
+        An example of a hidden collection is a collection that indexes another
+        collection
+
+        :return: True if the collection is hidden
+        """
+        # return self.metadata.hidden

     @property
-    def _target_class_name(self):
+    def target_class_name(self):
         """
         Return the name of the class that this collection represents

         This MUST be a LinkML class name

-        :return:
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person", alias="persons")
+        >>> collection.target_class_name
+        'Person'
+
+        :return: name of the class which members of this collection instantiate
         """
         # TODO: this is a shim layer until we can normalize on this
         if self.metadata.type:
@@ -82,7 +122,7 @@
         return self.name

     @property
-    def _alias(self):
+    def alias(self):
         """
         Return the primary name/alias used for the collection.

@@ -90,15 +130,34 @@
         to have an alias, for example "persons" which collects all instances
         of class Person.

-        The _alias SHOULD be used for Table names in SQL.
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person", alias="persons")
+        >>> collection.alias
+        'persons'
+
+        If no explicit alias is provided, then the target class name is used:
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person")
+        >>> collection.alias
+        'Person'
+
+        The alias SHOULD be used for Table names in SQL.

         For nested data, the alias SHOULD be used as the key; e.g

-        ``{ "persons": [ { "name": "Alice" }, { "name": "Bob" } ] }``
+        .. code-block:: json
+
+          { "persons": [ { "name": "Alice" }, { "name": "Bob" } ] }

         :return:
         """
         # TODO: this is a shim layer until we can normalize on this
+        # TODO: this is a shim layer until we can normalize on this
         if self.metadata.alias:
             return self.metadata.alias
         return self.name
@@ -107,6 +166,13 @@
         """
         Replace entire collection with objects.

+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person")
+        >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+        >>> collection.insert(objs)
+
         :param objs:
         :param kwargs:
         :return:
@@ -116,7 +182,14 @@

     def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         """
-        Add one or more objects to the collection
+        Add one or more objects to the collection.
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person")
+        >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+        >>> collection.insert(objs)

         :param objs:
         :param kwargs:
@@ -124,9 +197,32 @@
         """
         raise NotImplementedError

-    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
+    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
         """
-        Delete one or more objects from the collection
+        Delete one or more objects from the collection.
+
+        First let's set up a collection:
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person")
+        >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+        >>> collection.insert(objs)
+        >>> collection.find({}).num_rows
+        2
+
+        Now let's delete an object:
+
+        >>> collection.delete(objs[0])
+        >>> collection.find({}).num_rows
+        1
+
+        Deleting the same object again should have no effect:
+
+        >>> collection.delete(objs[0])
+        >>> collection.find({}).num_rows
+        1

         :param objs:
         :param kwargs:
@@ -134,9 +230,30 @@
         """
         raise NotImplementedError

-    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
+    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
         """
-        Delete objects that match a query
+        Delete objects that match a query.
+
+        First let's set up a collection:
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person")
+        >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+        >>> collection.insert(objs)
+
+        Now let's delete an object:
+
+        >>> collection.delete_where({"id": "P1"})
+        >>> collection.find({}).num_rows
+        1
+
+        Match everything:
+
+        >>> collection.delete_where({})
+        >>> collection.find({}).num_rows
+        0

         :param where: where conditions
         :param missing_ok: if True, do not raise an error if the collection does not exist
@@ -147,7 +264,7 @@

     def update(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         """
-        Update one or more objects in the collection
+        Update one or more objects in the collection.

         :param objs:
         :param kwargs:
@@ -156,11 +273,25 @@
         raise NotImplementedError

     def _create_query(self, **kwargs) -> Query:
-        return Query(from_table=self._alias, **kwargs)
+        return Query(from_table=self.alias, **kwargs)

     def query(self, query: Query, **kwargs) -> QueryResult:
         """
-        Run a query against the collection
+        Run a query against the collection.
+
+        First let's load a collection:
+
+        >>> from linkml_store import Client
+        >>> from linkml_store.utils.format_utils import load_objects
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb")
+        >>> collection = db.create_collection("Country")
+        >>> objs = load_objects("tests/input/countries/countries.jsonl")
+        >>> collection.insert(objs)
+
+        Now let's run a query:
+
+        TODO

         :param query:
         :param kwargs:
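The ``query()`` doctest above is still left as a TODO in the released code. Based only on the ``Query`` fields that appear elsewhere in this diff (``from_table`` in ``_create_query`` and the ``where_clause`` passed by ``find``), a direct query against the countries collection loaded above might look roughly like this; the import path ``linkml_store.api.queries`` is assumed from the file list and is not shown in this diff:

>>> from linkml_store.api.queries import Query
>>> q = Query(from_table=collection.alias, where_clause={"code": "FR"})
>>> qr = collection.query(q)
>>> qr.num_rows
1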
@@ -193,7 +324,7 @@
         """
         raise NotImplementedError

-    def get(self, ids: Optional[IDENTIFIER], **kwargs) -> QueryResult:
+    def get(self, ids: Optional[List[IDENTIFIER]], **kwargs) -> QueryResult:
         """
         Get one or more objects by ID.

@@ -201,14 +332,60 @@
         :param kwargs:
         :return:
         """
-        id_field = self.identifier_field
-        q = self._create_query(where_clause={id_field: ids})
-        return self.query(q, **kwargs)
+        # TODO
+        id_field = self.identifier_attribute_name
+        if not id_field:
+            raise ValueError(f"No identifier for {self.name}")
+        return self.find({id_field: ids})
+
+    def get_one(self, id: IDENTIFIER, **kwargs) -> Optional[OBJECT]:
+        """
+        Get one object by ID.
+
+        :param id:
+        :param kwargs:
+        :return:
+        """
+        if not id:
+            raise ValueError("Must pass an ID")
+        id_field = self.identifier_attribute_name
+        if not id_field:
+            raise ValueError(f"No identifier for {self.name}")
+        w = {id_field: id}
+        qr = self.find(w)
+        if qr.num_rows == 1:
+            return qr.rows[0]
+        return None

     def find(self, where: Optional[Any] = None, **kwargs) -> QueryResult:
         """
         Find objects in the collection using a where query.

+        As an example, first load a collection:
+
+        >>> from linkml_store import Client
+        >>> from linkml_store.utils.format_utils import load_objects
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb")
+        >>> collection = db.create_collection("Country")
+        >>> objs = load_objects("tests/input/countries/countries.jsonl")
+        >>> collection.insert(objs)
+
+        Now let's find all objects:
+
+        >>> qr = collection.find({})
+        >>> qr.num_rows
+        20
+
+        We can do a more restrictive query:
+
+        >>> qr = collection.find({"code": "FR"})
+        >>> qr.num_rows
+        1
+        >>> qr.rows[0]["name"]
+        'France'
+
+
         :param where:
         :param kwargs:
         :return:
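A sketch of how the new ``get_one`` method could be used, assuming the attached schema declares ``id`` as the identifier slot for Person (otherwise ``identifier_attribute_name`` returns None and a ValueError is raised):

>>> from linkml_store import Client
>>> client = Client()
>>> db = client.attach_database("duckdb", alias="test")
>>> persons = db.create_collection("Person")
>>> persons.insert([{"id": "P1", "name": "John", "age_in_years": 30}])
>>> person = persons.get_one("P1")  # returns the matching dict, or None if absent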
@@ -216,6 +393,18 @@
         query = self._create_query(where_clause=where)
         return self.query(query, **kwargs)

+    def find_iter(self, where: Optional[Any] = None, **kwargs) -> Iterator[OBJECT]:
+        """
+        Find objects in the collection using a where query.
+
+        :param where:
+        :param kwargs:
+        :return:
+        """
+        qr = self.find(where=where, limit=-1, **kwargs)
+        for row in qr.rows:
+            yield row
+
     def search(
         self,
         query: str,
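``find_iter`` is a thin wrapper that calls ``find`` with ``limit=-1`` and yields rows one at a time, which reads naturally in a loop. A small usage sketch, reusing the countries test data from the doctests above:

>>> from linkml_store import Client
>>> from linkml_store.utils.format_utils import load_objects
>>> client = Client()
>>> db = client.attach_database("duckdb")
>>> collection = db.create_collection("Country")
>>> collection.insert(load_objects("tests/input/countries/countries.jsonl"))
>>> for row in collection.find_iter({"code": "FR"}):
...     print(row["name"])
France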
@@ -245,6 +434,7 @@
             raise ValueError(f"No index named {index_name}")
         qr = ix_coll.find(where=where, limit=-1, **kwargs)
         index_col = ix.index_field
+        # TODO: optimize this for large indexes
         vector_pairs = [(row, np.array(row[index_col], dtype=float)) for row in qr.rows]
         results = ix.search(query, vector_pairs, limit=limit)
         for r in results:
@@ -260,11 +450,15 @@

         :return:
         """
-        if not self.name:
-            raise ValueError(f"Collection has no name: {self} // {self.metadata}")
-        return self.name.startswith("internal__")
+        if not self.alias:
+            raise ValueError(f"Collection has no alias: {self} // {self.metadata}")
+        return self.alias.startswith("internal__")

-    def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = True, auto_index=True, **kwargs):
+    def load_from_source(self):
+        objects = load_objects(self.metadata.source_location)
+        self.insert(objects)
+
+    def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs):
         """
         Attach an index to the collection.

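``load_from_source`` pairs with the new ``source_location`` field on CollectionConfig (see the config.py hunks below): it loads whatever ``load_objects`` can read from that location and inserts the result. A hedged sketch, assuming the collection's metadata can simply be mutated in place:

>>> from linkml_store import Client
>>> client = Client()
>>> db = client.attach_database("duckdb")
>>> collection = db.create_collection("Country")
>>> collection.metadata.source_location = "tests/input/countries/countries.jsonl"
>>> collection.load_from_source()
>>> collection.find({}).num_rows
20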
@@ -288,6 +482,7 @@
         self._indexers[index_name] = index
         if auto_index:
             all_objs = self.find(limit=-1).rows
+            logger.info(f"Auto-indexing {len(all_objs)} objects")
             self.index_objects(all_objs, index_name, replace=True, **kwargs)

     def _index_collection_name(self, index_name: str) -> str:
@@ -295,6 +490,7 @@
         Create a name for a special collection that holds index data

         :param index_name:
+        :param indexer:
         :return:
         """
         return f"internal__index__{self.name}__{index_name}"
@@ -325,6 +521,7 @@
         logger.info(f"Checking if {ix_coll_name} is in {schema.classes.keys()}")
         if ix_coll_name in schema.classes:
             ix_coll.delete_where()
+
         ix_coll.insert(objects_with_ix, **kwargs)

     def list_index_names(self) -> List[str]:
@@ -362,10 +559,11 @@
         """
         sv = self.parent.schema_view
         if sv:
-            cls = sv.get_class(self._target_class_name)
+            cls = sv.get_class(self.target_class_name)
             return cls
         return None

+    @property
     def identifier_attribute_name(self) -> Optional[str]:
         """
         Return the name of the identifier attribute for the collection.
@@ -376,7 +574,7 @@
         """
         cd = self.class_definition()
         if cd:
-            for att in cd.attributes.values():
+            for att in self.parent.schema_view.class_induced_slots(cd.name):
                 if att.identifier:
                     return att.name
         return None
@@ -411,7 +609,9 @@
         :param max_sample_size:
         :return:
         """
-        cd = ClassDefinition(self._target_class_name)
+        if not self.target_class_name:
+            raise ValueError(f"No target_class_name for {self.alias}")
+        cd = ClassDefinition(self.target_class_name)
         keys = defaultdict(list)
         for obj in objs[0:max_sample_size]:
             if isinstance(obj, BaseModel):
@@ -474,7 +674,7 @@
             array_expr = ArrayExpression(exact_number_dimensions=len(exact_dimensions_list[0]))
             cd.attributes[k].array = array_expr
         sv = self.parent.schema_view
-        sv.schema.classes[self._target_class_name] = cd
+        sv.schema.classes[self.target_class_name] = cd
         sv.set_modified()
         return cd

@@ -511,8 +711,9 @@
         validator = Validator(self.parent.schema_view.schema, validation_plugins=validation_plugins)
         cd = self.class_definition()
         if not cd:
-            raise ValueError(f"Cannot find class definition for {self._target_class_name}")
+            raise ValueError(f"Cannot find class definition for {self.target_class_name}")
         class_name = cd.name
         result = self.find(**kwargs)
         for obj in result.rows:
+            obj = clean_empties(obj)
             yield from validator.iter_results(obj, class_name)
{linkml_store-0.1.6 → linkml_store-0.1.8}/src/linkml_store/api/config.py

@@ -16,7 +16,7 @@ class CollectionConfig(BaseModel):
         default=None,
         description="The type of object in the collection. TODO; use this instead of name",
     )
-    metadata: Optional[Dict] = Field(
+    additional_properties: Optional[Dict] = Field(
         default=None,
         description="Optional metadata for the collection",
     )
@@ -36,6 +36,10 @@
         default=False,
         description="Whether the collection is prepopulated",
     )
+    source_location: Optional[str] = Field(
+        default=None,
+        description="Filesystem or remote URL that stores the data",
+    )


 class DatabaseConfig(BaseModel):
@@ -55,7 +59,7 @@
         default=None,
         description="The LinkML schema as a dictionary",
     )
-    collections: Dict[str, CollectionConfig] = Field(
+    collections: Optional[Dict[str, CollectionConfig]] = Field(
         default={},
         description="A dictionary of collection configurations",
     )
@@ -76,6 +80,10 @@
         default=None,
         description="Optional configuration for search fields",
     )
+    ensure_referential_integrity: bool = Field(
+        default=False,
+        description="Whether to ensure referential integrity",
+    )


 class ClientConfig(BaseModel):
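A minimal sketch of populating the new configuration fields programmatically, using only the fields visible in this diff and assuming the remaining DatabaseConfig fields are optional or defaulted; the ``data/persons.jsonl`` path is a placeholder:

>>> from linkml_store.api.config import CollectionConfig, DatabaseConfig
>>> persons = CollectionConfig(name="persons", type="Person",
...                            source_location="data/persons.jsonl")
>>> db_config = DatabaseConfig(collections={"persons": persons},
...                            ensure_referential_integrity=True)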