linkml-store 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of linkml-store might be problematic. Click here for more details.

@@ -3,7 +3,9 @@ from abc import ABC
3
3
  from collections import defaultdict
4
4
  from copy import copy
5
5
  from pathlib import Path
6
- from typing import TYPE_CHECKING, ClassVar, Dict, Iterator, Optional, Sequence, Type, Union
6
+ from typing import TYPE_CHECKING, Any, ClassVar, Dict, Iterator, Optional, Sequence, Type, Union
7
+
8
+ from linkml_store.utils.format_utils import load_objects, render_output
7
9
 
8
10
  try:
9
11
  from linkml.validator.report import Severity, ValidationResult
@@ -27,13 +29,33 @@ class Database(ABC):
27
29
  """
28
30
  A Database provides access to named collections of data.
29
31
 
30
- Examples
31
- --------
32
+ A database object is owned by a :ref:`Client`. The database
33
+ object uses a :ref:`handle` to know what kind of external
34
+ database system to connect to (e.g. duckdb, mongodb). The handle
35
+ is a string ``<DatabaseType>:<LocalLocator>``
36
+
37
+ The
38
+ database object may also have an :ref:`alias` that is mapped
39
+ to the handle.
40
+
41
+ Attaching a database
42
+ --------------------
32
43
  >>> from linkml_store.api.client import Client
33
44
  >>> client = Client()
34
- >>> db = client.attach_database("duckdb", alias="test")
45
+ >>> db = client.attach_database("duckdb:///:memory:", alias="test")
46
+
47
+ We can check the value of the handle:
48
+
35
49
  >>> db.handle
36
50
  'duckdb:///:memory:'
51
+
52
+ The alias can be used to retrieve the database object from the client
53
+
54
+ >>> assert db == client.get_database("test")
55
+
56
+ Creating a collection
57
+ ---------------------
58
+
37
59
  >>> collection = db.create_collection("Person")
38
60
  >>> len(db.list_collections())
39
61
  1
@@ -57,6 +79,11 @@ class Database(ABC):
57
79
  """
58
80
 
59
81
  _schema_view: Optional[SchemaView] = None
82
+ """Schema for the database. May be transformed."""
83
+
84
+ _original_schema_view: Optional[SchemaView] = None
85
+ """If a schema must be transformed, then the original is stored here."""
86
+
60
87
  _collections: Optional[Dict[str, Collection]] = None
61
88
  parent: Optional["Client"] = None
62
89
  metadata: Optional[DatabaseConfig] = None
@@ -101,6 +128,8 @@ class Database(ABC):
101
128
  return self
102
129
 
103
130
  def _initialize_collections(self):
131
+ if not self.metadata.collections:
132
+ return
104
133
  for name, collection_config in self.metadata.collections.items():
105
134
  alias = collection_config.alias
106
135
  typ = collection_config.type
@@ -127,15 +156,46 @@ class Database(ABC):
127
156
 
128
157
  @property
129
158
  def recreate_if_exists(self) -> bool:
159
+ """
160
+ Return whether to recreate the database if it already exists.
161
+
162
+ :return:
163
+ """
130
164
  return self.metadata.recreate_if_exists
131
165
 
132
166
  @property
133
167
  def handle(self) -> str:
168
+ """
169
+ Return the database handle.
170
+
171
+ Examples:
172
+
173
+ - ``duckdb:///:memory:``
174
+ - ``duckdb:///tmp/test.db``
175
+ - ``mongodb://localhost:27017/``
176
+
177
+ :return:
178
+ """
134
179
  return self.metadata.handle
135
180
 
136
- def store(self, obj: Dict[str, str], **kwargs):
181
+ @property
182
+ def alias(self):
183
+ return self.metadata.alias
184
+
185
+ def store(self, obj: Dict[str, Any], **kwargs):
137
186
  """
138
- Store an object in the database
187
+ Store an object in the database.
188
+
189
+ The object is assumed to be a Dictionary of Collections.
190
+
191
+ >>> from linkml_store.api.client import Client
192
+ >>> client = Client()
193
+ >>> db = client.attach_database("duckdb", alias="test")
194
+ >>> db.store({"persons": [{"id": "P1", "name": "John", "age_in_years": 30}]})
195
+ >>> collection = db.get_collection("persons")
196
+ >>> qr = collection.find()
197
+ >>> qr.num_rows
198
+ 1
139
199
 
140
200
  :param obj: object to store
141
201
  :param kwargs: additional arguments
@@ -144,6 +204,7 @@ class Database(ABC):
144
204
  roots = [c for c in sv.all_classes().values() if c.tree_root]
145
205
  root = roots[0] if roots else None
146
206
  for k, v in obj.items():
207
+ logger.info(f"Storing collection {k}")
147
208
  if root:
148
209
  slot = sv.induced_slot(k, root.name)
149
210
  if not slot:
@@ -158,20 +219,28 @@ class Database(ABC):
158
219
  if not v:
159
220
  continue
160
221
  if slot:
161
- collection = self.get_collection(slot.range, create_if_not_exists=True)
222
+ logger.debug(f"Aligning to existing slot: {slot.name} range={slot.range}")
223
+ collection = self.get_collection(slot.name, type=slot.range, create_if_not_exists=True)
162
224
  else:
163
225
  collection = self.get_collection(k, create_if_not_exists=True)
226
+ logger.debug(f"Replacing using {collection.alias} {collection.target_class_name}")
164
227
  collection.replace(v)
165
228
 
166
229
  def commit(self, **kwargs):
167
230
  """
168
- Commit any pending changes to the database
231
+ Commit pending changes to the database.
232
+
233
+ :param kwargs:
234
+ :return:
169
235
  """
170
236
  raise NotImplementedError()
171
237
 
172
238
  def close(self, **kwargs):
173
239
  """
174
- Close the database and all connection objects
240
+ Close the database.
241
+
242
+ :param kwargs:
243
+ :return:
175
244
  """
176
245
  raise NotImplementedError()
177
246
 
@@ -188,15 +257,27 @@ class Database(ABC):
188
257
  **kwargs,
189
258
  ) -> Collection:
190
259
  """
191
- Create a new collection
260
+ Create a new collection in the current database.
261
+
262
+ The collection must have a *Type*, and may have an *Alias*.
263
+
264
+ Examples:
192
265
 
193
266
  >>> from linkml_store.api.client import Client
194
267
  >>> client = Client()
195
268
  >>> db = client.attach_database("duckdb", alias="test")
196
- >>> collection = db.create_collection("Person")
197
- >>> collection.name
269
+ >>> collection = db.create_collection("Person", alias="persons")
270
+ >>> collection.alias
271
+ 'persons'
272
+ >>> collection.target_class_name
198
273
  'Person'
199
274
 
275
+ If alias is not provided, it defaults to the name of the type.
276
+
277
+ >>> collection = db.create_collection("Organization")
278
+ >>> collection.alias
279
+ 'Organization'
280
+
200
281
  :param name: name of the collection
201
282
  :param alias: alias for the collection
202
283
  :param metadata: metadata for the collection
@@ -207,6 +288,8 @@ class Database(ABC):
207
288
  raise ValueError(f"Collection name must be provided: alias: {alias} metadata: {metadata}")
208
289
  collection_cls = self.collection_class
209
290
  collection = collection_cls(name=name, alias=alias, parent=self, metadata=metadata)
291
+ if metadata and metadata.source_location:
292
+ collection.load_from_source()
210
293
  if metadata and metadata.attributes:
211
294
  sv = self.schema_view
212
295
  schema = sv.schema
@@ -265,7 +348,9 @@ class Database(ABC):
265
348
  """
266
349
  return [c.name for c in self.list_collections(**kwargs)]
267
350
 
268
- def get_collection(self, name: str, create_if_not_exists=True, **kwargs) -> "Collection":
351
+ def get_collection(
352
+ self, name: str, type: Optional[str] = None, create_if_not_exists=True, **kwargs
353
+ ) -> "Collection":
269
354
  """
270
355
  Get a named collection.
271
356
 
@@ -283,14 +368,19 @@ class Database(ABC):
283
368
  KeyError: 'Collection NonExistent does not exist'
284
369
 
285
370
  :param name: name of the collection
371
+ :param type: target class name
286
372
  :param create_if_not_exists: create the collection if it does not exist
287
373
 
288
374
  """
289
375
  if not self._collections:
376
+ logger.debug("Initializing collections")
290
377
  self.init_collections()
291
378
  if name not in self._collections.keys():
292
379
  if create_if_not_exists:
293
- self._collections[name] = self.create_collection(name)
380
+ if type is None:
381
+ type = name
382
+ logger.debug(f"Creating new collection: {name} kwargs: {kwargs}")
383
+ self._collections[name] = self.create_collection(type, alias=name, **kwargs)
294
384
  else:
295
385
  raise KeyError(f"Collection {name} does not exist")
296
386
  return self._collections[name]
@@ -333,7 +423,29 @@ class Database(ABC):
333
423
  @property
334
424
  def schema_view(self) -> SchemaView:
335
425
  """
336
- Return a schema view for the named collection
426
+ Return a schema view for the named collection.
427
+
428
+ If no explicit schema is provided, this will generalize one
429
+
430
+ Induced schema example:
431
+
432
+ >>> from linkml_store.api.client import Client
433
+ >>> client = Client()
434
+ >>> db = client.attach_database("duckdb", alias="test")
435
+ >>> collection = db.create_collection("Person", alias="persons")
436
+ >>> collection.insert([{"id": "P1", "name": "John", "age_in_years": 25}])
437
+ >>> schema_view = db.schema_view
438
+ >>> cd = schema_view.get_class("Person")
439
+ >>> cd.attributes["id"].range
440
+ 'string'
441
+ >>> cd.attributes["age_in_years"].range
442
+ 'integer'
443
+
444
+ We can reuse the same class:
445
+
446
+ >>> collection2 = db.create_collection("Person", alias="other_persons")
447
+ >>> collection2.class_definition().attributes["age_in_years"].range
448
+ 'integer'
337
449
  """
338
450
  if not self._schema_view:
339
451
  self._initialize_schema()
@@ -345,6 +457,26 @@ class Database(ABC):
345
457
  """
346
458
  Set the schema view for the database.
347
459
 
460
+ >>> from linkml_store.api.client import Client
461
+ >>> client = Client()
462
+ >>> db = client.attach_database("duckdb", alias="test")
463
+ >>> sv = SchemaView("tests/input/countries/countries.linkml.yaml")
464
+ >>> db.set_schema_view(sv)
465
+ >>> cd = db.schema_view.schema.classes["Country"]
466
+ >>> sorted(cd.slots)
467
+ ['capital', 'code', 'continent', 'languages', 'name']
468
+ >>> induced_slots = {s.name: s for s in sv.class_induced_slots("Country")}
469
+ >>> sorted(induced_slots.keys())
470
+ ['capital', 'code', 'continent', 'languages', 'name']
471
+ >>> induced_slots["code"].identifier
472
+ True
473
+
474
+ Creating a new collection will align with the schema view:
475
+
476
+ >>> collection = db.create_collection("Country", "all_countries")
477
+ >>> sorted(collection.class_definition().slots)
478
+ ['capital', 'code', 'continent', 'languages', 'name']
479
+
348
480
  :param schema_view:
349
481
  :return:
350
482
  """
@@ -375,8 +507,7 @@ class Database(ABC):
375
507
  if inlined and slot.range:
376
508
  if slot.name in self._collections:
377
509
  coll = self._collections[slot.name]
378
- if not coll.metadata.type:
379
- coll.metadata.type = slot.range
510
+ coll.metadata.type = slot.range
380
511
 
381
512
  def load_schema_view(self, path: Union[str, Path]):
382
513
  """
@@ -386,6 +517,21 @@ class Database(ABC):
386
517
  >>> client = Client()
387
518
  >>> db = client.attach_database("duckdb", alias="test")
388
519
  >>> db.load_schema_view("tests/input/countries/countries.linkml.yaml")
520
+ >>> sv = db.schema_view
521
+ >>> cd = sv.schema.classes["Country"]
522
+ >>> sorted(cd.slots)
523
+ ['capital', 'code', 'continent', 'languages', 'name']
524
+ >>> induced_slots = {s.name: s for s in sv.class_induced_slots("Country")}
525
+ >>> sorted(induced_slots.keys())
526
+ ['capital', 'code', 'continent', 'languages', 'name']
527
+ >>> induced_slots["code"].identifier
528
+ True
529
+
530
+ Creating a new collection will align with the schema view:
531
+
532
+ >>> collection = db.create_collection("Country", "all_countries")
533
+ >>> sorted(collection.class_definition().slots)
534
+ ['capital', 'code', 'continent', 'languages', 'name']
389
535
 
390
536
  :param path:
391
537
  :return:
@@ -420,6 +566,42 @@ class Database(ABC):
420
566
  """
421
567
  Validate the contents of the database.
422
568
 
569
+ As an example, let's create a database with a predefined schema
570
+ from the countries.linkml.yaml file:
571
+
572
+ >>> from linkml_store.api.client import Client
573
+ >>> client = Client()
574
+ >>> db = client.attach_database("duckdb", alias="test")
575
+ >>> db.load_schema_view("tests/input/countries/countries.linkml.yaml")
576
+
577
+ Let's introspect the schema to see what slots are applicable for the class "Country":
578
+
579
+ >>> sv = db.schema_view
580
+ >>> for slot in sv.class_induced_slots("Country"):
581
+ ... print(slot.name, slot.range, slot.required)
582
+ name string True
583
+ code string True
584
+ capital string True
585
+ continent string True
586
+ languages Language None
587
+
588
+ Next we'll create a collection, binding it to the target class "Country", and insert
589
+ valid data:
590
+
591
+ >>> collection = db.create_collection("Country", "all_countries")
592
+ >>> obj = {"code": "US", "name": "United States", "continent": "North America", "capital": "Washington, D.C."}
593
+ >>> collection.insert([obj])
594
+ >>> list(db.iter_validate_database())
595
+ []
596
+
597
+ Now let's insert some invalid data (missing required fields)
598
+
599
+ >>> collection.insert([{"code": "FR", "name": "France"}])
600
+ >>> for r in db.iter_validate_database():
601
+ ... print(r.message[0:32])
602
+ 'capital' is a required property
603
+ 'continent' is a required proper
604
+
423
605
  :param kwargs:
424
606
  :return: iterator over validation results
425
607
  """
@@ -474,6 +656,36 @@ class Database(ABC):
474
656
 
475
657
  def drop(self, **kwargs):
476
658
  """
477
- Drop the database and all collections
659
+ Drop the database and all collections.
660
+
661
+ :param kwargs: additional arguments
478
662
  """
479
663
  raise NotImplementedError()
664
+
665
+ def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
666
+ """
667
+ Import a database from a file or location.
668
+
669
+ :param location: location of the file
670
+ :param source_format: source format
671
+ :param kwargs: additional arguments
672
+ """
673
+ objects = load_objects(location, format=source_format)
674
+ for obj in objects:
675
+ self.store(obj)
676
+
677
+ def export_database(self, location: str, target_format: Optional[str] = None, **kwargs):
678
+ """
679
+ Export a database to a file or location.
680
+
681
+ :param location: location of the file
682
+ :param target_format: target format
683
+ :param kwargs: additional arguments
684
+ """
685
+ obj = {}
686
+ for coll in self.list_collections():
687
+ qr = coll.find({}, limit=-1)
688
+ obj[coll.alias] = qr.rows
689
+ logger.info(f"Exporting object with {len(obj)} collections to {location} in {target_format} format")
690
+ with open(location, "w", encoding="utf-8") as stream:
691
+ stream.write(render_output(obj, format=target_format))
@@ -1,3 +1,7 @@
1
1
  """
2
- Support for ChromaDB is experimental.
2
+ Adapter for ChromaDB vector database.
3
+
4
+ .. warning::
5
+
6
+ Support for ChromaDB is experimental and may change in the future.
3
7
  """
@@ -1,3 +1,12 @@
1
+ """
2
+ Adapter for DuckDB embedded database.
3
+
4
+ Handles have the form:
5
+
6
+ - ``duckdb:///<path>`` for a file-based database
7
+ - ``duckdb:///:memory:`` for an in-memory database
8
+ """
9
+
1
10
  from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
2
11
  from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
3
12
 
@@ -19,12 +19,14 @@ class DuckDBCollection(Collection):
19
19
  _table_created: bool = None
20
20
 
21
21
  def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
22
+ logger.debug(f"Inserting {len(objs)}")
22
23
  if not isinstance(objs, list):
23
24
  objs = [objs]
24
25
  if not objs:
25
26
  return
26
27
  cd = self.class_definition()
27
28
  if not cd:
29
+ logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}; will induce")
28
30
  cd = self.induce_class_definition_from_objects(objs)
29
31
  self._create_table(cd)
30
32
  table = self._sqla_table(cd)
@@ -37,7 +39,7 @@ class DuckDBCollection(Collection):
37
39
  conn.execute(insert(table), objs)
38
40
  conn.commit()
39
41
 
40
- def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
42
+ def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
41
43
  if not isinstance(objs, list):
42
44
  objs = [objs]
43
45
  cd = self.class_definition()
@@ -52,9 +54,9 @@ class DuckDBCollection(Collection):
52
54
  stmt = stmt.compile(engine)
53
55
  conn.execute(stmt)
54
56
  conn.commit()
55
- return len(objs)
57
+ return
56
58
 
57
- def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
59
+ def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
58
60
  logger.info(f"Deleting from {self.target_class_name} where: {where}")
59
61
  if where is None:
60
62
  where = {}
@@ -78,7 +80,7 @@ class DuckDBCollection(Collection):
78
80
  if deleted_rows_count == 0 and not missing_ok:
79
81
  raise ValueError(f"No rows found for {where}")
80
82
  conn.commit()
81
- return deleted_rows_count
83
+ return deleted_rows_count if deleted_rows_count > -1 else None
82
84
 
83
85
  def query_facets(
84
86
  self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
@@ -1,5 +1,6 @@
1
1
  import json
2
2
  import logging
3
+ from pathlib import Path
3
4
  from typing import Optional
4
5
 
5
6
  import pandas as pd
@@ -22,6 +23,7 @@ TYPE_MAP = {
22
23
  "DATE": "date",
23
24
  "DOUBLE": "float",
24
25
  "INTEGER": "integer",
26
+ "JSON": "Any",
25
27
  }
26
28
 
27
29
 
@@ -33,9 +35,13 @@ class DuckDBDatabase(Database):
33
35
  _engine: sqlalchemy.Engine = None
34
36
  collection_class = DuckDBCollection
35
37
 
36
- def __init__(self, handle: Optional[str] = None, **kwargs):
38
+ def __init__(self, handle: Optional[str] = None, recreate_if_exists: bool = False, **kwargs):
37
39
  if handle is None:
38
40
  handle = "duckdb:///:memory:"
41
+ if recreate_if_exists:
42
+ path = Path(handle.replace("duckdb:///", ""))
43
+ if path.exists():
44
+ path.unlink()
39
45
  super().__init__(handle=handle, **kwargs)
40
46
 
41
47
  @property
@@ -69,7 +75,10 @@ class DuckDBDatabase(Database):
69
75
  if qr.num_rows == 0:
70
76
  logger.debug(f"Table {query.from_table} not created yet")
71
77
  return QueryResult(query=query, num_rows=0, rows=[])
72
- sv = self._schema_view
78
+ if not query.from_table.startswith("information_schema"):
79
+ sv = self.schema_view
80
+ else:
81
+ sv = None
73
82
  if sv:
74
83
  cd = None
75
84
  for c in self._collections.values():
@@ -107,7 +116,10 @@ class DuckDBDatabase(Database):
107
116
 
108
117
  def init_collections(self):
109
118
  # TODO: unify schema introspection
110
- schema = introspect_schema(self.engine)
119
+ if not self.schema_view:
120
+ schema = introspect_schema(self.engine)
121
+ else:
122
+ schema = self.schema_view.schema
111
123
  table_names = schema.classes.keys()
112
124
  if self._collections is None:
113
125
  self._collections = {}
@@ -119,7 +131,7 @@ class DuckDBDatabase(Database):
119
131
  def induce_schema_view(self) -> SchemaView:
120
132
  # TODO: unify schema introspection
121
133
  # TODO: handle case where schema is provided in advance
122
- logger.info(f"Inducing schema view for {self.metadata.handle}")
134
+ logger.info(f"Inducing schema view for {self.metadata.handle} // {self}")
123
135
  sb = SchemaBuilder()
124
136
  schema = sb.schema
125
137
  query = Query(from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE"})
@@ -144,8 +156,10 @@ class DuckDBDatabase(Database):
144
156
  sd = SlotDefinition(
145
157
  row["column_name"], required=row["is_nullable"] == "NO", multivalued=multivalued, range=rng
146
158
  )
159
+ if dt == "JSON":
160
+ sd.inlined_as_list = True
147
161
  sb.schema.classes[tbl_name].attributes[sd.name] = sd
148
- logger.info(f"Introspected slot: {tbl_name}.{sd.name}: {sd.range}")
162
+ logger.info(f"Introspected slot: {tbl_name}.{sd.name}: {sd.range} FROM {dt}")
149
163
  sb.add_defaults()
150
164
  for cls_name in schema.classes:
151
165
  if cls_name in self.metadata.collections:
@@ -3,5 +3,6 @@ import sqlalchemy as sqla
3
3
  TMAP = {
4
4
  "string": sqla.String,
5
5
  "integer": sqla.Integer,
6
+ "float": sqla.Float,
6
7
  "linkml:Any": sqla.JSON,
7
8
  }
@@ -0,0 +1,16 @@
1
+ """
2
+ Adapter for DuckDB embedded database.
3
+
4
+ Handles have the form:
5
+
6
+ - ``duckdb:///<path>`` for a file-based database
7
+ - ``duckdb:///:memory:`` for an in-memory database
8
+ """
9
+
10
+ from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
11
+ from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
12
+
13
+ __all__ = [
14
+ "DuckDBCollection",
15
+ "DuckDBDatabase",
16
+ ]