linkml-store 0.1.8__tar.gz → 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of linkml-store might be problematic.

Files changed (51):
  1. {linkml_store-0.1.8 → linkml_store-0.1.9}/PKG-INFO +4 -1
  2. {linkml_store-0.1.8 → linkml_store-0.1.9}/pyproject.toml +6 -2
  3. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/client.py +2 -0
  4. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/collection.py +101 -6
  5. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/database.py +36 -5
  6. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +1 -0
  7. linkml_store-0.1.9/src/linkml_store/api/stores/filesystem/__init__.py +15 -0
  8. linkml_store-0.1.9/src/linkml_store/api/stores/filesystem/filesystem_collection.py +177 -0
  9. linkml_store-0.1.9/src/linkml_store/api/stores/filesystem/filesystem_database.py +72 -0
  10. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +10 -4
  11. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/mongodb/mongodb_database.py +13 -2
  12. linkml_store-0.1.9/src/linkml_store/api/types.py +4 -0
  13. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/cli.py +88 -7
  14. linkml_store-0.1.9/src/linkml_store/utils/change_utils.py +17 -0
  15. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/utils/format_utils.py +89 -8
  16. linkml_store-0.1.9/src/linkml_store/utils/patch_utils.py +126 -0
  17. linkml_store-0.1.9/src/linkml_store/utils/query_utils.py +89 -0
  18. linkml_store-0.1.8/src/linkml_store/api/stores/filesystem/__init__.py +0 -16
  19. linkml_store-0.1.8/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -142
  20. linkml_store-0.1.8/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -36
  21. {linkml_store-0.1.8 → linkml_store-0.1.9}/LICENSE +0 -0
  22. {linkml_store-0.1.8 → linkml_store-0.1.9}/README.md +0 -0
  23. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/__init__.py +0 -0
  24. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/__init__.py +0 -0
  25. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/config.py +0 -0
  26. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/queries.py +0 -0
  27. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/__init__.py +0 -0
  28. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
  29. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
  30. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
  31. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
  32. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/duckdb/duckdb_database.py +0 -0
  33. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
  34. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
  35. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
  36. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
  37. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
  38. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/solr/__init__.py +0 -0
  39. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
  40. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
  41. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
  42. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/constants.py +0 -0
  43. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/index/__init__.py +0 -0
  44. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/index/implementations/__init__.py +0 -0
  45. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/index/implementations/llm_indexer.py +0 -0
  46. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
  47. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/index/indexer.py +0 -0
  48. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/utils/__init__.py +0 -0
  49. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/utils/io.py +0 -0
  50. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/utils/object_utils.py +0 -0
  51. {linkml_store-0.1.8 → linkml_store-0.1.9}/src/linkml_store/utils/sql_utils.py +0 -0
--- linkml_store-0.1.8/PKG-INFO
+++ linkml_store-0.1.9/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: linkml-store
-Version: 0.1.8
+Version: 0.1.9
 Summary: linkml-store
 License: MIT
 Author: Author 1
@@ -19,6 +19,7 @@ Provides-Extra: h5py
 Provides-Extra: llm
 Provides-Extra: map
 Provides-Extra: mongodb
+Provides-Extra: pyarrow
 Provides-Extra: tests
 Provides-Extra: validation
 Requires-Dist: black (>=24.0.0) ; extra == "tests"
@@ -28,6 +29,7 @@ Requires-Dist: duckdb (>=0.10.1,<0.11.0)
 Requires-Dist: duckdb-engine (>=0.11.2)
 Requires-Dist: h5py ; extra == "h5py"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
+Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
 Requires-Dist: linkml ; extra == "validation"
 Requires-Dist: linkml-runtime (>=1.7.5,<2.0.0)
 Requires-Dist: linkml_map ; extra == "map"
@@ -35,6 +37,7 @@ Requires-Dist: llm ; extra == "llm"
 Requires-Dist: matplotlib ; extra == "analytics"
 Requires-Dist: pandas (>=2.2.1) ; extra == "analytics"
 Requires-Dist: plotly ; extra == "analytics"
+Requires-Dist: pyarrow ; extra == "pyarrow"
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pymongo ; extra == "mongodb"
 Requires-Dist: pystow (>=0.5.4,<0.6.0)
--- linkml_store-0.1.8/pyproject.toml
+++ linkml_store-0.1.9/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "linkml-store"
-version = "0.1.8"
+version = "0.1.9"
 description = "linkml-store"
 authors = ["Author 1 <author@org.org>"]
 license = "MIT"
@@ -23,11 +23,13 @@ black = { version=">=24.0.0", optional = true }
 llm = { version="*", optional = true }
 pymongo = { version="*", optional = true }
 chromadb = { version="*", optional = true }
+pyarrow = { version="*", optional = true }
 h5py = { version="*", optional = true }
 linkml = { version="*", optional = true }
 linkml_map = { version="*", optional = true }
 pandas = ">=2.2.1"
 jinja2 = "^3.1.4"
+jsonlines = "^4.0.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = {version = ">=7.1.2"}
@@ -43,6 +45,7 @@ furo = {version = "*"}
 nbsphinx = "*"
 jupyter = "*"
 jupysql = "*"
+papermill = "*"
 
 [tool.poetry.group.tests.dependencies]
 pytest = "^7.4.0"
@@ -60,6 +63,7 @@ llm = ["llm"]
 mongodb = ["pymongo"]
 chromadb = ["chromadb"]
 h5py = ["h5py"]
+pyarrow = ["pyarrow"]
 validation = ["linkml"]
 map = ["linkml_map"]
 
@@ -127,7 +131,7 @@ skip = '.git,*.pdf,*.svg,./tests,pyproject.toml,*.dill,poetry.lock,*.ipynb'
 # Ignore table where words could be split across rows
 # Ignore shortcut specifications like [Ff]alse
 ignore-regex = '(\|.*\|.*\|.*\||\[[A-Z][a-z]\][a-z][a-z])'
-ignore-words-list = 'mater,connexion,infarction'
+ignore-words-list = 'mater,connexion,infarction,nin'
 count = ""
 quiet-level = 3
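The new `pyarrow` extra gates the Parquet support added to the filesystem store below, while `jsonlines` becomes a hard dependency for the jsonl format. A minimal sketch of guarding Parquet use at runtime, assuming a standard extras install; the variable names are illustrative:

# Probe for the optional Parquet dependency before requesting
# file_format="parquet"; the extra is installed with:
#   pip install "linkml-store[pyarrow]"
try:
    import pyarrow  # noqa: F401
    PARQUET_AVAILABLE = True
except ImportError:
    PARQUET_AVAILABLE = False

file_format = "parquet" if PARQUET_AVAILABLE else "json"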
 
--- linkml_store-0.1.8/src/linkml_store/api/client.py
+++ linkml_store-0.1.9/src/linkml_store/api/client.py
@@ -9,6 +9,7 @@ from linkml_store.api import Database
 from linkml_store.api.config import ClientConfig
 from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase
 from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
+from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
 from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
 from linkml_store.api.stores.solr.solr_database import SolrDatabase
 
@@ -20,6 +21,7 @@ HANDLE_MAP = {
     "solr": SolrDatabase,
     "mongodb": MongoDBDatabase,
     "chromadb": ChromaDBDatabase,
+    "file": FileSystemDatabase,
 }
 
 
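With `file` registered in HANDLE_MAP, a `file:`-scheme handle now resolves to FileSystemDatabase. A minimal sketch, assuming the existing `Client.attach_database` and `create_collection` APIs; the path and data are illustrative:

from linkml_store import Client

client = Client()
# "file:<path>" is routed through HANDLE_MAP to FileSystemDatabase
db = client.attach_database("file:/tmp/linkml_demo", alias="fs")
persons = db.create_collection("Person", alias="persons")
persons.insert({"id": "P1", "name": "Alice"})
db.commit()  # the new base-class commit() delegates to each collection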
 
--- linkml_store-0.1.8/src/linkml_store/api/collection.py
+++ linkml_store-0.1.9/src/linkml_store/api/collection.py
@@ -4,16 +4,19 @@ import hashlib
 import logging
 from collections import defaultdict
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, TextIO, Type, Union
+from typing import TYPE_CHECKING, Any, Dict, Generic, Iterator, List, Optional, TextIO, Tuple, Type, Union
 
 import numpy as np
+from linkml_runtime import SchemaView
 from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
 from linkml_runtime.linkml_model.meta import ArrayExpression
 from pydantic import BaseModel
 
+from linkml_store.api.types import DatabaseType
 from linkml_store.index import get_indexer
 from linkml_store.utils.format_utils import load_objects
 from linkml_store.utils.object_utils import clean_empties
+from linkml_store.utils.patch_utils import PatchDict, apply_patches_to_list, patches_from_objects_lists
 
 try:
     from linkml.validator.report import ValidationResult
@@ -36,7 +39,7 @@ IDENTIFIER = str
 FIELD_NAME = str
 
 
-class Collection:
+class Collection(Generic[DatabaseType]):
     """
     A collection is an organized set of objects of the same or similar type.
 
@@ -56,7 +59,7 @@
     """
 
     # name: str
-    parent: Optional["Database"] = None
+    parent: Optional[DatabaseType] = None
     _indexers: Optional[Dict[str, Indexer]] = None
     # hidden: Optional[bool] = False
 
@@ -197,6 +200,10 @@
         """
         raise NotImplementedError
 
+    def _post_insert_hook(self, objs: List[OBJECT], **kwargs):
+        patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
+        self._broadcast(patches, **kwargs)
+
     def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
         """
         Delete one or more objects from the collection.
@@ -301,7 +308,7 @@
 
     def query_facets(
         self, where: Optional[Dict] = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
-    ) -> Dict[str, Dict[str, int]]:
+    ) -> Dict[str, List[Tuple[Any, int]]]:
         """
         Run a query to get facet counts for one or more columns.
 
@@ -319,7 +326,7 @@
         :param query: A Query object representing the base query.
         :param facet_columns: A list of column names to get facet counts for.
         :param facet_limit:
-        :return: A dictionary where keys are column names and values are pandas DataFrames
+        :return: A dictionary where keys are column names and values are tuples
             containing the facet counts for each unique value in the respective column.
         """
         raise NotImplementedError
@@ -523,6 +530,7 @@
             ix_coll.delete_where()
 
         ix_coll.insert(objects_with_ix, **kwargs)
+        ix_coll.commit()
 
     def list_index_names(self) -> List[str]:
         """
@@ -557,12 +565,22 @@
 
         :return:
         """
-        sv = self.parent.schema_view
+        sv: SchemaView = self.parent.schema_view
         if sv:
             cls = sv.get_class(self.target_class_name)
+            if cls and not cls.attributes:
+                if not sv.class_induced_slots(cls.name):
+                    for att in self._induce_attributes():
+                        cls.attributes[att.name] = att
+                    sv.set_modified()
             return cls
         return None
 
+    def _induce_attributes(self) -> List[SlotDefinition]:
+        result = self.find({}, limit=-1)
+        cd = self.induce_class_definition_from_objects(result.rows, max_sample_size=None)
+        return list(cd.attributes.values())
+
     @property
     def identifier_attribute_name(self) -> Optional[str]:
         """
@@ -579,6 +597,37 @@
                 return att.name
         return None
 
+    def set_identifier_attribute_name(self, name: str):
+        """
+        Set the name of the identifier attribute for the collection.
+
+        AKA the primary key.
+
+        :param name: The name of the identifier attribute.
+        """
+        cd = self.class_definition()
+        if not cd:
+            raise ValueError(f"Cannot find class definition for {self.target_class_name}")
+        id_att = None
+        candidates = []
+        sv: SchemaView = self.parent.schema_view
+        cls = sv.get_class(cd.name)
+        existing_id_slot = sv.get_identifier_slot(cls.name)
+        if existing_id_slot:
+            if existing_id_slot.name == name:
+                return
+            existing_id_slot.identifier = False
+        for att in cls.attributes.values():
+            candidates.append(att.name)
+            if att.name == name:
+                att.identifier = True
+                id_att = att
+            else:
+                att.identifier = False
+        if not id_att:
+            raise ValueError(f"No attribute found with name {name} in {candidates}")
+        sv.set_modified()
+
     def object_identifier(self, obj: OBJECT, auto=True) -> Optional[IDENTIFIER]:
         """
         Return the identifier for an object.
@@ -622,6 +671,8 @@
             for k, v in obj.items():
                 keys[k].append(v)
         for k, vs in keys.items():
+            if k == "_id":
+                continue
             multivalueds = []
             inlineds = []
             rngs = []
@@ -698,6 +749,39 @@
         """
         raise NotImplementedError
 
+    def apply_patches(self, patches: List[PatchDict], **kwargs):
+        """
+        Apply a patch to the collection.
+
+        Patches conform to the JSON Patch format,
+
+        :param patches:
+        :param kwargs:
+        :return:
+        """
+        all_objs = self.find(limit=-1).rows
+        primary_key = self.identifier_attribute_name
+        if not primary_key:
+            raise ValueError(f"No primary key for {self.target_class_name}")
+        new_objs = apply_patches_to_list(all_objs, patches, primary_key=primary_key, **kwargs)
+        self.replace(new_objs)
+
+    def diff(self, other: "Collection", **kwargs):
+        """
+        Diff two collections.
+
+        :param other:
+        :param kwargs:
+        :return:
+        """
+        src_objs = self.find(limit=-1).rows
+        tgt_objs = other.find(limit=-1).rows
+        primary_key = self.identifier_attribute_name
+        if not primary_key:
+            raise ValueError(f"No primary key for {self.target_class_name}")
+        patches_from_objects_lists(src_objs, tgt_objs, primary_key=primary_key)
+        return patches_from_objects_lists(src_objs, tgt_objs, primary_key=primary_key)
+
     def iter_validate_collection(self, **kwargs) -> Iterator["ValidationResult"]:
         """
         Validate the contents of the collection
@@ -717,3 +801,14 @@
         for obj in result.rows:
             obj = clean_empties(obj)
             yield from validator.iter_results(obj, class_name)
+
+    def commit(self):
+        """
+        Commit changes to the collection.
+
+        :return:
+        """
+        pass
+
+    def _broadcast(self, *args, **kwargs):
+        self.parent.broadcast(self, *args, **kwargs)
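The new `diff`/`apply_patches` pair round-trips collection changes as JSON-Patch-style dicts keyed on the collection's identifier attribute; both methods raise `ValueError` when no primary key is defined. A hedged sketch of the intended flow; the collection names are illustrative, and the exact `path` values depend on `patches_from_objects_lists`:

# Compute patches that would turn old_collection's contents into
# new_collection's, then apply and persist them.
patches = old_collection.diff(new_collection)
# Each patch is a JSON-Patch-shaped dict, e.g.:
#   {"op": "replace", "path": "...", "value": ...}
old_collection.apply_patches(patches)
old_collection.commit()  # no-op by default; persists for file-backed stores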
--- linkml_store-0.1.8/src/linkml_store/api/database.py
+++ linkml_store-0.1.9/src/linkml_store/api/database.py
@@ -3,9 +3,24 @@ from abc import ABC
 from collections import defaultdict
 from copy import copy
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ClassVar, Dict, Iterator, Optional, Sequence, Type, Union
-
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ClassVar,
+    Dict,
+    Generic,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    Type,
+    Union,
+)
+
+from linkml_store.api.types import CollectionType
 from linkml_store.utils.format_utils import load_objects, render_output
+from linkml_store.utils.patch_utils import PatchDict
 
 try:
     from linkml.validator.report import Severity, ValidationResult
@@ -24,8 +39,10 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+LISTENER = Callable[[Collection, List[PatchDict]], None]
+
 
-class Database(ABC):
+class Database(ABC, Generic[CollectionType]):
     """
     A Database provides access to named collections of data.
 
@@ -89,6 +106,8 @@
     metadata: Optional[DatabaseConfig] = None
     collection_class: ClassVar[Optional[Type[Collection]]] = None
 
+    listeners: Optional[List[LISTENER]] = None
+
     def __init__(self, handle: Optional[str] = None, metadata: Optional[DatabaseConfig] = None, **kwargs):
         if metadata:
             self.metadata = metadata
@@ -233,7 +252,8 @@
         :param kwargs:
         :return:
         """
-        raise NotImplementedError()
+        for coll in self.list_collections():
+            coll.commit()
 
     def close(self, **kwargs):
         """
@@ -301,6 +321,7 @@
             alias = name
         self._collections[alias] = collection
         if recreate_if_exists:
+            logger.debug(f"Recreating collection {collection.name}")
             collection.delete_where({}, missing_ok=True)
         return collection
 
@@ -418,7 +439,11 @@
         :return:
 
         """
-        raise NotImplementedError
+        if query.from_table:
+            collection = self.get_collection(query.from_table)
+            return collection.query(query, **kwargs)
+        else:
+            raise NotImplementedError(f"Querying without a table is not supported in {self.__class__.__name__}")
 
     @property
     def schema_view(self) -> SchemaView:
@@ -689,3 +714,9 @@
         logger.info(f"Exporting object with {len(obj)} collections to {location} in {target_format} format")
         with open(location, "w", encoding="utf-8") as stream:
             stream.write(render_output(obj, format=target_format))
+
+    def broadcast(self, source: Collection, patches: List[PatchDict]):
+        if not self.listeners:
+            return
+        for listener in self.listeners:
+            listener(source, patches)
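`Database` now carries a simple change-notification channel: `Collection._post_insert_hook` converts inserted objects into `add` patches, `_broadcast` forwards them, and `broadcast` fans them out to each registered listener. A hedged sketch of wiring one up, assuming listeners may be assigned directly on an attached `Database` instance (`db` and the data are illustrative):

from typing import List


def log_changes(collection, patches: List[dict]) -> None:
    # Patches arrive as JSON-Patch-style dicts,
    # e.g. {"op": "add", "path": "/0", "value": {...}} per inserted object.
    for patch in patches:
        print(f"{collection.name}: {patch['op']} {patch['path']}")


db.listeners = [log_changes]
db.get_collection("persons").insert({"id": "P2", "name": "Bob"})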
--- linkml_store-0.1.8/src/linkml_store/api/stores/duckdb/duckdb_collection.py
+++ linkml_store-0.1.9/src/linkml_store/api/stores/duckdb/duckdb_collection.py
@@ -38,6 +38,7 @@ class DuckDBCollection(Collection):
         with conn.begin():
             conn.execute(insert(table), objs)
             conn.commit()
+        self._post_insert_hook(objs)
 
     def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
         if not isinstance(objs, list):
--- /dev/null
+++ linkml_store-0.1.9/src/linkml_store/api/stores/filesystem/__init__.py
@@ -0,0 +1,15 @@
+"""
+Adapter for FileSystem wrapper
+
+Handles have the form:
+
+- ``file:<path>`` for a local file
+"""
+
+from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
+from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
+
+__all__ = [
+    "FileSystemCollection",
+    "FileSystemDatabase",
+]
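Per the module docstring, handles take the form `file:<path>`. Each supported file in the directory surfaces as a collection named after the file stem (see `init_collections` in `filesystem_database.py` below). A minimal sketch with an illustrative directory:

from linkml_store import Client

client = Client()
# /tmp/store/persons.json would surface as a collection named "persons"
db = client.attach_database("file:/tmp/store")
print(db.list_collection_names())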
--- /dev/null
+++ linkml_store-0.1.9/src/linkml_store/api/stores/filesystem/filesystem_collection.py
@@ -0,0 +1,177 @@
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+from linkml_store.api import Collection
+from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
+from linkml_store.api.queries import Query, QueryResult
+from linkml_store.api.types import DatabaseType
+from linkml_store.utils.query_utils import mongo_query_to_match_function
+
+logger = logging.getLogger(__name__)
+
+
+class FileSystemCollection(Collection[DatabaseType]):
+    path: Optional[Path] = None
+    file_format: Optional[str] = None
+    encoding: Optional[str] = None
+    _objects_list: List[OBJECT] = None
+    _object_map: Dict[str, OBJECT] = None
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        parent: DatabaseType = self.parent
+        if not self.path:
+            if self.parent:
+                self.path = Path(parent.directory_path)
+        self._objects_list = []
+        self._object_map = {}
+        if not self.file_format:
+            self.file_format = "json"
+
+    @property
+    def path_to_file(self):
+        return Path(self.parent.directory_path) / f"{self.name}.{self.file_format}"
+
+    @property
+    def objects_as_list(self) -> List[OBJECT]:
+        if self._object_map:
+            return list(self._object_map.values())
+        else:
+            return self._objects_list
+
+    def _set_objects(self, objs: List[OBJECT]):
+        pk = self.identifier_attribute_name
+        if pk:
+            self._object_map = {obj[pk]: obj for obj in objs}
+            self._objects_list = []
+        else:
+            self._objects_list = objs
+            self._object_map = {}
+
+    def commit(self):
+        path = self.path_to_file
+        if not path:
+            raise ValueError("Path not set")
+        path.parent.mkdir(parents=True, exist_ok=True)
+        self._save(path)
+
+    def _save(self, path: Path):
+        encoding = self.encoding or "utf-8"
+        fmt = self.file_format or "json"
+        mode = "w"
+        if fmt == "parquet":
+            mode = "wb"
+            encoding = None
+        with open(path, mode, encoding=encoding) as stream:
+            if fmt == "json":
+                import json
+
+                json.dump(self.objects_as_list, stream, indent=2)
+            elif fmt == "jsonl":
+                import jsonlines
+
+                writer = jsonlines.Writer(stream)
+                writer.write_all(self.objects_as_list)
+            elif fmt == "yaml":
+                import yaml
+
+                yaml.dump_all(self.objects_as_list, stream)
+            elif fmt == "parquet":
+                import pandas as pd
+                import pyarrow
+                import pyarrow.parquet as pq
+
+                df = pd.DataFrame(self.objects_as_list)
+                table = pyarrow.Table.from_pandas(df)
+                pq.write_table(table, stream)
+            elif fmt in {"csv", "tsv"}:
+                import csv
+
+                delimiter = "\t" if fmt == "tsv" else ","
+                fieldnames = list(self.objects_as_list[0].keys())
+                for obj in self.objects_as_list[1:]:
+                    fieldnames.extend([k for k in obj.keys() if k not in fieldnames])
+                writer = csv.DictWriter(stream, fieldnames=fieldnames, delimiter=delimiter)
+                writer.writeheader()
+                for obj in self.objects_as_list:
+                    writer.writerow(obj)
+            else:
+                raise ValueError(f"Unsupported file format: {fmt}")
+
+    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+        if not isinstance(objs, list):
+            objs = [objs]
+        if not objs:
+            return
+        pk = self.identifier_attribute_name
+        if pk:
+            for obj in objs:
+                if pk not in obj:
+                    raise ValueError(f"Primary key {pk} not found in object {obj}")
+                pk_val = obj[pk]
+                self._object_map[pk_val] = obj
+        else:
+            self._objects_list.extend(objs)
+
+    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
+        if not isinstance(objs, list):
+            objs = [objs]
+        if not objs:
+            return 0
+        pk = self.identifier_attribute_name
+        n = 0
+        if pk:
+            for obj in objs:
+                pk_val = obj[pk]
+                if pk_val in self._object_map:
+                    del self._object_map[pk_val]
+                    n += 1
+        else:
+            n = len(objs)
+            self._objects_list = [o for o in self._objects_list if o not in objs]
+            n = n - len(objs)
+        return n
+
+    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
+        logger.info(f"Deleting from {self.target_class_name} where: {where}")
+        if where is None:
+            where = {}
+
+        def matches(obj: OBJECT):
+            for k, v in where.items():
+                if obj.get(k) != v:
+                    return False
+            return True
+
+        print(type(self))
+        print(self)
+        print(vars(self))
+        curr_objects = [o for o in self.objects_as_list if not matches(o)]
+        self._set_objects(curr_objects)
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+
+        where = query.where_clause or {}
+        match = mongo_query_to_match_function(where)
+        rows = [o for o in self.objects_as_list if match(o)]
+        count = len(rows)
+        return QueryResult(query=query, num_rows=count, rows=rows)
+
+    def query_facets(
+        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+    ) -> Dict[str, Dict[str, int]]:
+        match = mongo_query_to_match_function(where)
+        rows = [o for o in self.objects_as_list if match(o)]
+        if not facet_columns:
+            facet_columns = self.class_definition().attributes.keys()
+        facet_results = {c: {} for c in facet_columns}
+        for row in rows:
+            for fc in facet_columns:
+                if fc in row:
+                    v = row[fc]
+                    if v not in facet_results[fc]:
+                        facet_results[fc][v] = 1
+                    else:
+                        facet_results[fc][v] += 1
+        return {fc: list(facet_results[fc].items()) for fc in facet_results}
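Note that this `query_facets` counts values in memory and returns each column's counts as a list of `(value, count)` pairs, matching the updated signature in `collection.py` above (the `Dict[str, Dict[str, int]]` annotation on this override lags behind the actual return value). Illustrative shape, with hypothetical data:

# Given three stored objects, two with status="active", one "retired":
facets = collection.query_facets(facet_columns=["status"])
# -> {"status": [("active", 2), ("retired", 1)]}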
--- /dev/null
+++ linkml_store-0.1.9/src/linkml_store/api/stores/filesystem/filesystem_database.py
@@ -0,0 +1,72 @@
+import logging
+from pathlib import Path
+from typing import Optional
+
+import yaml
+from linkml.utils.schema_builder import SchemaBuilder
+from linkml_runtime import SchemaView
+
+from linkml_store.api import Database
+from linkml_store.api.config import DatabaseConfig
+from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
+from linkml_store.utils.format_utils import Format, load_objects
+
+logger = logging.getLogger(__name__)
+
+
+class FileSystemDatabase(Database):
+    collection_class = FileSystemCollection
+
+    directory_path: Optional[Path] = None
+    default_file_format: Optional[str] = None
+
+    def __init__(self, handle: Optional[str] = None, **kwargs):
+        handle = handle.replace("file:", "")
+        if handle.startswith("//"):
+            handle = handle[2:]
+        self.directory_path = Path(handle)
+        self.load_metadata()
+        super().__init__(handle=handle, **kwargs)
+
+    @property
+    def metadata_path(self) -> Path:
+        return self.directory_path / ".linkml_metadata.yaml"
+
+    def load_metadata(self):
+        if self.metadata_path.exists():
+            md_dict = yaml.safe_load(open(self.metadata_path))
+            metadata = DatabaseConfig(**md_dict)
+        else:
+            metadata = DatabaseConfig()
+        self.metadata = metadata
+
+    def close(self, **kwargs):
+        pass
+
+    def init_collections(self):
+        metadata = self.metadata
+        if self._collections is None:
+            self._collections = {}
+        for name, collection_config in metadata.collections.items():
+            collection = FileSystemCollection(parent=self, **collection_config.dict())
+            self._collections[name] = collection
+        path = self.directory_path
+        if path.exists():
+            for fmt in Format:
+                suffix = fmt.value
+                logger.info(f"Looking for {suffix} files in {path}")
+                for f in path.glob(f"*.{suffix}"):
+                    logger.info(f"Found {f}")
+                    n = f.stem
+                    objs = load_objects(f, suffix, expected_type=list)
+                    collection = FileSystemCollection(parent=self, name=n)
+                    self._collections[n] = collection
+                    collection._set_objects(objs)
+
+    def induce_schema_view(self) -> SchemaView:
+        logger.info(f"Inducing schema view for {self.handle}")
+        sb = SchemaBuilder()
+
+        for collection_name in self.list_collection_names():
+            sb.add_class(collection_name)
+        return SchemaView(sb.schema)
--- linkml_store-0.1.8/src/linkml_store/api/stores/mongodb/mongodb_collection.py
+++ linkml_store-0.1.9/src/linkml_store/api/stores/mongodb/mongodb_collection.py
@@ -26,17 +26,23 @@ class MongoDBCollection(Collection):
     def mongo_collection(self) -> MongoCollection:
         if not self.name:
             raise ValueError("Collection name not set")
-        return self.parent.native_db[self.name]
+        collection_name = self.alias or self.name
+        return self.parent.native_db[collection_name]
 
     def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         if not isinstance(objs, list):
            objs = [objs]
         self.mongo_collection.insert_many(objs)
+        # TODO: allow mapping of _id to id for efficiency
+        for obj in objs:
+            del obj["_id"]
+        self._post_insert_hook(objs)
 
-    def query(self, query: Query, **kwargs) -> QueryResult:
+    def query(self, query: Query, limit: Optional[int] = None, **kwargs) -> QueryResult:
         mongo_filter = self._build_mongo_filter(query.where_clause)
-        if query.limit:
-            cursor = self.mongo_collection.find(mongo_filter).limit(query.limit)
+        limit = limit or query.limit
+        if limit and limit >= 0:
+            cursor = self.mongo_collection.find(mongo_filter).limit(limit)
         else:
             cursor = self.mongo_collection.find(mongo_filter)