linkml-store 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of linkml-store might be problematic.
- linkml_store/api/client.py +2 -0
- linkml_store/api/collection.py +101 -6
- linkml_store/api/database.py +36 -5
- linkml_store/api/stores/duckdb/duckdb_collection.py +1 -0
- linkml_store/api/stores/filesystem/__init__.py +7 -8
- linkml_store/api/stores/filesystem/filesystem_collection.py +148 -113
- linkml_store/api/stores/filesystem/filesystem_database.py +57 -21
- linkml_store/api/stores/mongodb/mongodb_collection.py +10 -4
- linkml_store/api/stores/mongodb/mongodb_database.py +13 -2
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +88 -7
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/format_utils.py +89 -8
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- {linkml_store-0.1.8.dist-info → linkml_store-0.1.9.dist-info}/METADATA +4 -1
- {linkml_store-0.1.8.dist-info → linkml_store-0.1.9.dist-info}/RECORD +20 -16
- {linkml_store-0.1.8.dist-info → linkml_store-0.1.9.dist-info}/LICENSE +0 -0
- {linkml_store-0.1.8.dist-info → linkml_store-0.1.9.dist-info}/WHEEL +0 -0
- {linkml_store-0.1.8.dist-info → linkml_store-0.1.9.dist-info}/entry_points.txt +0 -0
linkml_store/api/client.py
CHANGED
```diff
@@ -9,6 +9,7 @@ from linkml_store.api import Database
 from linkml_store.api.config import ClientConfig
 from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase
 from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
+from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
 from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
 from linkml_store.api.stores.solr.solr_database import SolrDatabase
 
@@ -20,6 +21,7 @@ HANDLE_MAP = {
     "solr": SolrDatabase,
     "mongodb": MongoDBDatabase,
     "chromadb": ChromaDBDatabase,
+    "file": FileSystemDatabase,
 }
```
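The new `file` entry wires the filesystem adapter into the client's `HANDLE_MAP`. A minimal usage sketch (hypothetical: it assumes the `attach_database` / `create_collection` API from earlier releases, plus the `file:<path>` handle form documented in the filesystem adapter's docstring below):

```python
from linkml_store.api.client import Client

client = Client()
# attach a filesystem-backed database; "file:<path>" points at a local directory
db = client.attach_database("file:/tmp/demo_store", alias="fs")

persons = db.create_collection("Person", alias="persons")
persons.insert([{"id": "P1", "name": "Alice"}, {"id": "P2", "name": "Bob"}])
persons.commit()  # FileSystemCollection writes to disk only on commit
```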
linkml_store/api/collection.py
CHANGED
```diff
@@ -4,16 +4,19 @@ import hashlib
 import logging
 from collections import defaultdict
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, TextIO, Type, Union
+from typing import TYPE_CHECKING, Any, Dict, Generic, Iterator, List, Optional, TextIO, Tuple, Type, Union
 
 import numpy as np
+from linkml_runtime import SchemaView
 from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
 from linkml_runtime.linkml_model.meta import ArrayExpression
 from pydantic import BaseModel
 
+from linkml_store.api.types import DatabaseType
 from linkml_store.index import get_indexer
 from linkml_store.utils.format_utils import load_objects
 from linkml_store.utils.object_utils import clean_empties
+from linkml_store.utils.patch_utils import PatchDict, apply_patches_to_list, patches_from_objects_lists
 
 try:
     from linkml.validator.report import ValidationResult
@@ -36,7 +39,7 @@ IDENTIFIER = str
 FIELD_NAME = str
 
 
-class Collection:
+class Collection(Generic[DatabaseType]):
     """
     A collection is an organized set of objects of the same or similar type.
 
@@ -56,7 +59,7 @@ class Collection:
     """
 
     # name: str
-    parent: Optional["Database"] = None
+    parent: Optional[DatabaseType] = None
     _indexers: Optional[Dict[str, Indexer]] = None
     # hidden: Optional[bool] = False
 
@@ -197,6 +200,10 @@ class Collection:
         """
         raise NotImplementedError
 
+    def _post_insert_hook(self, objs: List[OBJECT], **kwargs):
+        patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
+        self._broadcast(patches, **kwargs)
+
     def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
         """
         Delete one or more objects from the collection.
@@ -301,7 +308,7 @@
 
     def query_facets(
         self, where: Optional[Dict] = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
-    ) -> Dict[str,
+    ) -> Dict[str, List[Tuple[Any, int]]]:
         """
         Run a query to get facet counts for one or more columns.
 
@@ -319,7 +326,7 @@
         :param query: A Query object representing the base query.
         :param facet_columns: A list of column names to get facet counts for.
         :param facet_limit:
-        :return: A dictionary where keys are column names and values are
+        :return: A dictionary where keys are column names and values are tuples
             containing the facet counts for each unique value in the respective column.
         """
         raise NotImplementedError
@@ -523,6 +530,7 @@
             ix_coll.delete_where()
 
         ix_coll.insert(objects_with_ix, **kwargs)
+        ix_coll.commit()
 
     def list_index_names(self) -> List[str]:
         """
@@ -557,12 +565,22 @@
 
         :return:
         """
-        sv = self.parent.schema_view
+        sv: SchemaView = self.parent.schema_view
         if sv:
             cls = sv.get_class(self.target_class_name)
+            if cls and not cls.attributes:
+                if not sv.class_induced_slots(cls.name):
+                    for att in self._induce_attributes():
+                        cls.attributes[att.name] = att
+                    sv.set_modified()
             return cls
         return None
 
+    def _induce_attributes(self) -> List[SlotDefinition]:
+        result = self.find({}, limit=-1)
+        cd = self.induce_class_definition_from_objects(result.rows, max_sample_size=None)
+        return list(cd.attributes.values())
+
     @property
     def identifier_attribute_name(self) -> Optional[str]:
         """
@@ -579,6 +597,37 @@
                 return att.name
         return None
 
+    def set_identifier_attribute_name(self, name: str):
+        """
+        Set the name of the identifier attribute for the collection.
+
+        AKA the primary key.
+
+        :param name: The name of the identifier attribute.
+        """
+        cd = self.class_definition()
+        if not cd:
+            raise ValueError(f"Cannot find class definition for {self.target_class_name}")
+        id_att = None
+        candidates = []
+        sv: SchemaView = self.parent.schema_view
+        cls = sv.get_class(cd.name)
+        existing_id_slot = sv.get_identifier_slot(cls.name)
+        if existing_id_slot:
+            if existing_id_slot.name == name:
+                return
+            existing_id_slot.identifier = False
+        for att in cls.attributes.values():
+            candidates.append(att.name)
+            if att.name == name:
+                att.identifier = True
+                id_att = att
+            else:
+                att.identifier = False
+        if not id_att:
+            raise ValueError(f"No attribute found with name {name} in {candidates}")
+        sv.set_modified()
+
     def object_identifier(self, obj: OBJECT, auto=True) -> Optional[IDENTIFIER]:
         """
         Return the identifier for an object.
@@ -622,6 +671,8 @@
         for k, v in obj.items():
             keys[k].append(v)
         for k, vs in keys.items():
+            if k == "_id":
+                continue
             multivalueds = []
             inlineds = []
             rngs = []
@@ -698,6 +749,39 @@
         """
         raise NotImplementedError
 
+    def apply_patches(self, patches: List[PatchDict], **kwargs):
+        """
+        Apply a patch to the collection.
+
+        Patches conform to the JSON Patch format,
+
+        :param patches:
+        :param kwargs:
+        :return:
+        """
+        all_objs = self.find(limit=-1).rows
+        primary_key = self.identifier_attribute_name
+        if not primary_key:
+            raise ValueError(f"No primary key for {self.target_class_name}")
+        new_objs = apply_patches_to_list(all_objs, patches, primary_key=primary_key, **kwargs)
+        self.replace(new_objs)
+
+    def diff(self, other: "Collection", **kwargs):
+        """
+        Diff two collections.
+
+        :param other:
+        :param kwargs:
+        :return:
+        """
+        src_objs = self.find(limit=-1).rows
+        tgt_objs = other.find(limit=-1).rows
+        primary_key = self.identifier_attribute_name
+        if not primary_key:
+            raise ValueError(f"No primary key for {self.target_class_name}")
+        patches_from_objects_lists(src_objs, tgt_objs, primary_key=primary_key)
+        return patches_from_objects_lists(src_objs, tgt_objs, primary_key=primary_key)
+
     def iter_validate_collection(self, **kwargs) -> Iterator["ValidationResult"]:
         """
         Validate the contents of the collection
@@ -717,3 +801,14 @@
         for obj in result.rows:
             obj = clean_empties(obj)
             yield from validator.iter_results(obj, class_name)
+
+    def commit(self):
+        """
+        Commit changes to the collection.
+
+        :return:
+        """
+        pass
+
+    def _broadcast(self, *args, **kwargs):
+        self.parent.broadcast(self, *args, **kwargs)
```
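Together with the new `patch_utils` module, `diff` and `apply_patches` give collections a JSON-Patch-style synchronization primitive. A hypothetical round trip between two collections `coll_a` and `coll_b` that share the primary key `id` (the exact patch paths emitted by `patches_from_objects_lists` are internal to `patch_utils` and not shown in this diff):

```python
# compute patches that would turn coll_a's contents into coll_b's,
# keyed on the shared primary key
patches = coll_a.diff(coll_b)

# apply_patches loads all rows, patches the list, and replaces the
# collection contents wholesale
coll_a.apply_patches(patches)
# assumption: diffing two identical collections yields an empty patch list
assert not coll_a.diff(coll_b)
```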
linkml_store/api/database.py
CHANGED
```diff
@@ -3,9 +3,24 @@ from abc import ABC
 from collections import defaultdict
 from copy import copy
 from pathlib import Path
-from typing import
-
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ClassVar,
+    Dict,
+    Generic,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    Type,
+    Union,
+)
+
+from linkml_store.api.types import CollectionType
 from linkml_store.utils.format_utils import load_objects, render_output
+from linkml_store.utils.patch_utils import PatchDict
 
 try:
     from linkml.validator.report import Severity, ValidationResult
@@ -24,8 +39,10 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+LISTENER = Callable[[Collection, List[PatchDict]], None]
+
 
-class Database(ABC):
+class Database(ABC, Generic[CollectionType]):
     """
     A Database provides access to named collections of data.
 
@@ -89,6 +106,8 @@ class Database(ABC):
     metadata: Optional[DatabaseConfig] = None
     collection_class: ClassVar[Optional[Type[Collection]]] = None
 
+    listeners: Optional[List[LISTENER]] = None
+
     def __init__(self, handle: Optional[str] = None, metadata: Optional[DatabaseConfig] = None, **kwargs):
         if metadata:
             self.metadata = metadata
@@ -233,7 +252,8 @@
         :param kwargs:
         :return:
         """
-
+        for coll in self.list_collections():
+            coll.commit()
 
     def close(self, **kwargs):
         """
@@ -301,6 +321,7 @@
             alias = name
         self._collections[alias] = collection
         if recreate_if_exists:
+            logger.debug(f"Recreating collection {collection.name}")
             collection.delete_where({}, missing_ok=True)
         return collection
 
@@ -418,7 +439,11 @@
         :return:
 
         """
-
+        if query.from_table:
+            collection = self.get_collection(query.from_table)
+            return collection.query(query, **kwargs)
+        else:
+            raise NotImplementedError(f"Querying without a table is not supported in {self.__class__.__name__}")
 
     @property
     def schema_view(self) -> SchemaView:
@@ -689,3 +714,9 @@
         logger.info(f"Exporting object with {len(obj)} collections to {location} in {target_format} format")
         with open(location, "w", encoding="utf-8") as stream:
             stream.write(render_output(obj, format=target_format))
+
+    def broadcast(self, source: Collection, patches: List[PatchDict]):
+        if not self.listeners:
+            return
+        for listener in self.listeners:
+            listener(source, patches)
```
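The `listeners` field and `broadcast` method form a simple observer hook: a collection that calls `_post_insert_hook` after a write pushes `{"op": "add", ...}` patches to every registered listener. A minimal sketch, assuming listeners are registered by assigning to `listeners` directly (no registration helper appears in this diff), and noting that of the adapters shown here only `DuckDBCollection.insert` (below) actually fires the hook:

```python
from typing import List

def log_changes(source, patches: List[dict]):
    # each patch is a PatchDict, e.g. {"op": "add", "path": "/0", "value": obj}
    print(f"{source.name}: {len(patches)} change(s)")

db.listeners = [log_changes]  # db: a DuckDB-backed database instance
db.get_collection("persons").insert({"id": "P3", "name": "Carol"})
# -> "persons: 1 change(s)" via _post_insert_hook -> _broadcast -> Database.broadcast
```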
linkml_store/api/stores/duckdb/duckdb_collection.py
CHANGED

```diff
@@ -38,6 +38,7 @@ class DuckDBCollection(Collection):
             with conn.begin():
                 conn.execute(insert(table), objs)
             conn.commit()
+        self._post_insert_hook(objs)
 
     def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
         if not isinstance(objs, list):
```
linkml_store/api/stores/filesystem/__init__.py
CHANGED

```diff
@@ -1,16 +1,15 @@
 """
-Adapter for
+Adapter for FileSystem wrapper
 
 Handles have the form:
 
-- ``
-
-"""
+- ``file:<path>`` for a local file
+"""
 
-from linkml_store.api.stores.
-from linkml_store.api.stores.
+from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
+from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
 
 __all__ = [
-    "
-    "
+    "FileSystemCollection",
+    "FileSystemDatabase",
 ]
```
linkml_store/api/stores/filesystem/filesystem_collection.py
CHANGED

```diff
@@ -1,142 +1,177 @@
 import logging
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
-import sqlalchemy as sqla
-from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
-from sqlalchemy import Column, Table, delete, insert, inspect, text
-from sqlalchemy.sql.ddl import CreateTable
-
 from linkml_store.api import Collection
 from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
-from linkml_store.api.queries import Query
-from linkml_store.api.
-from linkml_store.utils.
+from linkml_store.api.queries import Query, QueryResult
+from linkml_store.api.types import DatabaseType
+from linkml_store.utils.query_utils import mongo_query_to_match_function
 
 logger = logging.getLogger(__name__)
 
 
-class FileSystemCollection(Collection):
-
+class FileSystemCollection(Collection[DatabaseType]):
+    path: Optional[Path] = None
+    file_format: Optional[str] = None
+    encoding: Optional[str] = None
+    _objects_list: List[OBJECT] = None
+    _object_map: Dict[str, OBJECT] = None
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        parent: DatabaseType = self.parent
+        if not self.path:
+            if self.parent:
+                self.path = Path(parent.directory_path)
+        self._objects_list = []
+        self._object_map = {}
+        if not self.file_format:
+            self.file_format = "json"
+
+    @property
+    def path_to_file(self):
+        return Path(self.parent.directory_path) / f"{self.name}.{self.file_format}"
+
+    @property
+    def objects_as_list(self) -> List[OBJECT]:
+        if self._object_map:
+            return list(self._object_map.values())
+        else:
+            return self._objects_list
+
+    def _set_objects(self, objs: List[OBJECT]):
+        pk = self.identifier_attribute_name
+        if pk:
+            self._object_map = {obj[pk]: obj for obj in objs}
+            self._objects_list = []
+        else:
+            self._objects_list = objs
+            self._object_map = {}
+
+    def commit(self):
+        path = self.path_to_file
+        if not path:
+            raise ValueError("Path not set")
+        path.parent.mkdir(parents=True, exist_ok=True)
+        self._save(path)
+
+    def _save(self, path: Path):
+        encoding = self.encoding or "utf-8"
+        fmt = self.file_format or "json"
+        mode = "w"
+        if fmt == "parquet":
+            mode = "wb"
+            encoding = None
+        with open(path, mode, encoding=encoding) as stream:
+            if fmt == "json":
+                import json
+
+                json.dump(self.objects_as_list, stream, indent=2)
+            elif fmt == "jsonl":
+                import jsonlines
+
+                writer = jsonlines.Writer(stream)
+                writer.write_all(self.objects_as_list)
+            elif fmt == "yaml":
+                import yaml
+
+                yaml.dump_all(self.objects_as_list, stream)
+            elif fmt == "parquet":
+                import pandas as pd
+                import pyarrow
+                import pyarrow.parquet as pq
+
+                df = pd.DataFrame(self.objects_as_list)
+                table = pyarrow.Table.from_pandas(df)
+                pq.write_table(table, stream)
+            elif fmt in {"csv", "tsv"}:
+                import csv
+
+                delimiter = "\t" if fmt == "tsv" else ","
+                fieldnames = list(self.objects_as_list[0].keys())
+                for obj in self.objects_as_list[1:]:
+                    fieldnames.extend([k for k in obj.keys() if k not in fieldnames])
+                writer = csv.DictWriter(stream, fieldnames=fieldnames, delimiter=delimiter)
+                writer.writeheader()
+                for obj in self.objects_as_list:
+                    writer.writerow(obj)
+            else:
+                raise ValueError(f"Unsupported file format: {fmt}")
 
     def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         if not isinstance(objs, list):
             objs = [objs]
         if not objs:
             return
-
-        if
-
-
-
-
-
-
-
-        with engine.connect() as conn:
-            with conn.begin():
-                conn.execute(insert(table), objs)
-            conn.commit()
+        pk = self.identifier_attribute_name
+        if pk:
+            for obj in objs:
+                if pk not in obj:
+                    raise ValueError(f"Primary key {pk} not found in object {obj}")
+                pk_val = obj[pk]
+                self._object_map[pk_val] = obj
+        else:
+            self._objects_list.extend(objs)
 
     def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
         if not isinstance(objs, list):
             objs = [objs]
-
-
-
-
-
-        with engine.connect() as conn:
+        if not objs:
+            return 0
+        pk = self.identifier_attribute_name
+        n = 0
+        if pk:
             for obj in objs:
-
-
-
-
-
-
+                pk_val = obj[pk]
+                if pk_val in self._object_map:
+                    del self._object_map[pk_val]
+                    n += 1
+        else:
+            n = len(objs)
+            self._objects_list = [o for o in self._objects_list if o not in objs]
+            n = n - len(objs)
+        return n
 
     def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
         logger.info(f"Deleting from {self.target_class_name} where: {where}")
         if where is None:
             where = {}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return deleted_rows_count if deleted_rows_count > -1 else None
+
+        def matches(obj: OBJECT):
+            for k, v in where.items():
+                if obj.get(k) != v:
+                    return False
+            return True
+
+        print(type(self))
+        print(self)
+        print(vars(self))
+        curr_objects = [o for o in self.objects_as_list if not matches(o)]
+        self._set_objects(curr_objects)
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+
+        where = query.where_clause or {}
+        match = mongo_query_to_match_function(where)
+        rows = [o for o in self.objects_as_list if match(o)]
+        count = len(rows)
+        return QueryResult(query=query, num_rows=count, rows=rows)
 
     def query_facets(
         self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
     ) -> Dict[str, Dict[str, int]]:
-
-
-
-
-
-
-
-        if
-
-
-
-
-
-
-        rows = list(conn.execute(text(facet_query_str)))
-        results[col] = rows
-        return results
-
-    def _sqla_table(self, cd: ClassDefinition) -> Table:
-        schema_view = self.parent.schema_view
-        metadata_obj = sqla.MetaData()
-        cols = []
-        for att in schema_view.class_induced_slots(cd.name):
-            typ = TMAP.get(att.range, sqla.String)
-            if att.inlined:
-                typ = sqla.JSON
-            if att.multivalued:
-                typ = sqla.ARRAY(typ, dimensions=1)
-            if att.array:
-                typ = sqla.ARRAY(typ, dimensions=1)
-            col = Column(att.name, typ)
-            cols.append(col)
-        t = Table(self.alias, metadata_obj, *cols)
-        return t
-
-    def _create_table(self, cd: ClassDefinition):
-        if self._table_created or self.metadata.is_prepopulated:
-            logger.info(f"Already have table for: {cd.name}")
-            return
-        query = Query(
-            from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
-        )
-        qr = self.parent.query(query)
-        if qr.num_rows > 0:
-            logger.info(f"Table already exists for {cd.name}")
-            self._table_created = True
-            self.metadata.is_prepopulated = True
-            return
-        logger.info(f"Creating table for {cd.name}")
-        t = self._sqla_table(cd)
-        ct = CreateTable(t)
-        ddl = str(ct.compile(self.parent.engine))
-        with self.parent.engine.connect() as conn:
-            conn.execute(text(ddl))
-            conn.commit()
-        self._table_created = True
-        self.metadata.is_prepopulated = True
+        match = mongo_query_to_match_function(where)
+        rows = [o for o in self.objects_as_list if match(o)]
+        if not facet_columns:
+            facet_columns = self.class_definition().attributes.keys()
+        facet_results = {c: {} for c in facet_columns}
+        for row in rows:
+            for fc in facet_columns:
+                if fc in row:
+                    v = row[fc]
+                    if v not in facet_results[fc]:
+                        facet_results[fc][v] = 1
+                    else:
+                        facet_results[fc][v] += 1
+        return {fc: list(facet_results[fc].items()) for fc in facet_results}
```
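`FileSystemCollection.query` and `query_facets` both delegate filtering to `mongo_query_to_match_function` from the new `query_utils` module (+89 lines, not shown here), which compiles a MongoDB-style where clause into a plain predicate over dicts. A hypothetical illustration of the intended behavior, assuming simple key-equality clauses like those handled by the `matches` helper in `delete_where` (the full operator set supported by `query_utils` is not visible in this diff):

```python
from linkml_store.utils.query_utils import mongo_query_to_match_function

# build a predicate from a Mongo-style where clause
match = mongo_query_to_match_function({"name": "Alice"})

assert match({"id": "P1", "name": "Alice"})       # clause matches
assert not match({"id": "P2", "name": "Bob"})     # clause does not match
```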