linkml-store 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of linkml-store might be problematic.
- linkml_store/api/client.py +15 -4
- linkml_store/api/collection.py +185 -15
- linkml_store/api/config.py +11 -3
- linkml_store/api/database.py +36 -5
- linkml_store/api/stores/duckdb/duckdb_collection.py +6 -3
- linkml_store/api/stores/duckdb/duckdb_database.py +20 -1
- linkml_store/api/stores/filesystem/__init__.py +7 -8
- linkml_store/api/stores/filesystem/filesystem_collection.py +150 -113
- linkml_store/api/stores/filesystem/filesystem_database.py +57 -21
- linkml_store/api/stores/mongodb/mongodb_collection.py +82 -34
- linkml_store/api/stores/mongodb/mongodb_database.py +13 -2
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +97 -8
- linkml_store/index/__init__.py +5 -3
- linkml_store/index/indexer.py +7 -2
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/format_utils.py +89 -8
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +46 -0
- linkml_store/webapi/main.py +572 -0
- linkml_store-0.1.10.dist-info/METADATA +138 -0
- linkml_store-0.1.10.dist-info/RECORD +58 -0
- {linkml_store-0.1.8.dist-info → linkml_store-0.1.10.dist-info}/entry_points.txt +1 -0
- linkml_store-0.1.8.dist-info/METADATA +0 -58
- linkml_store-0.1.8.dist-info/RECORD +0 -45
- {linkml_store-0.1.8.dist-info → linkml_store-0.1.10.dist-info}/LICENSE +0 -0
- {linkml_store-0.1.8.dist-info → linkml_store-0.1.10.dist-info}/WHEEL +0 -0
linkml_store/api/stores/filesystem/filesystem_collection.py

@@ -1,142 +1,179 @@
 import logging
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Union

-import sqlalchemy as sqla
-from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
-from sqlalchemy import Column, Table, delete, insert, inspect, text
-from sqlalchemy.sql.ddl import CreateTable
-
 from linkml_store.api import Collection
 from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
-from linkml_store.api.queries import Query
-from linkml_store.api.
-from linkml_store.utils.
+from linkml_store.api.queries import Query, QueryResult
+from linkml_store.api.types import DatabaseType
+from linkml_store.utils.query_utils import mongo_query_to_match_function

 logger = logging.getLogger(__name__)


-class FileSystemCollection(Collection):
-
+class FileSystemCollection(Collection[DatabaseType]):
+    path: Optional[Path] = None
+    file_format: Optional[str] = None
+    encoding: Optional[str] = None
+    _objects_list: List[OBJECT] = None
+    _object_map: Dict[str, OBJECT] = None
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        parent: DatabaseType = self.parent
+        if not self.path:
+            if self.parent:
+                self.path = Path(parent.directory_path)
+        self._objects_list = []
+        self._object_map = {}
+        if not self.file_format:
+            self.file_format = "json"
+
+    @property
+    def path_to_file(self):
+        return Path(self.parent.directory_path) / f"{self.name}.{self.file_format}"
+
+    @property
+    def objects_as_list(self) -> List[OBJECT]:
+        if self._object_map:
+            return list(self._object_map.values())
+        else:
+            return self._objects_list
+
+    def _set_objects(self, objs: List[OBJECT]):
+        pk = self.identifier_attribute_name
+        if pk:
+            self._object_map = {obj[pk]: obj for obj in objs}
+            self._objects_list = []
+        else:
+            self._objects_list = objs
+            self._object_map = {}
+
+    def commit(self):
+        path = self.path_to_file
+        if not path:
+            raise ValueError("Path not set")
+        path.parent.mkdir(parents=True, exist_ok=True)
+        self._save(path)
+
+    def _save(self, path: Path):
+        encoding = self.encoding or "utf-8"
+        fmt = self.file_format or "json"
+        mode = "w"
+        if fmt == "parquet":
+            mode = "wb"
+            encoding = None
+        with open(path, mode, encoding=encoding) as stream:
+            if fmt == "json":
+                import json
+
+                json.dump(self.objects_as_list, stream, indent=2)
+            elif fmt == "jsonl":
+                import jsonlines
+
+                writer = jsonlines.Writer(stream)
+                writer.write_all(self.objects_as_list)
+            elif fmt == "yaml":
+                import yaml
+
+                yaml.dump_all(self.objects_as_list, stream)
+            elif fmt == "parquet":
+                import pandas as pd
+                import pyarrow
+                import pyarrow.parquet as pq
+
+                df = pd.DataFrame(self.objects_as_list)
+                table = pyarrow.Table.from_pandas(df)
+                pq.write_table(table, stream)
+            elif fmt in {"csv", "tsv"}:
+                import csv
+
+                delimiter = "\t" if fmt == "tsv" else ","
+                fieldnames = list(self.objects_as_list[0].keys())
+                for obj in self.objects_as_list[1:]:
+                    fieldnames.extend([k for k in obj.keys() if k not in fieldnames])
+                writer = csv.DictWriter(stream, fieldnames=fieldnames, delimiter=delimiter)
+                writer.writeheader()
+                for obj in self.objects_as_list:
+                    writer.writerow(obj)
+            else:
+                raise ValueError(f"Unsupported file format: {fmt}")

     def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         if not isinstance(objs, list):
             objs = [objs]
         if not objs:
             return
-
-        if
-
-
-
-
-
-
-
-        with engine.connect() as conn:
-            with conn.begin():
-                conn.execute(insert(table), objs)
-            conn.commit()
+        pk = self.identifier_attribute_name
+        if pk:
+            for obj in objs:
+                if pk not in obj:
+                    raise ValueError(f"Primary key {pk} not found in object {obj}")
+                pk_val = obj[pk]
+                self._object_map[pk_val] = obj
+        else:
+            self._objects_list.extend(objs)

     def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
         if not isinstance(objs, list):
             objs = [objs]
-
-
-
-
-
-        with engine.connect() as conn:
+        if not objs:
+            return 0
+        pk = self.identifier_attribute_name
+        n = 0
+        if pk:
             for obj in objs:
-
-
-
-
-
-
+                pk_val = obj[pk]
+                if pk_val in self._object_map:
+                    del self._object_map[pk_val]
+                    n += 1
+        else:
+            n = len(objs)
+            self._objects_list = [o for o in self._objects_list if o not in objs]
+            n = n - len(objs)
+        return n

     def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
         logger.info(f"Deleting from {self.target_class_name} where: {where}")
         if where is None:
             where = {}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return deleted_rows_count if deleted_rows_count > -1 else None
+
+        def matches(obj: OBJECT):
+            for k, v in where.items():
+                if obj.get(k) != v:
+                    return False
+            return True
+
+        print(type(self))
+        print(self)
+        print(vars(self))
+        curr_objects = [o for o in self.objects_as_list if not matches(o)]
+        self._set_objects(curr_objects)
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+
+        where = query.where_clause or {}
+        match = mongo_query_to_match_function(where)
+        rows = [o for o in self.objects_as_list if match(o)]
+        count = len(rows)
+        return QueryResult(query=query, num_rows=count, rows=rows)

     def query_facets(
         self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
     ) -> Dict[str, Dict[str, int]]:
-
-
-
-
-
-
-
-        if
-
-
-
-
-
-
-
-
-        return results
-
-    def _sqla_table(self, cd: ClassDefinition) -> Table:
-        schema_view = self.parent.schema_view
-        metadata_obj = sqla.MetaData()
-        cols = []
-        for att in schema_view.class_induced_slots(cd.name):
-            typ = TMAP.get(att.range, sqla.String)
-            if att.inlined:
-                typ = sqla.JSON
-            if att.multivalued:
-                typ = sqla.ARRAY(typ, dimensions=1)
-            if att.array:
-                typ = sqla.ARRAY(typ, dimensions=1)
-            col = Column(att.name, typ)
-            cols.append(col)
-        t = Table(self.alias, metadata_obj, *cols)
-        return t
-
-    def _create_table(self, cd: ClassDefinition):
-        if self._table_created or self.metadata.is_prepopulated:
-            logger.info(f"Already have table for: {cd.name}")
-            return
-        query = Query(
-            from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
-        )
-        qr = self.parent.query(query)
-        if qr.num_rows > 0:
-            logger.info(f"Table already exists for {cd.name}")
-            self._table_created = True
-            self.metadata.is_prepopulated = True
-            return
-        logger.info(f"Creating table for {cd.name}")
-        t = self._sqla_table(cd)
-        ct = CreateTable(t)
-        ddl = str(ct.compile(self.parent.engine))
-        with self.parent.engine.connect() as conn:
-            conn.execute(text(ddl))
-            conn.commit()
-        self._table_created = True
-        self.metadata.is_prepopulated = True
+        match = mongo_query_to_match_function(where)
+        rows = [o for o in self.objects_as_list if match(o)]
+        if not facet_columns:
+            facet_columns = self.class_definition().attributes.keys()
+        facet_results = {c: {} for c in facet_columns}
+        for row in rows:
+            for fc in facet_columns:
+                if fc in row:
+                    v = row[fc]
+                    if not isinstance(v, str):
+                        v = str(v)
+                    if v not in facet_results[fc]:
+                        facet_results[fc][v] = 1
+                    else:
+                        facet_results[fc][v] += 1
+        return {fc: list(facet_results[fc].items()) for fc in facet_results}
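Note on the rewrite above: FileSystemCollection now holds each collection entirely in memory, either as a plain list or as a dict keyed by the identifier attribute, and query() delegates filtering to mongo_query_to_match_function from the new linkml_store/utils/query_utils.py (+89 lines, not shown in this hunk). As a rough illustration of the MongoDB-style matching that the function's name implies, here is a minimal sketch; the name make_match_function and the supported operator subset ($in, $gt, $lt) are assumptions for illustration, not the module's confirmed API.

from typing import Any, Callable, Dict

def make_match_function(where: Dict[str, Any]) -> Callable[[Dict[str, Any]], bool]:
    # Hypothetical sketch; the real implementation in query_utils.py may differ.
    def matches(obj: Dict[str, Any]) -> bool:
        for key, cond in where.items():
            val = obj.get(key)
            if isinstance(cond, dict):
                # Operator expressions, e.g. {"age": {"$gt": 3}} (assumed subset)
                for op, arg in cond.items():
                    if op == "$in" and val not in arg:
                        return False
                    if op == "$gt" and (val is None or not val > arg):
                        return False
                    if op == "$lt" and (val is None or not val < arg):
                        return False
            elif val != cond:
                # Plain values are equality constraints
                return False
        return True
    return matches

match = make_match_function({"status": "active", "age": {"$gt": 3}})
rows = [o for o in [{"status": "active", "age": 5}, {"status": "inactive", "age": 9}] if match(o)]
assert rows == [{"status": "active", "age": 5}]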
linkml_store/api/stores/filesystem/filesystem_database.py

@@ -1,36 +1,72 @@
 import logging
+from pathlib import Path
 from typing import Optional

-
-from
-from
+import yaml
+from linkml.utils.schema_builder import SchemaBuilder
+from linkml_runtime import SchemaView
+
+from linkml_store.api import Database
+from linkml_store.api.config import DatabaseConfig
 from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
+from linkml_store.utils.format_utils import Format, load_objects

 logger = logging.getLogger(__name__)


 class FileSystemDatabase(Database):
     collection_class = FileSystemCollection
-    wrapped_database: Database = None

-
-
+    directory_path: Optional[Path] = None
+    default_file_format: Optional[str] = None
+
+    def __init__(self, handle: Optional[str] = None, **kwargs):
+        handle = handle.replace("file:", "")
+        if handle.startswith("//"):
+            handle = handle[2:]
+        self.directory_path = Path(handle)
+        self.load_metadata()
         super().__init__(handle=handle, **kwargs)

-
-
-
+    @property
+    def metadata_path(self) -> Path:
+        return self.directory_path / ".linkml_metadata.yaml"
+
+    def load_metadata(self):
+        if self.metadata_path.exists():
+            md_dict = yaml.safe_load(open(self.metadata_path))
+            metadata = DatabaseConfig(**md_dict)
+        else:
+            metadata = DatabaseConfig()
+        self.metadata = metadata

     def close(self, **kwargs):
-
-
-    def
-        self
-
-
-
-
-
-
-
-
+        pass
+
+    def init_collections(self):
+        metadata = self.metadata
+        if self._collections is None:
+            self._collections = {}
+        for name, collection_config in metadata.collections.items():
+            collection = FileSystemCollection(parent=self, **collection_config.dict())
+            self._collections[name] = collection
+        path = self.directory_path
+        if path.exists():
+            for fmt in Format:
+                suffix = fmt.value
+                logger.info(f"Looking for {suffix} files in {path}")
+                for f in path.glob(f"*.{suffix}"):
+                    logger.info(f"Found {f}")
+                    n = f.stem
+                    objs = load_objects(f, suffix, expected_type=list)
+                    collection = FileSystemCollection(parent=self, name=n)
+                    self._collections[n] = collection
+                    collection._set_objects(objs)
+
+    def induce_schema_view(self) -> SchemaView:
+        logger.info(f"Inducing schema view for {self.handle}")
+        sb = SchemaBuilder()
+
+        for collection_name in self.list_collection_names():
+            sb.add_class(collection_name)
+        return SchemaView(sb.schema)
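Note: FileSystemDatabase now derives its directory from the connection handle by stripping the file: scheme and a leading //. A standalone check of that parsing logic, using hypothetical paths; the accepted handle spellings are inferred from the code above rather than documented:

from pathlib import Path

def directory_from_handle(handle: str) -> Path:
    # Mirrors the __init__ logic above: drop the "file:" prefix, then a leading "//"
    handle = handle.replace("file:", "")
    if handle.startswith("//"):
        handle = handle[2:]
    return Path(handle)

assert directory_from_handle("file:///tmp/mydb") == Path("/tmp/mydb")
assert directory_from_handle("file:relative/dir") == Path("relative/dir")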
linkml_store/api/stores/mongodb/mongodb_collection.py

@@ -2,7 +2,6 @@ import logging
 from copy import copy
 from typing import Any, Dict, List, Optional, Tuple, Union

-from linkml_runtime.linkml_model import SlotDefinition
 from pymongo.collection import Collection as MongoCollection

 from linkml_store.api import Collection
@@ -26,19 +25,27 @@ class MongoDBCollection(Collection):
     def mongo_collection(self) -> MongoCollection:
         if not self.name:
             raise ValueError("Collection name not set")
-
+        collection_name = self.alias or self.name
+        return self.parent.native_db[collection_name]

     def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         if not isinstance(objs, list):
             objs = [objs]
         self.mongo_collection.insert_many(objs)
+        # TODO: allow mapping of _id to id for efficiency
+        for obj in objs:
+            del obj["_id"]
+        self._post_insert_hook(objs)

-    def query(self, query: Query, **kwargs) -> QueryResult:
+    def query(self, query: Query, limit: Optional[int] = None, offset: Optional[int] = None, **kwargs) -> QueryResult:
         mongo_filter = self._build_mongo_filter(query.where_clause)
-
-
-
-        cursor =
+        limit = limit or query.limit
+        cursor = self.mongo_collection.find(mongo_filter)
+        if limit and limit >= 0:
+            cursor = cursor.limit(limit)
+        offset = offset or query.offset
+        if offset and offset >= 0:
+            cursor = cursor.skip(offset)

         def _as_row(row: dict):
             row = copy(row)
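Note on the query() change above: limit and offset can now be passed explicitly and fall back to the values carried on the Query object. A sketch of the resulting pagination; it assumes Query accepts these fields as constructor keywords (its attribute access above suggests a pydantic-style model) and uses a hypothetical persons collection:

from linkml_store.api.queries import Query

# Page 3 of 10-row pages: rows 21-30 of the matching documents.
q = Query(from_table="persons", where_clause={"status": "active"}, limit=10, offset=20)
# Inside query() this becomes, roughly:
#     cursor = mongo_collection.find({"status": "active"}).limit(10).skip(20)
# PyMongo applies $skip before $limit server-side, so the chaining order does not matter.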
@@ -57,46 +64,87 @@ class MongoDBCollection(Collection):
             mongo_filter[field] = value
         return mongo_filter

+    from typing import Any, Dict, List, Union
+
     def query_facets(
-        self,
-
+        self,
+        where: Dict = None,
+        facet_columns: List[Union[str, Tuple[str, ...]]] = None,
+        facet_limit=DEFAULT_FACET_LIMIT,
+        **kwargs,
+    ) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
         results = {}
-        cd = self.class_definition()
         if not facet_columns:
             facet_columns = list(self.class_definition().attributes.keys())

         for col in facet_columns:
             logger.debug(f"Faceting on {col}")
+
+            # Handle tuple columns
+            if isinstance(col, tuple):
+                group_id = {k.replace(".", "_"): f"${k}" for k in col}
+                all_fields = col
+            else:
+                group_id = f"${col}"
+                all_fields = [col]
+
+            # Initial pipeline without unwinding
+            facet_pipeline = [
+                {"$match": where} if where else {"$match": {}},
+                {"$group": {"_id": group_id, "count": {"$sum": 1}}},
+                {"$sort": {"count": -1}},
+                {"$limit": facet_limit},
+            ]
+
+            logger.info(f"Initial facet pipeline: {facet_pipeline}")
+            initial_results = list(self.mongo_collection.aggregate(facet_pipeline))
+
+            # Check if we need to unwind based on the results
+            needs_unwinding = False
             if isinstance(col, tuple):
-
+                needs_unwinding = any(
+                    isinstance(result["_id"], dict) and any(isinstance(v, list) for v in result["_id"].values())
+                    for result in initial_results
+                )
+            else:
+                needs_unwinding = any(isinstance(result["_id"], list) for result in initial_results)
+
+            if needs_unwinding:
+                logger.info(f"Detected array values for {col}, unwinding...")
+                facet_pipeline = [{"$match": where} if where else {"$match": {}}]
+
+                # Unwind each field if needed
+                for field in all_fields:
+                    field_parts = field.split(".")
+                    for i in range(len(field_parts)):
+                        facet_pipeline.append({"$unwind": f"${'.'.join(field_parts[:i + 1])}"})
+
+                facet_pipeline.extend(
+                    [
+                        {"$group": {"_id": group_id, "count": {"$sum": 1}}},
+                        {"$sort": {"count": -1}},
+                        {"$limit": facet_limit},
+                    ]
+                )
+
+                logger.info(f"Updated facet pipeline with unwinding: {facet_pipeline}")
+                facet_results = list(self.mongo_collection.aggregate(facet_pipeline))
             else:
-
-
-
-
-
-                group = {"$group": {"_id": f"${col}", "count": {"$sum": 1}}}
+                facet_results = initial_results
+
+            logger.info(f"Facet results: {facet_results}")
+
+            # Process results
             if isinstance(col, tuple):
-
-
-
-
-                {"$match": where} if where else {"$match": {}},
-                {"$unwind": f"${col}"},
-                group,
-                {"$sort": {"count": -1}},
-                {"$limit": facet_limit},
+                results[col] = [
+                    (tuple(result["_id"].values()), result["count"])
+                    for result in facet_results
+                    if result["_id"] is not None and all(v is not None for v in result["_id"].values())
                 ]
             else:
-
-
-                group,
-                {"$sort": {"count": -1}},
-                {"$limit": facet_limit},
+                results[col] = [
+                    (result["_id"], result["count"]) for result in facet_results if result["_id"] is not None
                 ]
-            logger.info(f"Facet pipeline: {facet_pipeline}")
-            facet_results = list(self.mongo_collection.aggregate(facet_pipeline))
-            results[col] = [(result["_id"], result["count"]) for result in facet_results]

         return results

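Note: to make the new faceting pipeline concrete, here is what the code above builds for a tuple facet column, worked by hand in plain Python. The column names are hypothetical and the facet limit of 100 is an assumed value for DEFAULT_FACET_LIMIT:

col = ("address.city", "status")
group_id = {k.replace(".", "_"): f"${k}" for k in col}
assert group_id == {"address_city": "$address.city", "status": "$status"}

facet_pipeline = [
    {"$match": {}},  # no where clause supplied
    {"$group": {"_id": group_id, "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 100},  # facet_limit; DEFAULT_FACET_LIMIT's actual value is assumed
]

# A result document like {"_id": {"address_city": "Oslo", "status": "active"}, "count": 7}
# is post-processed by the code above into the pair (("Oslo", "active"), 7).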
linkml_store/api/stores/mongodb/mongodb_database.py

@@ -29,9 +29,17 @@ class MongoDBDatabase(Database):

     def __init__(self, handle: Optional[str] = None, **kwargs):
         if handle is None:
-            handle = "mongodb://localhost:27017"
+            handle = "mongodb://localhost:27017/test"
         super().__init__(handle=handle, **kwargs)

+    @property
+    def _db_name(self) -> str:
+        if self.handle:
+            db = self.handle.split("/")[-1]
+        else:
+            db = "default"
+        return db
+
     @property
     def native_client(self) -> MongoClient:
         if self._native_client is None:
@@ -44,7 +52,7 @@ class MongoDBDatabase(Database):
         alias = self.metadata.alias
         if not alias:
             alias = "default"
-        self._native_db = self.native_client[
+        self._native_db = self.native_client[self._db_name]
         return self._native_db

     def commit(self, **kwargs):
@@ -58,9 +66,12 @@ class MongoDBDatabase(Database):
         self.native_client.drop_database(self.metadata.alias)

     def query(self, query: Query, **kwargs) -> QueryResult:
+        # TODO: DRY
         if query.from_table:
             collection = self.get_collection(query.from_table)
             return collection.query(query, **kwargs)
+        else:
+            raise NotImplementedError(f"Querying without a table is not supported in {self.__class__.__name__}")

     def init_collections(self):
         if self._collections is None: