linkml-store 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/api/stores/filesystem/filesystem_collection.py

@@ -0,0 +1,186 @@

import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from linkml_store.api import Collection
from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
from linkml_store.api.queries import Query, QueryResult
from linkml_store.api.types import DatabaseType
from linkml_store.utils.query_utils import mongo_query_to_match_function

logger = logging.getLogger(__name__)


class FileSystemCollection(Collection[DatabaseType]):
    path: Optional[Path] = None
    file_format: Optional[str] = None
    encoding: Optional[str] = None
    _objects_list: List[OBJECT] = None
    _object_map: Dict[str, OBJECT] = None

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        parent: DatabaseType = self.parent
        if not self.path:
            if self.parent:
                self.path = Path(parent.directory_path)
        self._objects_list = []
        self._object_map = {}
        if not self.file_format:
            self.file_format = "json"

    @property
    def path_to_file(self) -> Path:
        return Path(self.parent.directory_path) / f"{self.alias}.{self.file_format}"

    @property
    def objects_as_list(self) -> List[OBJECT]:
        if self._object_map:
            return list(self._object_map.values())
        else:
            return self._objects_list

    def _set_objects(self, objs: List[OBJECT]):
        pk = self.identifier_attribute_name
        if pk:
            self._object_map = {obj[pk]: obj for obj in objs}
            self._objects_list = []
        else:
            self._objects_list = objs
            self._object_map = {}

    def commit(self):
        path = self.path_to_file
        if not path:
            raise ValueError("Path not set")
        path.parent.mkdir(parents=True, exist_ok=True)
        self._save(path)

    def _save(self, path: Path):
        encoding = self.encoding or "utf-8"
        fmt = self.file_format or "json"
        mode = "w"
        if fmt == "parquet":
            mode = "wb"
            encoding = None
        with open(path, mode, encoding=encoding) as stream:
            if fmt == "json":
                import json

                json.dump(self.objects_as_list, stream, indent=2)
            elif fmt == "jsonl":
                import jsonlines

                writer = jsonlines.Writer(stream)
                writer.write_all(self.objects_as_list)
            elif fmt == "yaml":
                import yaml

                yaml.dump_all(self.objects_as_list, stream)
            elif fmt == "parquet":
                import pandas as pd
                import pyarrow
                import pyarrow.parquet as pq

                df = pd.DataFrame(self.objects_as_list)
                table = pyarrow.Table.from_pandas(df)
                pq.write_table(table, stream)
            elif fmt in {"csv", "tsv"}:
                import csv

                delimiter = "\t" if fmt == "tsv" else ","
                fieldnames = list(self.objects_as_list[0].keys())
                for obj in self.objects_as_list[1:]:
                    fieldnames.extend([k for k in obj.keys() if k not in fieldnames])
                writer = csv.DictWriter(stream, fieldnames=fieldnames, delimiter=delimiter)
                writer.writeheader()
                for obj in self.objects_as_list:
                    writer.writerow(obj)
            else:
                raise ValueError(f"Unsupported file format: {fmt}")

    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
        if not isinstance(objs, list):
            objs = [objs]
        if not objs:
            return
        pk = self.identifier_attribute_name
        if pk:
            for obj in objs:
                if pk not in obj:
                    raise ValueError(f"Primary key {pk} not found in object {obj}")
                pk_val = obj[pk]
                self._object_map[pk_val] = obj
        else:
            self._objects_list.extend(objs)

    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
        if not isinstance(objs, list):
            objs = [objs]
        if not objs:
            return 0
        pk = self.identifier_attribute_name
        n = 0
        if pk:
            for obj in objs:
                pk_val = obj[pk]
                if pk_val in self._object_map:
                    del self._object_map[pk_val]
                    n += 1
        else:
            # count only the objects actually removed from the list
            n_before = len(self._objects_list)
            self._objects_list = [o for o in self._objects_list if o not in objs]
            n = n_before - len(self._objects_list)
        return n

    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
        logger.info(f"Deleting from {self.target_class_name} where: {where}")
        if where is None:
            where = {}

        def matches(obj: OBJECT):
            for k, v in where.items():
                if obj.get(k) != v:
                    return False
            return True

        n_before = len(self.objects_as_list)
        curr_objects = [o for o in self.objects_as_list if not matches(o)]
        self._set_objects(curr_objects)
        return n_before - len(curr_objects)

    def query(self, query: Query, limit: Optional[int] = None, offset: Optional[int] = None, **kwargs) -> QueryResult:
        limit = limit or query.limit
        offset = offset or query.offset
        if offset is None:
            offset = 0
        where = query.where_clause or {}
        match = mongo_query_to_match_function(where)
        rows = [o for o in self.objects_as_list if match(o)]
        count = len(rows)
        if limit is None or limit < 0:
            limit = count
        # TODO: avoid recalculating
        returned_rows = rows[offset : offset + limit]
        return QueryResult(query=query, num_rows=count, rows=returned_rows)

    def query_facets(
        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
    ) -> Dict[str, List[Tuple[str, int]]]:
        match = mongo_query_to_match_function(where or {})
        rows = [o for o in self.objects_as_list if match(o)]
        if not facet_columns:
            facet_columns = self.class_definition().attributes.keys()
        facet_results = {c: {} for c in facet_columns}
        for row in rows:
            for fc in facet_columns:
                if fc in row:
                    v = row[fc]
                    if not isinstance(v, str):
                        v = str(v)
                    if v not in facet_results[fc]:
                        facet_results[fc][v] = 1
                    else:
                        facet_results[fc][v] += 1
        # apply facet_limit, keeping the most frequent values (as in the HDF5 store)
        return {
            fc: sorted(facet_results[fc].items(), key=lambda x: x[1], reverse=True)[:facet_limit]
            for fc in facet_results
        }
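
For orientation, a minimal usage sketch of the collection above. It assumes the Collection base class accepts name= and derives alias from it (as init_collections in the database module below does) and that the jsonlines dependency is installed; the directory and object values are hypothetical.

from linkml_store.api.queries import Query
from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase

# hypothetical directory; the "file://" prefix is stripped by FileSystemDatabase.__init__ (next hunk)
db = FileSystemDatabase(handle="file:///tmp/demo_store")
coll = FileSystemCollection(parent=db, name="persons", file_format="jsonl")
coll.insert([{"id": "p1", "name": "Alice"}, {"id": "p2", "name": "Bob"}])
coll.commit()  # serializes via _save() to /tmp/demo_store/persons.jsonl

result = coll.query(Query(where_clause={"name": "Alice"}))
print(result.num_rows, result.rows)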
linkml_store/api/stores/filesystem/filesystem_database.py

@@ -0,0 +1,81 @@

import logging
from pathlib import Path
from typing import Optional

import yaml
from linkml_runtime import SchemaView
from linkml_runtime.utils.schema_builder import SchemaBuilder

from linkml_store.api import Database
from linkml_store.api.config import DatabaseConfig
from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
from linkml_store.utils.file_utils import safe_remove_directory
from linkml_store.utils.format_utils import Format, load_objects

logger = logging.getLogger(__name__)


class FileSystemDatabase(Database):
    collection_class = FileSystemCollection

    directory_path: Optional[Path] = None
    default_file_format: Optional[str] = None

    no_backup_on_drop: bool = False

    def __init__(self, handle: Optional[str] = None, **kwargs):
        # guard against a None handle, which the signature allows
        handle = (handle or "").replace("file:", "")
        if handle.startswith("//"):
            handle = handle[2:]
        self.directory_path = Path(handle)
        self.load_metadata()
        super().__init__(handle=handle, **kwargs)

    @property
    def metadata_path(self) -> Path:
        return self.directory_path / ".linkml_metadata.yaml"

    def load_metadata(self):
        if self.metadata_path.exists():
            with open(self.metadata_path) as stream:
                md_dict = yaml.safe_load(stream)
            metadata = DatabaseConfig(**md_dict)
        else:
            metadata = DatabaseConfig()
        self.metadata = metadata

    def close(self, **kwargs):
        pass

    def drop(self, no_backup=False, **kwargs):
        self.close()
        path = self.directory_path
        if path.exists():
            safe_remove_directory(path, no_backup=self.no_backup_on_drop or no_backup)

    def init_collections(self):
        metadata = self.metadata
        if self._collections is None:
            self._collections = {}
        for name, collection_config in metadata.collections.items():
            collection = FileSystemCollection(parent=self, **collection_config.dict())
            self._collections[name] = collection
        path = self.directory_path
        if path.exists():
            for fmt in Format:
                suffix = fmt.value
                logger.info(f"Looking for {suffix} files in {path}")
                for f in path.glob(f"*.{suffix}"):
                    logger.info(f"Found {f}")
                    n = f.stem
                    objs = load_objects(f, suffix, expected_type=list)
                    collection = FileSystemCollection(parent=self, name=n)
                    self._collections[n] = collection
                    collection._set_objects(objs)

    def xxxinduce_schema_view(self) -> SchemaView:
        # disabled: the "xxx" prefix keeps this from overriding Database.induce_schema_view
        logger.info(f"Inducing schema view for {self.handle}")
        sb = SchemaBuilder()

        for collection_name in self.list_collection_names():
            sb.add_class(collection_name)
        return SchemaView(sb.schema)
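
As an illustration, a sketch of the optional .linkml_metadata.yaml file that load_metadata() parses into a DatabaseConfig. The collections key is taken from init_collections() above; the per-collection fields shown are hypothetical and depend on the package's config classes.

import yaml

# hypothetical metadata; load_metadata() would read this into DatabaseConfig
md = {
    "collections": {
        "persons": {"type": "Person"},  # field names here are illustrative
    }
}
with open("/tmp/demo_store/.linkml_metadata.yaml", "w") as stream:
    yaml.safe_dump(md, stream)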
linkml_store/api/stores/hdf5/hdf5_collection.py

@@ -0,0 +1,104 @@

import json
import logging
from typing import Any, Dict, List, Optional, Tuple, Union

import h5py

from linkml_store.api import Collection
from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
from linkml_store.api.queries import Query, QueryResult

logger = logging.getLogger(__name__)


class HDF5Collection(Collection):

    @property
    def hdf5_group(self) -> h5py.Group:
        return self.parent.file[self.name]

    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
        if not isinstance(objs, list):
            objs = [objs]

        for obj in objs:
            if "id" not in obj:
                raise ValueError("Each object must have an 'id' field.")
            obj_id = str(obj["id"])
            for key, value in obj.items():
                if key == "id":
                    continue
                if isinstance(value, (dict, list)):
                    value = json.dumps(value)
                self.hdf5_group.create_dataset(f"{obj_id}/{key}", data=value)

    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
        if not isinstance(objs, list):
            objs = [objs]
        count = 0
        for obj in objs:
            if "id" not in obj:
                raise ValueError("Each object must have an 'id' field.")
            obj_id = str(obj["id"])
            if obj_id in self.hdf5_group:
                del self.hdf5_group[obj_id]
                count += 1
        return count

    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
        logger.info(f"Deleting from {self.target_class_name} where: {where}")
        if where is None:
            where = {}
        results = self.query(Query(where_clause=where)).rows
        count = self.delete(results)
        return count

    def query(self, query: Query, **kwargs) -> QueryResult:
        results = []
        for obj_id in self.hdf5_group:
            obj = {"id": obj_id}
            for key, value in self.hdf5_group[obj_id].items():
                try:
                    obj[key] = json.loads(value[()])
                except (json.JSONDecodeError, TypeError):
                    # non-JSON scalars (e.g. numeric datasets) are kept as-is
                    obj[key] = value[()]
            if self._match_where_clause(obj, query.where_clause):
                results.append(obj)

        count = len(results)
        if query.limit:
            results = results[: query.limit]
        return QueryResult(query=query, num_rows=count, rows=results)

    def query_facets(
        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
    ) -> Dict[str, List[Tuple[Any, int]]]:
        results = {}
        if not facet_columns:
            facet_columns = list(self.class_definition().attributes.keys())

        for col in facet_columns:
            logger.debug(f"Faceting on {col}")
            facet_counts = {}
            for obj in self.query(Query(where_clause=where)).rows:
                if col in obj:
                    value = obj[col]
                    if isinstance(value, list):
                        for v in value:
                            facet_counts[v] = facet_counts.get(v, 0) + 1
                    else:
                        facet_counts[value] = facet_counts.get(value, 0) + 1
            facet_counts = sorted(facet_counts.items(), key=lambda x: x[1], reverse=True)[:facet_limit]
            results[col] = facet_counts

        return results

    def _match_where_clause(self, obj: Dict[str, Any], where_clause: Optional[Dict[str, Any]]) -> bool:
        if where_clause is None:
            return True
        for key, value in where_clause.items():
            if key not in obj:
                return False
            if obj[key] != value:
                return False
        return True
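
A minimal usage sketch of the layout this class implies: one HDF5 group per object id, one dataset per field, with nested values JSON-encoded. It assumes the generic Database.get_collection API and uses h5py's require_group so the collection's group exists before insert; the file path and objects are hypothetical.

from linkml_store.api.queries import Query
from linkml_store.api.stores.hdf5.hdf5_database import HDF5Database

db = HDF5Database(handle="/tmp/demo.h5")  # hypothetical file, opened lazily in append mode
db.file.require_group("persons")          # ensure the collection's group exists
coll = db.get_collection("persons")       # assumes the generic Database.get_collection API
coll.insert({"id": "p1", "name": "Alice", "tags": ["x", "y"]})
db.commit()  # flushes the underlying h5py.File

print(coll.query(Query(where_clause={"name": "Alice"})).rows)
print(coll.query_facets(facet_columns=["name"]))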
linkml_store/api/stores/hdf5/hdf5_database.py

@@ -0,0 +1,79 @@

# hdf5_database.py

import logging
from typing import Optional

import h5py
from linkml_runtime import SchemaView
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from linkml_runtime.utils.schema_builder import SchemaBuilder

from linkml_store.api import Database
from linkml_store.api.queries import Query, QueryResult
from linkml_store.api.stores.hdf5.hdf5_collection import HDF5Collection

logger = logging.getLogger(__name__)


class HDF5Database(Database):
    _file: h5py.File = None
    collection_class = HDF5Collection

    def __init__(self, handle: Optional[str] = None, **kwargs):
        if handle is None:
            handle = "linkml_store.h5"
        super().__init__(handle=handle, **kwargs)

    @property
    def file(self) -> h5py.File:
        if self._file is None:
            self._file = h5py.File(self.handle, "a")
        return self._file

    def commit(self, **kwargs):
        self.file.flush()

    def close(self, **kwargs):
        if self._file:
            self._file.close()
            self._file = None  # allow reopening via the file property

    def query(self, query: Query, **kwargs) -> QueryResult:
        if query.from_table:
            collection = self.get_collection(query.from_table)
            return collection.query(query, **kwargs)
        raise ValueError(f"Query must specify from_table: {query}")

    def init_collections(self):
        if self._collections is None:
            self._collections = {}

        for collection_name in self.file:
            if collection_name not in self._collections:
                collection = HDF5Collection(name=collection_name, parent=self)
                self._collections[collection_name] = collection

    def induce_schema_view(self) -> SchemaView:
        logger.info(f"Inducing schema view for {self.handle}")
        sb = SchemaBuilder()
        schema = sb.schema

        for collection_name in self.file:
            sb.add_class(collection_name)
            hdf5_group = self.file[collection_name]
            for field in hdf5_group:
                if field == "_id":
                    continue
                sd = SlotDefinition(field)
                if isinstance(hdf5_group[field][()], list):
                    sd.multivalued = True
                sb.schema.classes[collection_name].attributes[sd.name] = sd

        sb.add_defaults()
        # iterate over a copy of the keys, since entries may be replaced below
        for cls_name in list(schema.classes):
            if cls_name in self.metadata.collections:
                collection_metadata = self.metadata.collections[cls_name]
                if collection_metadata.attributes:
                    del schema.classes[cls_name]
                    cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
                    schema.classes[cls.name] = cls

        return SchemaView(schema)
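
Continuing the earlier sketch, the database-level query() above routes by from_table. Assuming Query accepts from_table as a constructor argument (its attribute is read above), dispatch looks like:

from linkml_store.api.queries import Query

# routed to the "persons" collection via HDF5Database.query
res = db.query(Query(from_table="persons", where_clause={"name": "Alice"}))
print(res.num_rows)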