linkml-store 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0

linkml_store/api/stores/mongodb/mongodb_collection.py
@@ -0,0 +1,379 @@
+import logging
+from copy import copy
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from pymongo.collection import Collection as MongoCollection
+
+from linkml_store.api import Collection
+from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
+from linkml_store.api.queries import Query, QueryResult
+from linkml_store.utils.object_utils import object_path_get
+
+logger = logging.getLogger(__name__)
+
+
+class MongoDBCollection(Collection):
+    """
+    Adapter for collections in a MongoDB database.
+
+    .. note::
+
+        You should not use or manipulate this class directly.
+        Instead, use the general :class:`linkml_store.api.Collection`
+    """
+
+    @property
+    def mongo_collection(self) -> MongoCollection:
+        # collection_name = self.alias or self.name
+        collection_name = self.alias
+        if not collection_name:
+            raise ValueError("Collection name not set")
+        return self.parent.native_db[collection_name]
+
+    def _check_if_initialized(self) -> bool:
+        return self.alias in self.parent.native_db.list_collection_names()
+
+    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+        if not isinstance(objs, list):
+            objs = [objs]
+        self.mongo_collection.insert_many(objs)
+        # TODO: allow mapping of _id to id for efficiency
+        for obj in objs:
+            del obj["_id"]
+        self._post_insert_hook(objs)
+
+    def index(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        index_name: Optional[str] = None,
+        replace: bool = False,
+        unique: bool = False,
+        **kwargs,
+    ):
+        """
+        Create indexes on the collection.
+
+        :param objs: Field(s) to index.
+        :param index_name: Optional name for the index.
+        :param replace: If True, the index will be dropped and recreated.
+        :param unique: If True, creates a unique index (default: False).
+        """
+
+        if not isinstance(objs, list):
+            objs = [objs]
+
+        existing_indexes = self.mongo_collection.index_information()
+
+        for obj in objs:
+            field_exists = False
+            index_to_drop = None
+
+            # Extract existing index details
+            for index_name_existing, index_details in existing_indexes.items():
+                indexed_fields = [field[0] for field in index_details.get("key", [])]  # Extract field names
+
+                if obj in indexed_fields:  # If this field is already indexed
+                    field_exists = True
+                    index_to_drop = index_name_existing if replace else None
+
+            # Drop the index if replace=True and index_to_drop is valid
+            if index_to_drop:
+                self.mongo_collection.drop_index(index_to_drop)
+                logging.debug(f"Dropped existing index: {index_to_drop}")
+
+            # Create the new index only if it doesn't exist or was dropped
+            if not field_exists or replace:
+                self.mongo_collection.create_index(obj, name=index_name, unique=unique)
+                logging.debug(f"Created new index: {index_name} on field {obj}, unique={unique}")
+            else:
+                logging.debug(f"Index already exists for field {obj}, skipping creation.")
+
+    def upsert(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        filter_fields: List[str],
+        update_fields: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        """
+        Upsert one or more documents into the MongoDB collection.
+
+        :param objs: The document(s) to insert or update.
+        :param filter_fields: List of field names to use as the filter for matching existing documents.
+        :param update_fields: List of field names to include in the update. If None, all fields are updated.
+        """
+        if not isinstance(objs, list):
+            objs = [objs]
+
+        for obj in objs:
+            # Ensure filter fields exist in the object
+            filter_criteria = {field: obj[field] for field in filter_fields if field in obj}
+            if not filter_criteria:
+                raise ValueError("At least one valid filter field must be present in each object.")
+
+            # Check if a document already exists
+            existing_doc = self.mongo_collection.find_one(filter_criteria)
+
+            if existing_doc:
+                # Update only changed fields
+                updates = {key: obj[key] for key in update_fields if key in obj and obj[key] != existing_doc.get(key)}
+
+                if updates:
+                    self.mongo_collection.update_one(filter_criteria, {"$set": updates})
+                    logging.debug(f"Updated existing document: {filter_criteria} with {updates}")
+                else:
+                    logging.debug(f"No changes detected for document: {filter_criteria}. Skipping update.")
+            else:
+                # Insert a new document
+                self.mongo_collection.insert_one(obj)
+                logging.debug(f"Inserted new document: {obj}")
+
+    def query(self, query: Query, limit: Optional[int] = None, offset: Optional[int] = None, **kwargs) -> QueryResult:
+        mongo_filter = self._build_mongo_filter(query.where_clause)
+        limit = limit or query.limit
+
+        # Build projection if select_cols are provided
+        projection = None
+        if query.select_cols:
+            projection = {"_id": 0}
+            for col in query.select_cols:
+                projection[col] = 1
+
+        cursor = self.mongo_collection.find(mongo_filter, projection)
+        if limit and limit >= 0:
+            cursor = cursor.limit(limit)
+        offset = offset or query.offset
+        if offset and offset >= 0:
+            cursor = cursor.skip(offset)
+
+        select_cols = query.select_cols
+
+        def _as_row(row: dict):
+            row = copy(row)
+            if "_id" in row:
+                del row["_id"]
+
+            if select_cols:
+                # For nested fields, ensure we handle them properly
+                result = {}
+                for col in select_cols:
+                    # If it's a nested field (contains dots)
+                    if "." in col or "[" in col:
+                        result[col] = object_path_get(row, col)
+                    elif col in row:
+                        result[col] = row[col]
+                return result
+            return row
+
+        rows = [_as_row(row) for row in cursor]
+        count = self.mongo_collection.count_documents(mongo_filter)
+
+        return QueryResult(query=query, num_rows=count, rows=rows)
+
+    def _build_mongo_filter(self, where_clause: Dict[str, Any]) -> Dict[str, Any]:
+        mongo_filter = {}
+        if where_clause:
+            for field, value in where_clause.items():
+                mongo_filter[field] = value
+        return mongo_filter
+
+    from typing import Any, Dict, List, Union
+
+    def query_facets(
+        self,
+        where: Dict = None,
+        facet_columns: List[Union[str, Tuple[str, ...]]] = None,
+        facet_limit=DEFAULT_FACET_LIMIT,
+        **kwargs,
+    ) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
+        if facet_limit is None:
+            facet_limit = DEFAULT_FACET_LIMIT
+        results = {}
+        if not facet_columns:
+            facet_columns = list(self.class_definition().attributes.keys())
+
+        for col in facet_columns:
+            logger.debug(f"Faceting on {col}")
+
+            # Handle tuple columns
+            if isinstance(col, tuple):
+                group_id = {k.replace(".", "_"): f"${k}" for k in col}
+                all_fields = col
+            else:
+                group_id = f"${col}"
+                all_fields = [col]
+
+            # Initial pipeline without unwinding
+            facet_pipeline = [
+                {"$match": where} if where else {"$match": {}},
+                {"$group": {"_id": group_id, "count": {"$sum": 1}}},
+                {"$sort": {"count": -1}},
+                {"$limit": facet_limit},
+            ]
+
+            logger.info(f"Initial facet pipeline: {facet_pipeline}")
+            initial_results = list(self.mongo_collection.aggregate(facet_pipeline))
+
+            # Check if we need to unwind based on the results
+            needs_unwinding = False
+            if isinstance(col, tuple):
+                needs_unwinding = any(
+                    isinstance(result["_id"], dict) and any(isinstance(v, list) for v in result["_id"].values())
+                    for result in initial_results
+                )
+            else:
+                needs_unwinding = any(isinstance(result["_id"], list) for result in initial_results)
+
+            if needs_unwinding:
+                logger.info(f"Detected array values for {col}, unwinding...")
+                facet_pipeline = [{"$match": where} if where else {"$match": {}}]
+
+                # Unwind each field if needed
+                for field in all_fields:
+                    field_parts = field.split(".")
+                    for i in range(len(field_parts)):
+                        facet_pipeline.append({"$unwind": f"${'.'.join(field_parts[:i + 1])}"})
+
+                facet_pipeline.extend(
+                    [
+                        {"$group": {"_id": group_id, "count": {"$sum": 1}}},
+                        {"$sort": {"count": -1}},
+                        {"$limit": facet_limit},
+                    ]
+                )
+
+                logger.info(f"Updated facet pipeline with unwinding: {facet_pipeline}")
+                facet_results = list(self.mongo_collection.aggregate(facet_pipeline))
+            else:
+                facet_results = initial_results
+
+            logger.info(f"Facet results: {facet_results}")
+
+            # Process results
+            if isinstance(col, tuple):
+                results[col] = [
+                    (tuple(result["_id"].values()), result["count"])
+                    for result in facet_results
+                    if result["_id"] is not None and all(v is not None for v in result["_id"].values())
+                ]
+            else:
+                results[col] = [
+                    (result["_id"], result["count"]) for result in facet_results if result["_id"] is not None
+                ]
+
+        return results
+
+    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
+        if not isinstance(objs, list):
+            objs = [objs]
+        filter_conditions = []
+        for obj in objs:
+            filter_condition = {}
+            for key, value in obj.items():
+                filter_condition[key] = value
+            filter_conditions.append(filter_condition)
+        result = self.mongo_collection.delete_many({"$or": filter_conditions})
+        return result.deleted_count
+
+    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
+        logger.info(f"Deleting from {self.target_class_name} where: {where}")
+        if where is None:
+            where = {}
+        result = self.mongo_collection.delete_many(where)
+        deleted_rows_count = result.deleted_count
+        if deleted_rows_count == 0 and not missing_ok:
+            raise ValueError(f"No rows found for {where}")
+        return deleted_rows_count
+
+    def group_by(
+        self,
+        group_by_fields: List[str],
+        inlined_field="objects",
+        agg_map: Optional[Dict[str, str]] = None,
+        where: Optional[Dict] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Group objects in the collection by specified fields using MongoDB's aggregation pipeline.
+
+        This implementation leverages MongoDB's native aggregation capabilities for efficient grouping.
+
+        :param group_by_fields: List of fields to group by
+        :param inlined_field: Field name to store aggregated objects
+        :param agg_map: Dictionary mapping aggregation types to fields
+        :param where: Filter conditions
+        :param kwargs: Additional arguments
+        :return: Query result containing grouped data
+        """
+        if isinstance(group_by_fields, str):
+            group_by_fields = [group_by_fields]
+
+        # Build the group key for MongoDB
+        if len(group_by_fields) == 1:
+            # Single field grouping
+            group_id = f"${group_by_fields[0]}"
+        else:
+            # Multi-field grouping
+            group_id = {field: f"${field}" for field in group_by_fields}
+
+        # Start building the pipeline
+        pipeline = []
+
+        # Add match stage if where clause is provided
+        if where:
+            pipeline.append({"$match": where})
+
+        # Add the group stage
+        group_stage = {"$group": {"_id": group_id, "objects": {"$push": "$$ROOT"}}}
+        pipeline.append(group_stage)
+
+        # Execute the aggregation
+        logger.debug(f"MongoDB group_by pipeline: {pipeline}")
+        aggregation_results = list(self.mongo_collection.aggregate(pipeline))
+
+        # Transform the results to match the expected format
+        results = []
+        for result in aggregation_results:
+            # Skip null groups if needed
+            if result["_id"] is None and kwargs.get("skip_nulls", False):
+                continue
+
+            # Create the group object
+            if isinstance(result["_id"], dict):
+                # Multi-field grouping
+                group_obj = result["_id"]
+            else:
+                # Single field grouping
+                group_obj = {group_by_fields[0]: result["_id"]}
+
+            # Add the grouped objects
+            objects = result["objects"]
+
+            # Remove MongoDB _id field from each object
+            for obj in objects:
+                if "_id" in obj:
+                    del obj["_id"]
+
+            # Apply any field selection or transformations based on agg_map
+            if agg_map:
+                # Get first fields (fields to keep as single values)
+                first_fields = agg_map.get("first", [])
+                if first_fields:
+                    # These are already in the group_obj from the _id
+                    pass
+
+                # Get list fields (fields to aggregate as lists)
+                list_fields = agg_map.get("list", [])
+                if list_fields:
+                    # Filter objects to only include specified fields
+                    objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]
+                elif not list_fields and first_fields:
+                    # If list_fields is empty but first_fields is specified,
+                    # filter out first_fields from objects to avoid duplication
+                    objects = [{k: v for k, v in obj.items() if k not in first_fields} for obj in objects]
+
+            # Add the objects to the group
+            group_obj[inlined_field] = objects
+            results.append(group_obj)
+
+        return QueryResult(num_rows=len(results), rows=results)
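
The adapter above is normally reached through the generic linkml-store Collection/Database API rather than instantiated directly (see the class docstring's note). As orientation only, the sketch below exercises the insert(), query(), and query_facets() methods shown in this diff; the connection handle, database and collection names, and the sample records are illustrative assumptions, Query is assumed to accept its fields as keyword arguments, and get_collection() is assumed to create the collection on first use.

```python
# Illustrative sketch, not part of the package: assumes linkml-store 0.3.0, pymongo,
# and a MongoDB server listening on localhost:27017.
from linkml_store.api.queries import Query
from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase

db = MongoDBDatabase(handle="mongodb://localhost:27017/demo")  # "demo" becomes the database name
persons = db.get_collection("Person")  # MongoDBCollection adapter defined above

# insert() accepts a single dict or a list of dicts
persons.insert([
    {"id": "P1", "name": "Alice", "tags": ["staff", "admin"]},
    {"id": "P2", "name": "Bob", "tags": ["staff"]},
])

# where_clause is passed through as a MongoDB filter; select_cols becomes a projection
result = persons.query(Query(from_table="Person", where_clause={"name": "Alice"}, select_cols=["id", "name"]))
print(result.num_rows, result.rows)

# query_facets() builds a $group pipeline per column and unwinds list-valued fields such as "tags"
print(persons.query_facets(facet_columns=["tags"]))

db.close()
```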

linkml_store/api/stores/mongodb/mongodb_database.py
@@ -0,0 +1,114 @@
+# mongodb_database.py
+
+import logging
+from pathlib import Path
+from typing import Optional, Union
+from urllib.parse import urlparse
+
+from pymongo import MongoClient
+from pymongo.database import Database as NativeDatabase
+
+from linkml_store.api import Database
+from linkml_store.api.queries import Query, QueryResult
+from linkml_store.api.stores.mongodb.mongodb_collection import MongoDBCollection
+from linkml_store.utils.file_utils import safe_remove_directory
+from linkml_store.utils.format_utils import Format
+from linkml_store.utils.mongodb_utils import import_mongodb
+
+logger = logging.getLogger(__name__)
+
+
+class MongoDBDatabase(Database):
+    """
+    An adapter for MongoDB databases.
+
+    The LinkML-Store Database abstraction combines mongodb Client and Database.
+    """
+
+    _native_client: MongoClient = None
+    _native_db = None
+    collection_class = MongoDBCollection
+
+    def __init__(self, handle: Optional[str] = None, **kwargs):
+        if handle is None:
+            handle = "mongodb://localhost:27017/test"
+        if handle == "mongodb":
+            handle = "mongodb://localhost:27017/temporary"
+        super().__init__(handle=handle, **kwargs)
+
+    @property
+    def _db_name(self) -> str:
+        if self.handle:
+            parsed_url = urlparse(self.handle)
+            path_parts = parsed_url.path.lstrip("/").split("?")[0].split("/")
+            db_name = path_parts[0] if path_parts else "default"
+            if not db_name:
+                db_name = self.alias
+        else:
+            db_name = "default"
+        return db_name
+
+    @property
+    def native_client(self) -> MongoClient:
+        if self._native_client is None:
+            self._native_client = MongoClient(self.handle)
+        return self._native_client
+
+    @property
+    def native_db(self) -> NativeDatabase:
+        if self._native_db is None:
+            alias = self.metadata.alias
+            if not alias:
+                alias = "default"
+            self._native_db = self.native_client[self._db_name]
+        return self._native_db
+
+    def commit(self, **kwargs):
+        pass
+
+    def close(self, **kwargs):
+        if self._native_client:
+            self._native_client.close()
+
+    def drop(self, **kwargs):
+        self.native_client.drop_database(self.native_db.name)
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        if query.from_table:
+            collection = self.get_collection(query.from_table)
+            return collection.query(query, **kwargs)
+        else:
+            raise NotImplementedError(f"Querying without a table is not supported in {self.__class__.__name__}")
+
+    def init_collections(self):
+        if self._collections is None:
+            self._collections = {}
+
+        for collection_name in self.native_db.list_collection_names():
+            if collection_name not in self._collections:
+                collection = MongoDBCollection(name=collection_name, parent=self)
+                self._collections[collection_name] = collection
+
+    def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
+        if target_format == Format.DUMP_MONGODB.value or target_format == Format.DUMP_MONGODB:
+            path = Path(location)
+            if path.exists():
+                safe_remove_directory(path, no_backup=True)
+            from linkml_store.utils.mongodb_utils import export_mongodb
+
+            export_mongodb(self.handle, location)
+        else:
+            super().export_database(location, target_format=target_format, **kwargs)
+
+    def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
+        """
+        Import a database from a file or location.
+
+        :param location: location of the file
+        :param source_format: source format
+        :param kwargs: additional arguments
+        """
+        if source_format == Format.DUMP_MONGODB.value or source_format == Format.DUMP_MONGODB:
+            import_mongodb(self.handle, location, drop=True)
+        else:
+            super().import_database(location, source_format=source_format, **kwargs)
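
For the Format.DUMP_MONGODB branches above, export_database() and import_database() delegate to export_mongodb() and import_mongodb() from linkml_store.utils.mongodb_utils; any other format falls through to the generic implementations on the base Database class. A minimal round-trip sketch, with the handle and dump directory as illustrative assumptions:

```python
# Illustrative sketch, not part of the package: assumes a local MongoDB server and
# write access to the dump directory.
from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
from linkml_store.utils.format_utils import Format

db = MongoDBDatabase(handle="mongodb://localhost:27017/demo")

# DUMP_MONGODB routes to export_mongodb(handle, location); an existing target
# directory is removed first via safe_remove_directory()
db.export_database("/tmp/demo-dump", target_format=Format.DUMP_MONGODB)

# The matching import calls import_mongodb(handle, location, drop=True), replacing
# the database contents with the dump
db.import_database("/tmp/demo-dump", source_format=Format.DUMP_MONGODB)

db.close()
```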