linkml_store-0.2.5-py3-none-any.whl → linkml_store-0.2.9-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of linkml-store might be problematic.
- linkml_store/api/client.py +9 -6
- linkml_store/api/collection.py +118 -5
- linkml_store/api/database.py +45 -14
- linkml_store/api/stores/duckdb/duckdb_collection.py +176 -8
- linkml_store/api/stores/duckdb/duckdb_database.py +52 -19
- linkml_store/api/stores/filesystem/__init__.py +1 -1
- linkml_store/api/stores/mongodb/mongodb_collection.py +186 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +8 -3
- linkml_store/api/stores/solr/solr_collection.py +7 -1
- linkml_store/cli.py +202 -21
- linkml_store/index/implementations/llm_indexer.py +14 -6
- linkml_store/index/indexer.py +7 -4
- linkml_store/inference/implementations/llm_inference_engine.py +13 -9
- linkml_store/inference/implementations/rag_inference_engine.py +13 -10
- linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
- linkml_store/inference/inference_config.py +1 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/format_utils.py +183 -3
- linkml_store/utils/llm_utils.py +3 -1
- linkml_store/utils/pandas_utils.py +1 -1
- linkml_store/utils/sql_utils.py +7 -1
- linkml_store/utils/vector_utils.py +4 -11
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/METADATA +4 -3
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/RECORD +28 -26
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/WHEEL +1 -1
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/LICENSE +0 -0
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/entry_points.txt +0 -0
--- a/linkml_store/api/stores/duckdb/duckdb_database.py
+++ b/linkml_store/api/stores/duckdb/duckdb_database.py
@@ -1,7 +1,7 @@
 import json
 import logging
 from pathlib import Path
-from typing import Optional, Union
+from typing import Optional, Union, List

 import pandas as pd
 import sqlalchemy
@@ -14,7 +14,7 @@ from linkml_store.api import Database
 from linkml_store.api.queries import Query, QueryResult
 from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
 from linkml_store.utils.format_utils import Format
-from linkml_store.utils.sql_utils import introspect_schema, query_to_sql
+from linkml_store.utils.sql_utils import introspect_schema, query_to_sql, where_clause_to_sql

 TYPE_MAP = {
     "VARCHAR": "string",
@@ -62,7 +62,7 @@ class DuckDBDatabase(Database):
     def engine(self) -> sqlalchemy.Engine:
         if not self._engine:
             handle = self.handle
-            if not handle.startswith("duckdb://") and not handle.startswith(":"):
+            if not handle.startswith("duckdb://") and not handle.startswith(":") and "://" not in handle:
                 handle = f"duckdb:///{handle}"
             if ":memory:" not in handle:
                 # TODO: investigate this; duckdb appears to be prematurely caching
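Note: with this change, only bare filesystem paths get the `duckdb:///` prefix; handles that already carry a scheme (for example a `sqlite:` URL) or start with `:` (such as `:memory:`) are passed to SQLAlchemy unchanged. A minimal, self-contained sketch of that normalization logic (the function name is illustrative, not part of the library):

```python
def normalize_handle(handle: str) -> str:
    """Sketch of the revised normalization: only bare paths get the duckdb:/// prefix."""
    if not handle.startswith("duckdb://") and not handle.startswith(":") and "://" not in handle:
        handle = f"duckdb:///{handle}"
    return handle

assert normalize_handle("/tmp/db.duckdb") == "duckdb:////tmp/db.duckdb"
assert normalize_handle("sqlite:///tmp/other.db") == "sqlite:///tmp/other.db"  # now left as-is
assert normalize_handle(":memory:") == ":memory:"
```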
@@ -71,6 +71,10 @@ class DuckDBDatabase(Database):
             self._engine = sqlalchemy.create_engine(handle)
         return self._engine

+    @property
+    def _is_sqlite(self) -> bool:
+        return self.handle and self.handle.startswith("sqlite:")
+
     def commit(self, **kwargs):
         with self.engine.connect() as conn:
             conn.commit()
@@ -89,34 +93,60 @@ class DuckDBDatabase(Database):
             if not missing_ok:
                 raise FileNotFoundError(f"Database file not found: {path}")

-    def
+    def _table_exists(self, table: str) -> bool:
+        if self._is_sqlite:
+            if table == "sqlite_master":
+                return True
+            meta_query = Query(
+                from_table="sqlite_master",
+                where_clause={
+                    # "type": "table",
+                    "name": table,
+                },
+            )
+        else:
+            if table.startswith("information_schema"):
+                return True
+            meta_query = Query(
+                from_table="information_schema.tables",
+                where_clause={
+                    "table_type": "BASE TABLE",
+                    "table_name": table,
+                },
+            )
+
+        qr = self.query(meta_query)
+        if qr.num_rows == 0:
+            logger.debug(f"Table {self.alias} not created yet")
+            return False
+        return True
+
+    def _json_encoded_cols(self, table_name: str) -> Optional[List[str]]:
         json_encoded_cols = []
-        if
-            if
-
-
-        )
-        qr = self.query(meta_query)
-        if qr.num_rows == 0:
-            logger.debug(f"Table {query.from_table} not created yet")
-            return QueryResult(query=query, num_rows=0, rows=[])
-        if not query.from_table.startswith("information_schema"):
-            sv = self.schema_view
-        else:
-            sv = None
+        if table_name:
+            if table_name.startswith("information_schema") or table_name.startswith("sqlite"):
+                return []
+        sv = self.schema_view
         if sv:
             cd = None
             for c in self._collections.values():
-
-                if c.alias == query.from_table or c.target_class_name == query.from_table:
+                if c.alias == table_name or c.target_class_name == table_name:
                     cd = c.class_definition()
                     break
             if cd:
                 for att in sv.class_induced_slots(cd.name):
                     if att.inlined or att.inlined_as_list:
                         json_encoded_cols.append(att.name)
+        return json_encoded_cols
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        if not self._table_exists(query.from_table):
+            return QueryResult(query=query, num_rows=0, rows=[])
+        json_encoded_cols = self._json_encoded_cols(query.from_table)
+
         with self.engine.connect() as conn:
             count_query_str = text(query_to_sql(query, count=True))
+            logger.debug(f"count_query_str: {count_query_str}")
             num_rows = list(conn.execute(count_query_str))[0][0]
             logger.debug(f"num_rows: {num_rows}")
             query_str = query_to_sql(query, **kwargs)  # include offset, limit
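Note: the refactor splits the old monolithic `query()` into `_table_exists()` and `_json_encoded_cols()`, and the existence check now probes the right catalog per backend (`sqlite_master` for SQLite handles, `information_schema.tables` for DuckDB). A rough standalone illustration of those catalog probes, using plain SQLAlchemy; `table_exists` here is a throwaway helper, not part of linkml-store:

```python
from sqlalchemy import create_engine, text

def table_exists(engine, table: str, is_sqlite: bool) -> bool:
    # SQLite keeps its catalog in sqlite_master; DuckDB exposes the standard information_schema.
    if is_sqlite:
        sql = "SELECT name FROM sqlite_master WHERE name = :t"
    else:
        sql = ("SELECT table_name FROM information_schema.tables "
               "WHERE table_type = 'BASE TABLE' AND table_name = :t")
    with engine.connect() as conn:
        return conn.execute(text(sql), {"t": table}).first() is not None

engine = create_engine("sqlite:///:memory:")
print(table_exists(engine, "persons", is_sqlite=True))  # False: nothing created yet
```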
@@ -167,6 +197,9 @@ class DuckDBDatabase(Database):
         logger.info(f"Inducing schema view for {self.metadata.handle} // {self}")
         sb = SchemaBuilder()
         schema = sb.schema
+        logger.info(f"Checking if {self.metadata.handle} is sqlite: {self._is_sqlite}")
+        if self._is_sqlite:
+            return SchemaView(schema)
         query = Query(from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE"})
         qr = self.query(query)
         logger.info(f"Found {qr.num_rows} information_schema.tables // {qr.rows}")
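Note: for SQLite handles, schema induction now short-circuits and returns a view over the empty builder schema rather than introspecting `information_schema`, which SQLite does not provide. A minimal sketch of that fallback, assuming the same linkml_runtime classes the hunk uses:

```python
from linkml_runtime import SchemaView
from linkml_runtime.utils.schema_builder import SchemaBuilder

sb = SchemaBuilder()
schema = sb.schema
sv = SchemaView(schema)       # empty but valid SchemaView
print(len(sv.all_classes()))  # 0 -- no classes induced for sqlite backends
```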
--- a/linkml_store/api/stores/filesystem/__init__.py
+++ b/linkml_store/api/stores/filesystem/__init__.py
@@ -4,7 +4,7 @@ Adapter for FileSystem wrapper
 Handles have the form:

 - ``file:<path>`` for a local file
-
+"""

 from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
 from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
--- a/linkml_store/api/stores/mongodb/mongodb_collection.py
+++ b/linkml_store/api/stores/mongodb/mongodb_collection.py
@@ -41,6 +41,92 @@ class MongoDBCollection(Collection):
                 del obj["_id"]
         self._post_insert_hook(objs)

+    def index(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        index_name: Optional[str] = None,
+        replace: bool = False,
+        unique: bool = False,
+        **kwargs,
+    ):
+        """
+        Create indexes on the collection.
+
+        :param objs: Field(s) to index.
+        :param index_name: Optional name for the index.
+        :param replace: If True, the index will be dropped and recreated.
+        :param unique: If True, creates a unique index (default: False).
+        """
+
+        if not isinstance(objs, list):
+            objs = [objs]
+
+        existing_indexes = self.mongo_collection.index_information()
+
+        for obj in objs:
+            field_exists = False
+            index_to_drop = None
+
+            # Extract existing index details
+            for index_name_existing, index_details in existing_indexes.items():
+                indexed_fields = [field[0] for field in index_details.get("key", [])]  # Extract field names
+
+                if obj in indexed_fields:  # If this field is already indexed
+                    field_exists = True
+                    index_to_drop = index_name_existing if replace else None
+
+            # Drop the index if replace=True and index_to_drop is valid
+            if index_to_drop:
+                self.mongo_collection.drop_index(index_to_drop)
+                logging.debug(f"Dropped existing index: {index_to_drop}")
+
+            # Create the new index only if it doesn't exist or was dropped
+            if not field_exists or replace:
+                self.mongo_collection.create_index(obj, name=index_name, unique=unique)
+                logging.debug(f"Created new index: {index_name} on field {obj}, unique={unique}")
+            else:
+                logging.debug(f"Index already exists for field {obj}, skipping creation.")
+
+    def upsert(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        filter_fields: List[str],
+        update_fields: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        """
+        Upsert one or more documents into the MongoDB collection.
+
+        :param objs: The document(s) to insert or update.
+        :param filter_fields: List of field names to use as the filter for matching existing documents.
+        :param update_fields: List of field names to include in the update. If None, all fields are updated.
+        """
+        if not isinstance(objs, list):
+            objs = [objs]
+
+        for obj in objs:
+            # Ensure filter fields exist in the object
+            filter_criteria = {field: obj[field] for field in filter_fields if field in obj}
+            if not filter_criteria:
+                raise ValueError("At least one valid filter field must be present in each object.")
+
+            # Check if a document already exists
+            existing_doc = self.mongo_collection.find_one(filter_criteria)
+
+            if existing_doc:
+                # Update only changed fields
+                updates = {key: obj[key] for key in update_fields if key in obj and obj[key] != existing_doc.get(key)}
+
+                if updates:
+                    self.mongo_collection.update_one(filter_criteria, {"$set": updates})
+                    logging.debug(f"Updated existing document: {filter_criteria} with {updates}")
+                else:
+                    logging.debug(f"No changes detected for document: {filter_criteria}. Skipping update.")
+            else:
+                # Insert a new document
+                self.mongo_collection.insert_one(obj)
+                logging.debug(f"Inserted new document: {obj}")
+
     def query(self, query: Query, limit: Optional[int] = None, offset: Optional[int] = None, **kwargs) -> QueryResult:
         mongo_filter = self._build_mongo_filter(query.where_clause)
         limit = limit or query.limit
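Note: `index()` and `upsert()` are new public methods on `MongoDBCollection`. A hypothetical usage sketch; it assumes the `Client.attach_database` / `Database.create_collection` / `Collection.insert` API behaves as in the linkml-store documentation, and the connection string, collection name, and data are made up:

```python
from linkml_store import Client

client = Client()
db = client.attach_database("mongodb://localhost:27017/demo", alias="demo")
persons = db.create_collection("Person")

persons.insert([{"id": "P1", "name": "Alice"}, {"id": "P2", "name": "Bob"}])

# Ensure a unique index on "id"; replace=True drops and recreates an existing index.
persons.index("id", unique=True, replace=True)

# Insert-or-update keyed on "id", touching only "name" when it actually changed.
persons.upsert({"id": "P2", "name": "Bobby"}, filter_fields=["id"], update_fields=["name"])
```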
@@ -81,6 +167,8 @@ class MongoDBCollection(Collection):
         facet_limit=DEFAULT_FACET_LIMIT,
         **kwargs,
     ) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
+        if facet_limit is None:
+            facet_limit = DEFAULT_FACET_LIMIT
         results = {}
         if not facet_columns:
             facet_columns = list(self.class_definition().attributes.keys())
@@ -177,3 +265,101 @@ class MongoDBCollection(Collection):
         if deleted_rows_count == 0 and not missing_ok:
             raise ValueError(f"No rows found for {where}")
         return deleted_rows_count
+
+    def group_by(
+        self,
+        group_by_fields: List[str],
+        inlined_field="objects",
+        agg_map: Optional[Dict[str, str]] = None,
+        where: Optional[Dict] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Group objects in the collection by specified fields using MongoDB's aggregation pipeline.
+
+        This implementation leverages MongoDB's native aggregation capabilities for efficient grouping.
+
+        :param group_by_fields: List of fields to group by
+        :param inlined_field: Field name to store aggregated objects
+        :param agg_map: Dictionary mapping aggregation types to fields
+        :param where: Filter conditions
+        :param kwargs: Additional arguments
+        :return: Query result containing grouped data
+        """
+        if isinstance(group_by_fields, str):
+            group_by_fields = [group_by_fields]
+
+        # Build the group key for MongoDB
+        if len(group_by_fields) == 1:
+            # Single field grouping
+            group_id = f"${group_by_fields[0]}"
+        else:
+            # Multi-field grouping
+            group_id = {field: f"${field}" for field in group_by_fields}
+
+        # Start building the pipeline
+        pipeline = []
+
+        # Add match stage if where clause is provided
+        if where:
+            pipeline.append({"$match": where})
+
+        # Add the group stage
+        group_stage = {
+            "$group": {
+                "_id": group_id,
+                "objects": {"$push": "$$ROOT"}
+            }
+        }
+        pipeline.append(group_stage)

+        # Execute the aggregation
+        logger.debug(f"MongoDB group_by pipeline: {pipeline}")
+        aggregation_results = list(self.mongo_collection.aggregate(pipeline))
+
+        # Transform the results to match the expected format
+        results = []
+        for result in aggregation_results:
+            # Skip null groups if needed
+            if result["_id"] is None and kwargs.get("skip_nulls", False):
+                continue
+
+            # Create the group object
+            if isinstance(result["_id"], dict):
+                # Multi-field grouping
+                group_obj = result["_id"]
+            else:
+                # Single field grouping
+                group_obj = {group_by_fields[0]: result["_id"]}
+
+            # Add the grouped objects
+            objects = result["objects"]
+
+            # Remove MongoDB _id field from each object
+            for obj in objects:
+                if "_id" in obj:
+                    del obj["_id"]
+
+            # Apply any field selection or transformations based on agg_map
+            if agg_map:
+                # Get first fields (fields to keep as single values)
+                first_fields = agg_map.get("first", [])
+                if first_fields:
+                    # These are already in the group_obj from the _id
+                    pass
+
+                # Get list fields (fields to aggregate as lists)
+                list_fields = agg_map.get("list", [])
+                if list_fields:
+                    # Filter objects to only include specified fields
+                    objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]
+                elif not list_fields and first_fields:
+                    # If list_fields is empty but first_fields is specified,
+                    # filter out first_fields from objects to avoid duplication
+                    objects = [{k: v for k, v in obj.items() if k not in first_fields} for obj in objects]
+
+            # Add the objects to the group
+            group_obj[inlined_field] = objects
+            results.append(group_obj)
+
+        return QueryResult(num_rows=len(results), rows=results)
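Note: the new `group_by()` delegates grouping to MongoDB's aggregation framework. The pipeline it builds has the shape below; this sketch is pure Python (no server needed) and the field names and filter are illustrative only:

```python
# Shape of the aggregation pipeline group_by() assembles (illustrative values).
group_by_fields = ["species", "habitat"]
where = {"status": "active"}

group_id = (
    f"${group_by_fields[0]}"
    if len(group_by_fields) == 1
    else {field: f"${field}" for field in group_by_fields}
)

pipeline = []
if where:
    pipeline.append({"$match": where})
pipeline.append({"$group": {"_id": group_id, "objects": {"$push": "$$ROOT"}}})

print(pipeline)
# [{'$match': {'status': 'active'}},
#  {'$group': {'_id': {'species': '$species', 'habitat': '$habitat'},
#              'objects': {'$push': '$$ROOT'}}}]   (reformatted)
```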
--- a/linkml_store/api/stores/mongodb/mongodb_database.py
+++ b/linkml_store/api/stores/mongodb/mongodb_database.py
@@ -3,6 +3,7 @@
 import logging
 from pathlib import Path
 from typing import Optional, Union
+from urllib.parse import urlparse

 from pymongo import MongoClient
 from pymongo.database import Database as NativeDatabase
@@ -38,10 +39,14 @@ class MongoDBDatabase(Database):
     @property
     def _db_name(self) -> str:
         if self.handle:
-
+            parsed_url = urlparse(self.handle)
+            path_parts = parsed_url.path.lstrip("/").split("?")[0].split("/")
+            db_name = path_parts[0] if path_parts else "default"
+            if not db_name:
+                db_name = self.alias
         else:
-
-            return
+            db_name = "default"
+        return db_name

     @property
     def native_client(self) -> MongoClient:
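Note: `_db_name` now derives the database name from the connection string with `urlparse` instead of naive string handling. A stdlib-only sketch of how that parsing behaves (the helper name and handles are illustrative; unlike the real property, this sketch falls back to "default" rather than `self.alias` when the path is empty):

```python
from urllib.parse import urlparse

def db_name_from_handle(handle: str) -> str:
    parsed = urlparse(handle)
    path_parts = parsed.path.lstrip("/").split("?")[0].split("/")
    return (path_parts[0] if path_parts else "default") or "default"

print(db_name_from_handle("mongodb://localhost:27017/newsfeed?retryWrites=true"))  # newsfeed
print(db_name_from_handle("mongodb://localhost:27017/"))                           # default
```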
--- a/linkml_store/api/stores/solr/solr_collection.py
+++ b/linkml_store/api/stores/solr/solr_collection.py
@@ -62,12 +62,18 @@ class SolrCollection(Collection):
         return QueryResult(query=query, num_rows=num_rows, rows=rows)

     def query_facets(
-        self,
+        self,
+        where: Optional[Dict] = None,
+        facet_columns: List[str] = None,
+        facet_limit=DEFAULT_FACET_LIMIT,
+        facet_min_count: int = 1,
+        **kwargs,
     ) -> Dict[str, Dict[str, int]]:
         solr_query = self._build_solr_query(where)
         solr_query["facet"] = "true"
         solr_query["facet.field"] = facet_columns
         solr_query["facet.limit"] = facet_limit
+        solr_query["facet.mincount"] = facet_min_count

         logger.info(f"Querying Solr collection {self.alias} for facets with query: {solr_query}")

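Note: `query_facets` gains an explicit signature and a `facet_min_count` parameter, which maps onto Solr's standard `facet.mincount` option so zero-count buckets can be dropped. A rough sketch of an equivalent raw Solr request; the URL, core name, and field names are placeholders:

```python
import requests

params = {
    "q": "*:*",
    "rows": 0,
    "facet": "true",
    "facet.field": ["species", "habitat"],  # sent as repeated facet.field params
    "facet.limit": 100,
    "facet.mincount": 1,                    # new: suppress zero-count facet values
    "wt": "json",
}
resp = requests.get("http://localhost:8983/solr/my_core/select", params=params)
print(resp.json()["facet_counts"]["facet_fields"])
```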