linkml-store 0.2.5-py3-none-any.whl → 0.2.9-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.


Files changed (28)
  1. linkml_store/api/client.py +9 -6
  2. linkml_store/api/collection.py +118 -5
  3. linkml_store/api/database.py +45 -14
  4. linkml_store/api/stores/duckdb/duckdb_collection.py +176 -8
  5. linkml_store/api/stores/duckdb/duckdb_database.py +52 -19
  6. linkml_store/api/stores/filesystem/__init__.py +1 -1
  7. linkml_store/api/stores/mongodb/mongodb_collection.py +186 -0
  8. linkml_store/api/stores/mongodb/mongodb_database.py +8 -3
  9. linkml_store/api/stores/solr/solr_collection.py +7 -1
  10. linkml_store/cli.py +202 -21
  11. linkml_store/index/implementations/llm_indexer.py +14 -6
  12. linkml_store/index/indexer.py +7 -4
  13. linkml_store/inference/implementations/llm_inference_engine.py +13 -9
  14. linkml_store/inference/implementations/rag_inference_engine.py +13 -10
  15. linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
  16. linkml_store/inference/inference_config.py +1 -0
  17. linkml_store/utils/dat_parser.py +95 -0
  18. linkml_store/utils/enrichment_analyzer.py +217 -0
  19. linkml_store/utils/format_utils.py +183 -3
  20. linkml_store/utils/llm_utils.py +3 -1
  21. linkml_store/utils/pandas_utils.py +1 -1
  22. linkml_store/utils/sql_utils.py +7 -1
  23. linkml_store/utils/vector_utils.py +4 -11
  24. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/METADATA +4 -3
  25. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/RECORD +28 -26
  26. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/WHEEL +1 -1
  27. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/LICENSE +0 -0
  28. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/entry_points.txt +0 -0
linkml_store/api/client.py

@@ -12,9 +12,9 @@ from linkml_store.api.config import ClientConfig
 logger = logging.getLogger(__name__)
 
 
-
 HANDLE_MAP = {
     "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
+    "sqlite": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
     "solr": "linkml_store.api.stores.solr.solr_database.SolrDatabase",
     "mongodb": "linkml_store.api.stores.mongodb.mongodb_database.MongoDBDatabase",
     "chromadb": "linkml_store.api.stores.chromadb.chromadb_database.ChromaDBDatabase",
@@ -24,6 +24,8 @@ HANDLE_MAP = {
 
 SUFFIX_MAP = {
     "ddb": "duckdb:///{path}",
+    "duckdb": "duckdb:///{path}",
+    "db": "duckdb:///{path}",
 }
 
 
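For orientation, the effect of the new SUFFIX_MAP entries is that a bare filename now resolves to a DuckDB connection URL. A minimal sketch, assuming the top-level `Client` import shown in the collection.py doctests later in this diff; the filename is illustrative:

    from linkml_store import Client

    client = Client()
    # "data.db" has suffix "db", which SUFFIX_MAP expands to "duckdb:///data.db";
    # "data.duckdb" and "data.ddb" resolve the same way
    db = client.attach_database("data.db", alias="data")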
@@ -204,9 +206,10 @@ class Client:
         if ":" not in handle:
             if alias is None:
                 alias = handle
-            suffix = handle.split(".")[-1]
-            if suffix in SUFFIX_MAP:
-                handle = SUFFIX_MAP[suffix].format(path=handle)
+            if "." in handle:
+                suffix = handle.split(".")[-1]
+                if suffix in SUFFIX_MAP:
+                    handle = SUFFIX_MAP[suffix].format(path=handle)
         if ":" not in handle:
             scheme = handle
             handle = None
@@ -216,14 +219,14 @@
             scheme, _ = handle.split(":", 1)
         if scheme not in HANDLE_MAP:
             raise ValueError(f"Unknown scheme: {scheme}")
-        module_path, class_name = HANDLE_MAP[scheme].rsplit('.', 1)
+        module_path, class_name = HANDLE_MAP[scheme].rsplit(".", 1)
         try:
             module = importlib.import_module(module_path)
             cls = getattr(module, class_name)
         except ImportError as e:
             raise ImportError(f"Failed to import {scheme} database. Make sure the correct extras are installed: {e}")
 
-        #cls = HANDLE_MAP[scheme]
+        # cls = HANDLE_MAP[scheme]
         db = cls(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs)
         if schema_view:
             db.set_schema_view(schema_view)
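The hunk above keeps the lazy-loading pattern: each scheme maps to a dotted class path that is imported only when used, so optional backends need their extras installed only on demand. A standalone sketch of the same mechanism:

    import importlib

    path = "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase"
    module_path, class_name = path.rsplit(".", 1)
    # the import is deferred until the scheme is actually requested
    cls = getattr(importlib.import_module(module_path), class_name)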
linkml_store/api/collection.py

@@ -1,6 +1,7 @@
 """A structure for representing collections of similar objects."""
 
 import hashlib
+import json
 import logging
 from collections import defaultdict
 from pathlib import Path
@@ -210,8 +211,62 @@ class Collection(Generic[DatabaseType]):
         """
         raise NotImplementedError
 
+    def index(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        index_name: Optional[str] = None,
+        replace: bool = False,
+        unique: bool = False,
+        **kwargs,
+    ) -> None:
+        """
+        Index objects in the collection.
+
+        :param objs:
+        :param index_name:
+        :param replace: replace the index, or not
+        :param unique: boolean used to declare the index unique or not
+        :param kwargs:
+        :return:
+        """
+        raise NotImplementedError
+
+    def upsert(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        filter_fields: List[str],
+        update_fields: Union[List[str], None] = None,
+        **kwargs,
+    ):
+        """
+        Add one or more objects to the collection.
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("mongodb", alias="test")
+        >>> collection = db.create_collection("Person")
+        >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+        >>> collection.upsert(objs)
+
+        :param objs:
+        :param filter_fields: List of field names to use as the filter for matching existing collections.
+        :param update_fields: List of field names to include in the update. If None, all fields are updated.
+        :param kwargs:
+
+        :return:
+        """
+        raise NotImplementedError
+
     def _pre_query_hook(self, query: Optional[Query] = None, **kwargs):
-        logger.info(f"Pre-query hook (state: {self._initialized}; Q= {query}")
+        """
+        Pre-query hook.
+
+        This is called before a query is executed. It is used to materialize derivations and indexes.
+        :param query:
+        :param kwargs:
+        :return:
+        """
+        logger.debug(f"Pre-query hook (state: {self._initialized}; Q= {query}")  # if logging.info, this is very noisy.
         if not self._initialized:
             self._materialize_derivations()
             self._initialized = True
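A usage sketch for the new `upsert` hook, following the signature above. The base class raises NotImplementedError; concrete behavior lives in the backend subclasses changed in this release (the file list shows mongodb_collection.py and duckdb_collection.py). The field values here are illustrative:

    from linkml_store import Client

    client = Client()
    db = client.attach_database("mongodb", alias="test")
    collection = db.create_collection("Person")
    # match existing documents on "id"; on a match, rewrite only age_in_years
    collection.upsert(
        [{"id": "P1", "name": "John", "age_in_years": 31}],
        filter_fields=["id"],
        update_fields=["age_in_years"],
    )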
@@ -402,7 +457,12 @@ class Collection(Generic[DatabaseType]):
             return qr.rows[0]
         return None
 
-    def find(self, where: Optional[Any] = None, **kwargs) -> QueryResult:
+    def find(
+        self,
+        where: Optional[Any] = None,
+        select_cols: Optional[List[str]] = None,
+        **kwargs,
+    ) -> QueryResult:
         """
         Find objects in the collection using a where query.
 
@@ -432,10 +492,14 @@ class Collection(Generic[DatabaseType]):
 
 
         :param where:
+        :param select_cols:
         :param kwargs:
         :return:
        """
-        query = self._create_query(where_clause=where)
+        query = self._create_query(
+            where_clause=where,
+            select_cols=select_cols,
+        )
         self._pre_query_hook(query)
         return self.query(query, **kwargs)
 
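A sketch of the widened `find` signature, assuming `select_cols` acts as a column projection (the field names are illustrative):

    qr = collection.find({"name": "John"}, select_cols=["id", "name"])
    for row in qr.rows:
        print(row)  # only the selected columns should be present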
@@ -535,8 +599,16 @@ class Collection(Generic[DatabaseType]):
         assert ix_coll.size() > 0
         qr = ix_coll.find(where=where, limit=-1, **kwargs)
         index_col = ix.index_field
+
         # TODO: optimize this for large indexes
-        vector_pairs = [(row, np.array(row[index_col], dtype=float)) for row in qr.rows]
+        def row2array(row):
+            v = row[index_col]
+            if isinstance(v, str):
+                # sqlite stores arrays as strings
+                v = json.loads(v)
+            return np.array(v, dtype=float)
+
+        vector_pairs = [(row, row2array(row)) for row in qr.rows]
         results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
         for r in results:
             del r[1][index_col]
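The `row2array` helper exists because SQLite has no native array type, so a vector column comes back as JSON text rather than a list. A round-trip sketch of the decoding step:

    import json
    import numpy as np

    stored = "[0.12, 0.34, 0.56]"   # how a SQLite-backed index column is returned
    vec = np.array(json.loads(stored), dtype=float)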
@@ -550,6 +622,47 @@ class Collection(Generic[DatabaseType]):
         new_qr.rows = [r[1] for r in results]
         return new_qr
 
+    def group_by(
+        self,
+        group_by_fields: List[str],
+        inlined_field="objects",
+        agg_map: Optional[Dict[str, str]] = None,
+        where: Optional[Dict] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Group objects in the collection by a column.
+
+        :param group_by:
+        :param where:
+        :param kwargs:
+        :return:
+        """
+        if isinstance(group_by_fields, str):
+            group_by_fields = [group_by_fields]
+        df = self.find(where=where, limit=-1).rows_dataframe
+
+        # Handle the case where agg_map is None
+        if agg_map is None:
+            agg_map = {}
+
+        pk_fields = agg_map.get("first", []) + group_by_fields
+        list_fields = agg_map.get("list", [])
+        if not list_fields:
+            list_fields = [a for a in df.columns if a not in pk_fields]
+
+        grouped_objs = defaultdict(list)
+        for _, row in df.iterrows():
+            pk = tuple(row[pk_fields])
+            grouped_objs[pk].append({k: row[k] for k in list_fields})
+        results = []
+        for pk, objs in grouped_objs.items():
+            top_obj = {k: v for k, v in zip(pk_fields, pk)}
+            top_obj[inlined_field] = objs
+            results.append(top_obj)
+        r = QueryResult(num_rows=len(results), rows=results)
+        return r
+
     @property
     def is_internal(self) -> bool:
         """
@@ -1004,7 +1117,7 @@ class Collection(Generic[DatabaseType]):
         multivalued = any(multivalueds)
         inlined = any(inlineds)
         if multivalued and False in multivalueds:
-            raise ValueError(f"Mixed list non list: {vs} // inferred= {multivalueds}")
+            logger.info(f"Mixed list non list: {vs} // inferred= {multivalueds}")
         # if not rngs:
         #     raise AssertionError(f"Empty rngs for {k} = {vs}")
         rng = rngs[0] if rngs else None
linkml_store/api/database.py

@@ -276,14 +276,15 @@ class Database(ABC, Generic[CollectionType]):
 
         Examples:
 
-        >>> from linkml_store.api.client import Client
-        >>> client = Client()
-        >>> db = client.attach_database("duckdb", alias="test")
-        >>> collection = db.create_collection("Person", alias="persons")
-        >>> collection.alias
-        'persons'
-        >>> collection.target_class_name
-        'Person'
+        >>> from linkml_store.api.client import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> collection = db.create_collection("Person", alias="persons")
+        >>> collection.alias
+        'persons'
+
+        >>> collection.target_class_name
+        'Person'
 
         If alias is not provided, it defaults to the name of the type.
 
@@ -419,7 +420,7 @@ class Database(ABC, Generic[CollectionType]):
         >>> from linkml_store.api.client import Client
         >>> from linkml_store.api.queries import Query
         >>> client = Client()
-        >>> db = client.attach_database("duckdb", alias="test")
+        >>> db = client.attach_database("duckdb", alias="test", recreate_if_exists=True)
         >>> collection = db.create_collection("Person")
         >>> collection.insert([{"id": "P1", "name": "John"}, {"id": "P2", "name": "Alice"}])
         >>> query = Query(from_table="Person", where_clause={"name": "John"})
@@ -451,7 +452,7 @@ class Database(ABC, Generic[CollectionType]):
 
         >>> from linkml_store.api.client import Client
         >>> client = Client()
-        >>> db = client.attach_database("duckdb", alias="test")
+        >>> db = client.attach_database("duckdb", alias="test", recreate_if_exists=True)
         >>> collection = db.create_collection("Person", alias="persons")
         >>> collection.insert([{"id": "P1", "name": "John", "age_in_years": 25}])
         >>> schema_view = db.schema_view
@@ -594,7 +595,31 @@ class Database(ABC, Generic[CollectionType]):
             sb.add_class(coll.target_class_name)
         return SchemaView(sb.schema)
 
-    def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
+    def validate_database(self, **kwargs) -> List["ValidationResult"]:
+        """
+        Validate the contents of the database.
+
+        As `iter_validate_database`, but returns a list of validation results.
+
+        :param kwargs:
+        :return:
+        """
+        return list(self.iter_validate_database(**kwargs))
+
+    def validate_database(self, **kwargs) -> List["ValidationResult"]:
+        """
+        Validate the contents of the database.
+
+        As `iter_validate_database`, but returns a list of validation results.
+
+        :param kwargs:
+        :return:
+        """
+        return list(self.iter_validate_database(**kwargs))
+
+    def iter_validate_database(
+        self, ensure_referential_integrity: bool = None, **kwargs
+    ) -> Iterator["ValidationResult"]:
         """
         Validate the contents of the database.
 
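Note that the hunk adds the identical `validate_database` definition twice in succession; since the bodies match, Python simply rebinds the name to the second one. A usage sketch for the new eager wrapper:

    results = db.validate_database()
    print(len(results))  # the full list of ValidationResults, materialized up front
    for r in results:
        print(r)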
@@ -634,12 +659,14 @@ class Database(ABC, Generic[CollectionType]):
         'capital' is a required property
         'continent' is a required proper
 
+        :param ensure_referential_integrity: ensure referential integrity
         :param kwargs:
         :return: iterator over validation results
         """
         for collection in self.list_collections():
             yield from collection.iter_validate_collection(**kwargs)
-        if self.metadata.ensure_referential_integrity:
+        if self.metadata.ensure_referential_integrity or ensure_referential_integrity:
+            logger.info(f"Validating referential integrity on {self.alias}")
             yield from self._validate_referential_integrity(**kwargs)
 
     def _validate_referential_integrity(self, **kwargs) -> Iterator["ValidationResult"]:
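With the new keyword, referential-integrity checking can be forced per call rather than only via database metadata; a sketch:

    # runs the per-collection checks, then the referential-integrity pass
    for result in db.iter_validate_database(ensure_referential_integrity=True):
        print(result)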
@@ -660,7 +687,9 @@ class Database(ABC, Generic[CollectionType]):
             induced_slots = sv.class_induced_slots(cd.name)
             slot_map = {s.name: s for s in induced_slots}
             # rmap = {s.name: s.range for s in induced_slots}
+            # map slot ranges to a collection where that range is stored
             sr_to_coll = {s.name: cmap.get(s.range, []) for s in induced_slots if s.range}
+            logger.debug(f"Validating referential integrity for {collection.target_class_name} // {sr_to_coll}")
             for obj in collection.find_iter():
                 for k, v in obj.items():
                     if k not in sr_to_coll:
@@ -721,7 +750,7 @@ class Database(ABC, Generic[CollectionType]):
 
         >>> from linkml_store.api.client import Client
         >>> client = Client()
-        >>> db = client.attach_database("duckdb", alias="test")
+        >>> db = client.attach_database("duckdb", alias="test", recreate_if_exists=True)
         >>> db.import_database("tests/input/iris.csv", Format.CSV, collection_name="iris")
         >>> db.list_collection_names()
         ['iris']
@@ -741,7 +770,9 @@ class Database(ABC, Generic[CollectionType]):
         # import into a test instance
         tmp_handle = source_format.value
         client = self.parent
-        tmp_db = client.attach_database(tmp_handle, alias="tmp")
+        tmp_alias = "tmp"
+        client.drop_database(tmp_alias, missing_ok=True)
+        tmp_db = client.attach_database(tmp_handle, alias=tmp_alias, recreate_if_exists=True)
         # TODO: check for infinite recursion
         tmp_db.import_database(location, source_format=source_format)
         obj = {}
linkml_store/api/stores/duckdb/duckdb_collection.py

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, Tuple
 
 import sqlalchemy as sqla
 from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
@@ -8,7 +8,7 @@ from sqlalchemy.sql.ddl import CreateTable
 
 from linkml_store.api import Collection
 from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
-from linkml_store.api.queries import Query
+from linkml_store.api.queries import Query, QueryResult
 from linkml_store.api.stores.duckdb.mappings import TMAP
 from linkml_store.utils.sql_utils import facet_count_sql
 
@@ -94,7 +94,9 @@ class DuckDBCollection(Collection):
 
     def query_facets(
         self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
-    ) -> Dict[str, Dict[str, int]]:
+    ) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
+        if facet_limit is None:
+            facet_limit = DEFAULT_FACET_LIMIT
         results = {}
         cd = self.class_definition()
         with self.parent.engine.connect() as conn:
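The corrected annotation describes what `query_facets` returns: for each facet column (or tuple of columns), a list of (value, count) pairs. A reading sketch, with "species" as an illustrative column name borrowed from the iris doctest in database.py:

    facets = collection.query_facets(facet_columns=["species"])
    for value, count in facets["species"]:
        print(value, count)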
@@ -143,20 +145,186 @@ class DuckDBCollection(Collection):
             return True
         return False
 
+    def group_by(
+        self,
+        group_by_fields: List[str],
+        inlined_field="objects",
+        agg_map: Optional[Dict[str, str]] = None,
+        where: Optional[Dict] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Group objects in the collection by specified fields using SQLAlchemy.
+
+        This implementation leverages DuckDB's SQL capabilities for more efficient grouping.
+
+        :param group_by_fields: List of fields to group by
+        :param inlined_field: Field name to store aggregated objects
+        :param agg_map: Dictionary mapping aggregation types to fields
+        :param where: Filter conditions
+        :param kwargs: Additional arguments
+        :return: Query result containing grouped data
+        """
+        if isinstance(group_by_fields, str):
+            group_by_fields = [group_by_fields]
+
+        cd = self.class_definition()
+        if not cd:
+            logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}")
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+        # Check if the table exists
+        if not self.parent._table_exists(self.alias):
+            logger.debug(f"Table {self.alias} doesn't exist, falling back to parent implementation")
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+        # Get table definition
+        table = self._sqla_table(cd)
+        engine = self.parent.engine
+
+        # Create a SQLAlchemy select statement for groups
+        from sqlalchemy import select, func, and_, or_
+        group_cols = [table.c[field] for field in group_by_fields if field in table.columns.keys()]
+
+        if not group_cols:
+            logger.warning(f"None of the group_by fields {group_by_fields} found in table columns")
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+        stmt = select(*group_cols).distinct()
+
+        # Add where conditions if specified
+        if where:
+            conditions = []
+            for k, v in where.items():
+                if k in table.columns.keys():
+                    # Handle different operator types (dict values for operators)
+                    if isinstance(v, dict):
+                        for op, val in v.items():
+                            if op == "$gt":
+                                conditions.append(table.c[k] > val)
+                            elif op == "$gte":
+                                conditions.append(table.c[k] >= val)
+                            elif op == "$lt":
+                                conditions.append(table.c[k] < val)
+                            elif op == "$lte":
+                                conditions.append(table.c[k] <= val)
+                            elif op == "$ne":
+                                conditions.append(table.c[k] != val)
+                            elif op == "$in":
+                                conditions.append(table.c[k].in_(val))
+                            else:
+                                # Default to equality for unknown operators
+                                logger.warning(f"Unknown operator {op}, using equality")
+                                conditions.append(table.c[k] == val)
+                    else:
+                        # Direct equality comparison
+                        conditions.append(table.c[k] == v)
+
+            if conditions:
+                for condition in conditions:
+                    stmt = stmt.where(condition)
+
+        results = []
+        try:
+            with engine.connect() as conn:
+                # Get all distinct groups
+                group_result = conn.execute(stmt)
+                group_rows = list(group_result)
+
+                # For each group, get all objects
+                for group_row in group_rows:
+                    # Build conditions for this group
+                    group_conditions = []
+                    group_dict = {}
+
+                    for i, field in enumerate(group_by_fields):
+                        if field in table.columns.keys():
+                            value = group_row[i]
+                            group_dict[field] = value
+                            if value is None:
+                                group_conditions.append(table.c[field].is_(None))
+                            else:
+                                group_conditions.append(table.c[field] == value)
+
+                    # Get all rows for this group
+                    row_stmt = select(*table.columns)
+                    for condition in group_conditions:
+                        row_stmt = row_stmt.where(condition)
+
+                    # Add original where conditions
+                    if where:
+                        for k, v in where.items():
+                            if k in table.columns.keys():
+                                # Handle different operator types for the row query as well
+                                if isinstance(v, dict):
+                                    for op, val in v.items():
+                                        if op == "$gt":
+                                            row_stmt = row_stmt.where(table.c[k] > val)
+                                        elif op == "$gte":
+                                            row_stmt = row_stmt.where(table.c[k] >= val)
+                                        elif op == "$lt":
+                                            row_stmt = row_stmt.where(table.c[k] < val)
+                                        elif op == "$lte":
+                                            row_stmt = row_stmt.where(table.c[k] <= val)
+                                        elif op == "$ne":
+                                            row_stmt = row_stmt.where(table.c[k] != val)
+                                        elif op == "$in":
+                                            row_stmt = row_stmt.where(table.c[k].in_(val))
+                                        else:
+                                            # Default to equality for unknown operators
+                                            row_stmt = row_stmt.where(table.c[k] == val)
+                                else:
+                                    # Direct equality comparison
+                                    row_stmt = row_stmt.where(table.c[k] == v)
+
+                    row_result = conn.execute(row_stmt)
+                    rows = list(row_result)
+
+                    # Convert rows to dictionaries
+                    objects = []
+                    for row in rows:
+                        obj = {}
+                        for i, col in enumerate(row._fields):
+                            obj[col] = row[i]
+                        objects.append(obj)
+
+                    # Apply agg_map to filter fields if specified
+                    if agg_map and "list" in agg_map:
+                        list_fields = agg_map["list"]
+                        if list_fields:
+                            objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]
+
+                    # Create the result object
+                    result_obj = group_dict.copy()
+                    result_obj[inlined_field] = objects
+                    results.append(result_obj)
+
+            return QueryResult(num_rows=len(results), rows=results)
+        except Exception as e:
+            logger.warning(f"Error in DuckDB group_by: {e}")
+            # Fall back to parent implementation
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
     def _create_table(self, cd: ClassDefinition):
         if self._table_created or self.metadata.is_prepopulated:
             logger.info(f"Already have table for: {cd.name}")
             return
-        query = Query(
-            from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
-        )
-        qr = self.parent.query(query)
-        if qr.num_rows > 0:
+        if self.parent._table_exists(self.alias):
             logger.info(f"Table already exists for {cd.name}")
             self._table_created = True
             self._initialized = True
             self.metadata.is_prepopulated = True
             return
+        # query = Query(
+        #     from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
+        # )
+        # qr = self.parent.query(query)
+        # if qr.num_rows > 0:
+        #     logger.info(f"Table already exists for {cd.name}")
+        #     self._table_created = True
+        #     self._initialized = True
+        #     self.metadata.is_prepopulated = True
+        #     return
         logger.info(f"Creating table for {cd.name}")
         t = self._sqla_table(cd)
         ct = CreateTable(t)
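A usage sketch for the Mongo-style operators handled in the DuckDB `group_by` above, run against a collection like the iris one imported in the database.py doctest (the column names `sepal_length` and `species` are assumptions about that CSV):

    result = collection.group_by(
        ["species"],
        # "$gt" is translated to a SQLAlchemy "greater than" filter before grouping
        where={"sepal_length": {"$gt": 5.0}},
    )
    for row in result.rows:
        print(row["species"], len(row["objects"]))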