linkml-store 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of linkml-store might be problematic.

@@ -100,7 +100,7 @@ class Client:
         """
         return self.metadata.base_dir
 
-    def from_config(self, config: Union[ClientConfig, dict, str, Path], base_dir=None, **kwargs):
+    def from_config(self, config: Union[ClientConfig, dict, str, Path], base_dir=None, auto_attach=False, **kwargs):
         """
         Create a client from a configuration.
 
@@ -109,6 +109,10 @@ class Client:
         >>> from linkml_store.api.config import ClientConfig
         >>> client = Client().from_config(ClientConfig(databases={"test": {"handle": "duckdb:///:memory:"}}))
        >>> len(client.databases)
+        0
+        >>> client = Client().from_config(ClientConfig(databases={"test": {"handle": "duckdb:///:memory:"}}),
+        ...                               auto_attach=True)
+        >>> len(client.databases)
         1
         >>> "test" in client.databases
         True
@@ -116,6 +120,8 @@ class Client:
         'duckdb:///:memory:'
 
         :param config:
+        :param base_dir:
+        :param auto_attach:
         :param kwargs:
         :return:
 
@@ -125,17 +131,17 @@ class Client:
         if isinstance(config, Path):
             config = str(config)
         if isinstance(config, str):
-            # if not base_dir:
-            #     base_dir = Path(config).parent
+            if not base_dir:
+                base_dir = Path(config).parent
             parsed_obj = yaml.safe_load(open(config))
             config = ClientConfig(**parsed_obj)
         self.metadata = config
         if base_dir:
             self.metadata.base_dir = base_dir
-        self._initialize_databases(**kwargs)
+        self._initialize_databases(auto_attach=auto_attach, **kwargs)
         return self
 
-    def _initialize_databases(self, **kwargs):
+    def _initialize_databases(self, auto_attach=False, **kwargs):
         for name, db_config in self.metadata.databases.items():
             base_dir = self.base_dir
             logger.info(f"Initializing database: {name}, base_dir: {base_dir}")
@@ -146,8 +152,22 @@ class Client:
             db_config.handle = handle
             if db_config.schema_location:
                 db_config.schema_location = db_config.schema_location.format(base_dir=base_dir)
-            db = self.attach_database(handle, alias=name, **kwargs)
-            db.from_config(db_config)
+            if auto_attach:
+                db = self.attach_database(handle, alias=name, **kwargs)
+                db.from_config(db_config)
+
+    def _set_database_config(self, db: Database):
+        """
+        Set the configuration for a database.
+
+        :param name:
+        :param config:
+        :return:
+        """
+        if not self.metadata:
+            return
+        if db.alias in self.metadata.databases:
+            db.from_config(self.metadata.databases[db.alias])
 
     def attach_database(
         self,
@@ -202,6 +222,7 @@ class Client:
             raise AssertionError(f"Inconsistent alias: {db.alias} != {alias}")
         else:
             db.metadata.alias = alias
+        self._set_database_config(db)
         return db
 
     def get_database(self, name: Optional[str] = None, create_if_not_exists=True, **kwargs) -> Database:
@@ -230,13 +251,19 @@ class Client:
             return list(self._databases.values())[0]
         if not self._databases:
             self._databases = {}
+        if name not in self._databases and name in self.metadata.databases:
+            db_config = self.metadata.databases[name]
+            db = self.attach_database(db_config.handle, alias=name, **kwargs)
+            self._databases[name] = db
         if name not in self._databases:
             if create_if_not_exists:
                 logger.info(f"Creating database: {name}")
                 self.attach_database(name, **kwargs)
             else:
                 raise ValueError(f"Database {name} does not exist")
-        return self._databases[name]
+        db = self._databases[name]
+        self._set_database_config(db)
+        return db
 
     @property
     def databases(self) -> Dict[str, Database]:
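Note on the changes above: with auto_attach defaulting to False, databases listed in the configuration are no longer attached eagerly; get_database now attaches a configured database on first use and applies its config via _set_database_config. A minimal sketch of the expected behaviour, based on the doctest in the hunk above (same in-memory DuckDB handle as in that doctest):

    from linkml_store import Client
    from linkml_store.api.config import ClientConfig

    config = ClientConfig(databases={"test": {"handle": "duckdb:///:memory:"}})

    client = Client().from_config(config)   # auto_attach defaults to False
    assert len(client.databases) == 0       # nothing attached yet
    db = client.get_database("test")        # lazily attached from the config entry
    assert "test" in client.databases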
@@ -502,6 +502,7 @@ class Collection(Generic[DatabaseType]):
         index_name = self.default_index_name
         ix_coll = self.parent.get_collection(self._index_collection_name(index_name))
         if index_name not in self.indexers:
+            logger.debug(f"Indexer not found: {index_name} -- creating")
             ix = get_indexer(index_name)
             if not self._indexers:
                 self._indexers = {}
@@ -509,6 +510,11 @@ class Collection(Generic[DatabaseType]):
         ix = self.indexers.get(index_name)
         if not ix:
             raise ValueError(f"No index named {index_name}")
+        logger.debug(f"Using indexer {type(ix)} with name {index_name}")
+        if ix_coll.size() == 0:
+            logger.info(f"Index {index_name} is empty; indexing all objects")
+            all_objs = self.find(limit=-1).rows
+            self.index_objects(all_objs, index_name, replace=True, **kwargs)
         qr = ix_coll.find(where=where, limit=-1, **kwargs)
         index_col = ix.index_field
         # TODO: optimize this for large indexes
@@ -518,6 +524,7 @@ class Collection(Generic[DatabaseType]):
             del r[1][index_col]
         new_qr = QueryResult(num_rows=len(results))
         new_qr.ranked_rows = results
+        new_qr.rows = [r[1] for r in results]
         return new_qr
 
     @property
@@ -562,6 +569,7 @@ class Collection(Generic[DatabaseType]):
                 format=source.format,
                 expected_type=source.expected_type,
                 compression=source.compression,
+                select_query=source.select_query,
                 **kwargs,
             )
         elif metadata.source.url:
@@ -570,9 +578,12 @@ class Collection(Generic[DatabaseType]):
                 format=source.format,
                 expected_type=source.expected_type,
                 compression=source.compression,
+                select_query=source.select_query,
                 **kwargs,
             )
-        self.insert(objects)
+        else:
+            raise ValueError("No source local_path or url provided")
+        self.insert(objects)
 
     def _check_if_initialized(self) -> bool:
         return self._initialized
@@ -629,6 +640,14 @@ class Collection(Generic[DatabaseType]):
         self.insert(tr_objs)
         self.commit()
 
+    def size(self) -> int:
+        """
+        Return the number of objects in the collection.
+
+        :return: The number of objects in the collection.
+        """
+        return self.find({}, limit=1).num_rows
+
     def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs):
         """
         Attach an index to the collection.
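The new Collection.size() above returns find({}, limit=1).num_rows, which assumes num_rows reports the total number of matches rather than the size of the limited page; the empty-index check added to the search path earlier uses this same helper. A small usage sketch (the collection name and rows are illustrative only):

    from linkml_store import Client

    client = Client()
    db = client.attach_database("duckdb", alias="test")
    collection = db.get_collection("countries", create_if_not_exists=True)
    collection.insert([{"name": "Uruguay", "code": "UY"}, {"name": "Peru", "code": "PE"}])
    assert collection.size() == 2  # total count, even though the underlying find() uses limit=1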
@@ -777,6 +796,8 @@ class Collection(Generic[DatabaseType]):
         sv: SchemaView = self.parent.schema_view
         if sv:
             cls = sv.get_class(self.target_class_name)
+            # if not cls:
+            #     logger.warning(f"{self.target_class_name} not in {sv.all_classes().keys()} ")
             # cls = sv.schema.classes[self.target_class_name]
             if cls and not cls.attributes:
                 if not sv.class_induced_slots(cls.name):
@@ -900,11 +921,14 @@ class Collection(Generic[DatabaseType]):
                     exact_dimensions_list.append(v.shape)
                     break
                 if isinstance(v, list):
+                    # sample first item. TODO: more robust strategy
                     v = v[0] if v else None
                     multivalueds.append(True)
                 elif isinstance(v, dict):
-                    v = list(v.values())[0]
-                    multivalueds.append(True)
+                    pass
+                    # TODO: check if this is a nested object or key-value list
+                    # v = list(v.values())[0]
+                    # multivalueds.append(True)
                 else:
                     multivalueds.append(False)
                 if not v:
@@ -933,10 +957,21 @@ class Collection(Generic[DatabaseType]):
             # raise AssertionError(f"Empty rngs for {k} = {vs}")
             rng = rngs[0] if rngs else None
             for other_rng in rngs:
+                coercions = {
+                    ("integer", "float"): "float",
+                }
                 if rng != other_rng:
-                    raise ValueError(f"Conflict: {rng} != {other_rng} for {vs}")
+                    if (rng, other_rng) in coercions:
+                        rng = coercions[(rng, other_rng)]
+                    elif (other_rng, rng) in coercions:
+                        rng = coercions[(other_rng, rng)]
+                    else:
+                        raise ValueError(f"Conflict: {rng} != {other_rng} for {vs}")
             logger.debug(f"Inducing {k} as {rng} {multivalued} {inlined}")
-            cd.attributes[k] = SlotDefinition(k, range=rng, multivalued=multivalued, inlined=inlined)
+            inlined_as_list = inlined and multivalued
+            cd.attributes[k] = SlotDefinition(
+                k, range=rng, multivalued=multivalued, inlined=inlined, inlined_as_list=inlined_as_list
+            )
             if exact_dimensions_list:
                 array_expr = ArrayExpression(exact_number_dimensions=len(exact_dimensions_list[0]))
                 cd.attributes[k].array = array_expr
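The coercion table above currently handles only the integer/float pair (in either order); any other disagreement between induced ranges still raises. A standalone sketch of the same lookup, separate from the surrounding schema-induction code:

    # standalone illustration of the range-coercion rule introduced above
    coercions = {("integer", "float"): "float"}

    def resolve_range(rng: str, other_rng: str) -> str:
        if rng == other_rng:
            return rng
        if (rng, other_rng) in coercions:
            return coercions[(rng, other_rng)]
        if (other_rng, rng) in coercions:
            return coercions[(other_rng, rng)]
        raise ValueError(f"Conflict: {rng} != {other_rng}")

    assert resolve_range("integer", "float") == "float"
    assert resolve_range("float", "integer") == "float"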
@@ -1,8 +1,8 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from pydantic import BaseModel, Field
 
-from linkml_store.graphs.graph_map import GraphProjection
+from linkml_store.graphs.graph_map import EdgeProjection, NodeProjection
 
 
 class ConfiguredBaseModel(BaseModel, extra="forbid"):
@@ -30,13 +30,30 @@ class CollectionSource(ConfiguredBaseModel):
     """
 
     url: Optional[str] = None
+    """Remote URL to fetch data from"""
+
     local_path: Optional[str] = None
+    """Local path to fetch data from"""
+
     source_location: Optional[str] = None
+
     refresh_interval_days: Optional[float] = None
+    """How often to refresh the data, in days"""
+
     expected_type: Optional[str] = None
+    """The expected type of the data, e.g list"""
+
     format: Optional[str] = None
+    """The format of the data, e.g., json, yaml, csv"""
+
     compression: Optional[str] = None
+    """The compression of the data, e.g., tgz, gzip, zip"""
+
+    select_query: Optional[str] = None
+    """A jsonpath query to preprocess the objects with"""
+
     arguments: Optional[Dict[str, Any]] = None
+    """Optional arguments to pass to the source"""
 
 
 class CollectionConfig(ConfiguredBaseModel):
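The new select_query field is documented as a jsonpath expression and is passed along when a collection loads from its configured local_path or url source (see the select_query=source.select_query changes earlier). A hypothetical configuration using the same nested dict form that cli.py builds below; the file path and the jsonpath expression are illustrative only:

    from linkml_store.api.config import ClientConfig

    # hypothetical: keep only the objects under a top-level "countries" array
    config = ClientConfig(
        databases={
            "duckdb": {
                "collections": {
                    "countries": {
                        "source": {
                            "local_path": "data/countries.json",  # illustrative path
                            "format": "json",
                            "select_query": "$.countries[*]",     # jsonpath, per the field docstring
                        }
                    }
                }
            }
        }
    )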
@@ -81,7 +98,7 @@ class CollectionConfig(ConfiguredBaseModel):
         description="LinkML-Map derivations",
     )
     page_size: Optional[int] = Field(default=None, description="Suggested page size (items per page) in apps and APIs")
-    graph_projection: Optional[GraphProjection] = Field(
+    graph_projection: Optional[Union[EdgeProjection, NodeProjection]] = Field(
         default=None,
         description="Optional graph projection configuration",
     )
@@ -707,12 +707,29 @@ class Database(ABC, Generic[CollectionType]):
         """
         raise NotImplementedError()
 
-    def import_database(self, location: str, source_format: Optional[Union[str, Format]] = None, **kwargs):
+    def import_database(
+        self,
+        location: str,
+        source_format: Optional[Union[str, Format]] = None,
+        collection_name: Optional[str] = None,
+        **kwargs,
+    ):
         """
         Import a database from a file or location.
 
+        >>> from linkml_store.api.client import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> db.import_database("tests/input/iris.csv", Format.CSV, collection_name="iris")
+        >>> db.list_collection_names()
+        ['iris']
+        >>> collection = db.get_collection("iris")
+        >>> collection.find({}).num_rows
+        150
+
         :param location: location of the file
         :param source_format: source format
+        :param collection_name: (Optional) name of the collection, for data that is flat
         :param kwargs: additional arguments
         """
         if isinstance(source_format, str):
@@ -732,8 +749,12 @@ class Database(ABC, Generic[CollectionType]):
                 self.store(obj)
             return
         objects = load_objects(location, format=source_format)
-        for obj in objects:
-            self.store(obj)
+        if collection_name:
+            collection = self.get_collection(collection_name, create_if_not_exists=True)
+            collection.insert(objects)
+        else:
+            for obj in objects:
+                self.store(obj)
 
     def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
         """
@@ -51,9 +51,13 @@ class MongoDBCollection(Collection):
         if offset and offset >= 0:
             cursor = cursor.skip(offset)
 
+        select_cols = query.select_cols
+
        def _as_row(row: dict):
            row = copy(row)
            del row["_id"]
+            if select_cols:
+                row = {k: row[k] for k in select_cols if k in row}
            return row
 
        rows = [_as_row(row) for row in cursor]
linkml_store/cli.py CHANGED
@@ -1,6 +1,7 @@
 import logging
 import sys
 import warnings
+from pathlib import Path
 from typing import Optional
 
 import click
@@ -10,14 +11,22 @@ from pydantic import BaseModel
 
 from linkml_store import Client
 from linkml_store.api import Collection, Database
+from linkml_store.api.config import ClientConfig
 from linkml_store.api.queries import Query
 from linkml_store.index import get_indexer
 from linkml_store.index.implementations.simple_indexer import SimpleIndexer
 from linkml_store.index.indexer import Indexer
+from linkml_store.inference import get_inference_engine
+from linkml_store.inference.inference_config import InferenceConfig
+from linkml_store.inference.inference_engine import ModelSerialization
 from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
 from linkml_store.utils.object_utils import object_path_update
 from linkml_store.utils.pandas_utils import facet_summary_to_dataframe_unmelted
 
+DEFAULT_LOCAL_CONF_PATH = Path("linkml.yaml")
+# global path is ~/.linkml.yaml in the user's home directory
+DEFAULT_GLOBAL_CONF_PATH = Path("~/.linkml.yaml").expanduser()
+
 index_type_option = click.option(
     "--index-type",
     "-t",
@@ -84,6 +93,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
 @click.group()
 @click.option("--database", "-d", help="Database name")
 @click.option("--collection", "-c", help="Collection name")
+@click.option("--input", "-i", help="Input file (alternative to database/collection)")
 @click.option("--config", "-C", type=click.Path(exists=True), help="Path to the configuration file")
 @click.option("--set", help="Metadata settings in the form PATHEXPR=value", multiple=True)
 @click.option("-v", "--verbose", count=True)
@@ -96,7 +106,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
     help="If set then show full stacktrace on error",
 )
 @click.pass_context
-def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set, **kwargs):
+def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set, input, **kwargs):
     """A CLI for interacting with the linkml-store."""
     if not stacktrace:
         sys.tracebacklimit = 0
@@ -119,13 +129,25 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
     if quiet:
         logger.setLevel(logging.ERROR)
     ctx.ensure_object(dict)
+    if input:
+        stem = Path(input).stem
+        database = "duckdb"
+        collection = stem
+        config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}})
+        # collection = Path(input).stem
+        # database = f"file:{Path(input).parent}"
+    if config is None and DEFAULT_LOCAL_CONF_PATH.exists():
+        config = DEFAULT_LOCAL_CONF_PATH
+    if config is None and DEFAULT_GLOBAL_CONF_PATH.exists():
+        config = DEFAULT_GLOBAL_CONF_PATH
+    if config == ".":
+        config = None
+    if not collection and database and "::" in database:
+        database, collection = database.split("::")
+
     client = Client().from_config(config, **kwargs) if config else Client()
     settings = ContextSettings(client=client, database_name=database, collection_name=collection)
     ctx.obj["settings"] = settings
-    # DEPRECATED
-    ctx.obj["client"] = client
-    ctx.obj["database"] = database
-    ctx.obj["collection"] = collection
 
     if settings.database_name:
         db = client.get_database(database)
@@ -136,12 +158,6 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
             val = yaml.safe_load(val)
             logger.info(f"Setting {path} to {val}")
             db.metadata = object_path_update(db.metadata, path, val)
-        # settings.database = db
-        # DEPRECATED
-        ctx.obj["database_obj"] = db
-        if collection:
-            collection_obj = db.get_collection(collection)
-            ctx.obj["collection_obj"] = collection_obj
 
     if not settings.database_name:
         # if len(client.databases) != 1:
@@ -323,11 +339,12 @@ def apply(ctx, patch_files, identifier_attribute):
 
 @cli.command()
 @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query, as YAML")
+@click.option("--select", "-s", type=click.STRING, help="SELECT clause for the query, as YAML")
 @click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
 @click.pass_context
-def query(ctx, where, limit, output_type, output):
+def query(ctx, where, select, limit, output_type, output):
     """Query objects from the specified collection.
 
 
@@ -353,7 +370,13 @@ def query(ctx, where, limit, output_type, output):
     """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None
-    query = Query(from_table=collection.alias, where_clause=where_clause, limit=limit)
+    select_clause = yaml.safe_load(select) if select else None
+    if select_clause:
+        if isinstance(select_clause, str):
+            select_clause = [select_clause]
+        if not isinstance(select_clause, list):
+            raise ValueError(f"SELECT clause must be a list. Got: {select_clause}")
+    query = Query(from_table=collection.alias, select_cols=select_clause, where_clause=where_clause, limit=limit)
     result = collection.query(query)
    output_data = render_output(result.rows, output_type)
    if output:
@@ -458,6 +481,110 @@ def describe(ctx, where, output_type, output, limit):
     write_output(df.describe(include="all").transpose(), output_type, target=output)
 
 
+@cli.command()
+@click.option("--output-type", "-O", type=format_choice, default=Format.YAML.value, help="Output format")
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--target-attribute", "-T", type=click.STRING, multiple=True, help="Target attributes for inference")
+@click.option(
+    "--feature-attributes", "-F", type=click.STRING, help="Feature attributes for inference (comma separated)"
+)
+@click.option("--inference-config-file", "-Y", type=click.Path(), help="Path to inference configuration file")
+@click.option("--export-model", "-E", type=click.Path(), help="Export model to file")
+@click.option("--load-model", "-L", type=click.Path(), help="Load model from file")
+@click.option("--model-format", "-M", type=click.Choice([x.value for x in ModelSerialization]), help="Format for model")
+@click.option("--training-test-data-split", "-S", type=click.Tuple([float, float]), help="Training/test data split")
+@click.option(
+    "--predictor-type", "-t", default="sklearn", show_default=True, type=click.STRING, help="Type of predictor"
+)
+@click.option("--query", "-q", type=click.STRING, help="query term")
+@click.pass_context
+def infer(
+    ctx,
+    inference_config_file,
+    query,
+    training_test_data_split,
+    predictor_type,
+    target_attribute,
+    feature_attributes,
+    output_type,
+    output,
+    model_format,
+    export_model,
+    load_model,
+):
+    """
+    Predict a complete object from a partial object.
+
+    Currently two main prediction methods are provided: RAG and sklearn
+
+    ## RAG:
+
+    The RAG approach will use Retrieval Augmented Generation to inference the missing attributes of an object.
+
+    Example:
+
+        linkml-store -i countries.jsonl inference -t rag -q 'name: Uruguay'
+
+    Result:
+
+        capital: Montevideo, code: UY, continent: South America, languages: [Spanish]
+
+    You can pass in configurations as follows:
+
+        linkml-store -i countries.jsonl inference -t rag:llm_config.model_name=llama-3 -q 'name: Uruguay'
+
+    ## SKLearn:
+
+    This uses scikit-learn (defaulting to simple decision trees) to do the prediction.
+
+        linkml-store -i tests/input/iris.csv inference -t sklearn \
+          -q '{"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2}'
+    """
+    if query:
+        query_obj = yaml.safe_load(query)
+    else:
+        query_obj = None
+    collection = ctx.obj["settings"].collection
+    atts = collection.class_definition().attributes.keys()
+    if model_format:
+        model_format = ModelSerialization(model_format)
+    if load_model:
+        predictor = get_inference_engine(predictor_type)
+        predictor = type(predictor).load_model(load_model)
+    else:
+        if feature_attributes:
+            features = feature_attributes.split(",")
+            features = [f.strip() for f in features]
+        else:
+            if query_obj:
+                features = query_obj.keys()
+            else:
+                features = None
+        if target_attribute:
+            target_attributes = list(target_attribute)
+        else:
+            target_attributes = [att for att in atts if att not in features]
+        if inference_config_file:
+            config = InferenceConfig.from_file(inference_config_file)
+        else:
+            config = InferenceConfig(target_attributes=target_attributes, feature_attributes=features)
+        if training_test_data_split:
+            config.train_test_split = training_test_data_split
+        predictor = get_inference_engine(predictor_type, config=config)
+        predictor.load_and_split_data(collection)
+        predictor.initialize_model()
+    if export_model:
+        logger.info(f"Exporting model to {export_model} in {model_format}")
+        predictor.export_model(export_model, model_format)
+    if not query_obj:
+        if not export_model:
+            raise ValueError("Query must be specified if not exporting model")
+    if query_obj:
+        result = predictor.derive(query_obj)
+        dumped_obj = result.model_dump(exclude_none=True)
+        write_output([dumped_obj], output_type, target=output)
+
+
 @cli.command()
 @index_type_option
 @click.option("--cached-embeddings-database", "-E", help="Path to the database where embeddings are cached")
@@ -0,0 +1,13 @@
+"""
+inference engine package.
+"""
+
+from linkml_store.inference.inference_config import InferenceConfig
+from linkml_store.inference.inference_engine import InferenceEngine
+from linkml_store.inference.inference_engine_registry import get_inference_engine
+
+__all__ = [
+    "InferenceEngine",
+    "InferenceConfig",
+    "get_inference_engine",
+]
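The new package above re-exports the pieces used by the infer command earlier in this diff. A hedged sketch of that command's sklearn path, lifted out of the CLI; the target and feature attribute names for the iris columns are assumptions, and only calls visible in this diff are used:

    from linkml_store import Client
    from linkml_store.inference import InferenceConfig, get_inference_engine
    from linkml_store.utils.format_utils import Format

    client = Client()
    db = client.attach_database("duckdb", alias="test")
    db.import_database("tests/input/iris.csv", Format.CSV, collection_name="iris")
    collection = db.get_collection("iris")

    # target/feature names are assumed column names for the iris data
    config = InferenceConfig(
        target_attributes=["species"],
        feature_attributes=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    )
    predictor = get_inference_engine("sklearn", config=config)
    predictor.load_and_split_data(collection)
    predictor.initialize_model()
    result = predictor.derive({"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2})
    print(result.model_dump(exclude_none=True))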