linkml-store 0.2.5__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of linkml-store might be problematic.
- linkml_store/api/client.py +9 -6
- linkml_store/api/collection.py +118 -5
- linkml_store/api/database.py +45 -14
- linkml_store/api/stores/duckdb/duckdb_collection.py +176 -8
- linkml_store/api/stores/duckdb/duckdb_database.py +52 -19
- linkml_store/api/stores/filesystem/__init__.py +1 -1
- linkml_store/api/stores/mongodb/mongodb_collection.py +186 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +8 -3
- linkml_store/api/stores/solr/solr_collection.py +7 -1
- linkml_store/cli.py +202 -21
- linkml_store/index/implementations/llm_indexer.py +14 -6
- linkml_store/index/indexer.py +7 -4
- linkml_store/inference/implementations/llm_inference_engine.py +13 -9
- linkml_store/inference/implementations/rag_inference_engine.py +13 -10
- linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
- linkml_store/inference/inference_config.py +1 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/format_utils.py +183 -3
- linkml_store/utils/llm_utils.py +3 -1
- linkml_store/utils/pandas_utils.py +1 -1
- linkml_store/utils/sql_utils.py +7 -1
- linkml_store/utils/vector_utils.py +4 -11
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/METADATA +4 -3
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/RECORD +28 -26
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/WHEEL +1 -1
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/LICENSE +0 -0
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/entry_points.txt +0 -0
linkml_store/cli.py
CHANGED
@@ -3,6 +3,7 @@ import sys
 import warnings
 from collections import defaultdict
 from pathlib import Path
+from tokenize import group
 from typing import Optional, Tuple, Any
 
 import click
@@ -37,6 +38,11 @@ index_type_option = click.option(
     show_default=True,
     help="Type of index to create. Values: simple, llm",
 )
+json_select_query_option = click.option(
+    "--json-select-query",
+    "-J",
+    help="JSON SELECT query",
+)
 
 logger = logging.getLogger(__name__)
 
@@ -136,7 +142,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
         logger.setLevel(logging.ERROR)
     ctx.ensure_object(dict)
     if input:
-        database = "duckdb"
+        database = "duckdb"  # default: store in duckdb
         if input.startswith("http"):
             parts = input.split("/")
             collection = parts[-1]
@@ -144,8 +150,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
         else:
             stem = underscore(Path(input).stem)
             collection = stem
-        logger.info(f"Using input file: {input}, "
-                    f"default storage is {database} and collection is {collection}")
+        logger.info(f"Using input file: {input}, " f"default storage is {database} and collection is {collection}")
         config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}})
     if config is None and DEFAULT_LOCAL_CONF_PATH.exists():
         config = DEFAULT_LOCAL_CONF_PATH
@@ -186,12 +191,24 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
 
 
 @cli.command()
-@click.
+@click.pass_context
+def drop(ctx):
+    """
+    Drop database and all its collections.
+    """
+    database = ctx.obj["settings"].database
+    database.drop()
+
+
+@cli.command()
+@click.argument("files", type=click.Path(), nargs=-1)
 @click.option("--replace/--no-replace", default=False, show_default=True, help="Replace existing objects")
 @click.option("--format", "-f", type=format_choice, help="Input format")
 @click.option("--object", "-i", multiple=True, help="Input object as YAML")
+@click.option("--source-field", help="If provided, inject file path source as this field")
+@json_select_query_option
 @click.pass_context
-def insert(ctx, files, replace, object, format):
+def insert(ctx, files, replace, object, format, source_field, json_select_query):
     """Insert objects from files (JSON, YAML, TSV) into the specified collection.
 
     Using a configuration:
@@ -207,11 +224,17 @@ def insert(ctx, files, replace, object, format):
         raise ValueError("Collection must be specified.")
     if not files and not object:
         files = ["-"]
+    load_objects_args = {}
+    if json_select_query:
+        load_objects_args["select_query"] = json_select_query
     for file_path in files:
         if format:
-            objects = load_objects(file_path, format=format)
+            objects = load_objects(file_path, format=format, **load_objects_args)
         else:
-            objects = load_objects(file_path)
+            objects = load_objects(file_path, **load_objects_args)
+        if source_field:
+            for obj in objects:
+                obj[source_field] = str(file_path)
         logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
         if replace:
             collection.replace(objects)
@@ -222,6 +245,8 @@ def insert(ctx, files, replace, object, format):
         for object_str in object:
             logger.info(f"Parsing: {object_str}")
             objects = yaml.safe_load(object_str)
+            if not isinstance(objects, list):
+                objects = [objects]
             if replace:
                 collection.replace(objects)
             else:
@@ -234,21 +259,41 @@ def insert(ctx, files, replace, object, format):
 @click.argument("files", type=click.Path(exists=True), nargs=-1)
 @click.option("--format", "-f", type=format_choice, help="Input format")
 @click.option("--object", "-i", multiple=True, help="Input object as YAML")
+@json_select_query_option
 @click.pass_context
-def store(ctx, files, object, format):
+def store(ctx, files, object, format, json_select_query):
     """Store objects from files (JSON, YAML, TSV) into the database.
 
-    Note: this is similar to insert, but a collection does not need to be specified
+    Note: this is similar to insert, but a collection does not need to be specified.
+
+    For example, assume that `my-collection` is a dict with multiple keys,
+    and we want one collection per key:
+
+        linkml-store -d my.ddb store my-collection.yaml
+
+    Loading JSON (e.g OBO-JSON), with a --json-select-query:
+
+        linkml-store -d cl.ddb store -J graphs cl.obo.json
+
+    Loading XML (e.g OWL-XML), with a --json-select-query:
+
+        linkml-store -d cl.ddb store -J Ontology cl.owx
+
+    Because the XML uses a top level Ontology, with multiple
+
     """
     settings = ctx.obj["settings"]
     db = settings.database
     if not files and not object:
         files = ["-"]
+    load_objects_args = {}
+    if json_select_query:
+        load_objects_args["select_query"] = json_select_query
     for file_path in files:
         if format:
-            objects = load_objects(file_path, format=format)
+            objects = load_objects(file_path, format=format, **load_objects_args)
         else:
-            objects = load_objects(file_path)
+            objects = load_objects(file_path, **load_objects_args)
         logger.info(f"Inserting {len(objects)} objects from {file_path} into database '{db}'.")
         for obj in objects:
             db.store(obj)
@@ -422,15 +467,32 @@ def list_collections(ctx, **kwargs):
 
 @cli.command()
 @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
-@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
+@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return per facet")
+@click.option("--facet-min-count", "-M", type=click.INT, help="Minimum count for a facet to be included")
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
-@click.option("--columns", "-S", help="Columns to facet on")
+@click.option("--columns", "-S", help="Columns to facet on. Comma-separated, join combined facets with +")
 @click.option("--wide/--no-wide", "-U/--no-U", default=False, show_default=True, help="Wide table")
 @click.pass_context
-def fq(ctx, where, limit, columns, output_type, wide, output):
+def fq(ctx, where, limit, columns, output_type, wide, output, **kwargs):
     """
-    Query
+    Query facet counts from the specified collection.
+
+    Assuming your .linkml.yaml includes an entry mapping `phenopackets` to a
+    mongodb
+
+    Facet counts (all columns)
+
+        linkml-store -d phenopackets fq
+
+    Nested columns:
+
+        linkml-store -d phenopackets fq subject.timeAtLastEncounter.age
+
+    Compound keys:
+
+        linkml-store -d phenopackets fq subject.sex+subject.timeAtLastEncounter.age
+
     """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None
@@ -439,7 +501,7 @@ def fq(ctx, where, limit, columns, output_type, wide, output):
         columns = [col.strip() for col in columns]
         columns = [(tuple(col.split("+")) if "+" in col else col) for col in columns]
     logger.info(f"Faceting on columns: {columns}")
-    results = collection.query_facets(where_clause, facet_columns=columns,
+    results = collection.query_facets(where_clause, facet_columns=columns, facet_limit=limit, **kwargs)
     logger.info(f"Facet results: {results}")
 
     def _untuple(key):
@@ -471,6 +533,56 @@ def fq(ctx, where, limit, columns, output_type, wide, output):
         click.echo(output_data)
 
 
+@cli.command()
+@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
+@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return per facet")
+@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--columns", "-S", help="Columns to facet on. Comma-separated, join combined facets with +")
+@click.pass_context
+def groupby(ctx, where, limit, columns, output_type, output, **kwargs):
+    """
+    Group by columns in the specified collection.
+
+    Assume a simple triple model:
+
+        linkml-store -d cl.ddb -c triple insert cl.owl
+
+    This makes a flat subject/predicate/object table
+
+    This can be grouped, e.g by subject:
+
+        linkml-store -d cl.ddb -c triple groupby -s subject
+
+    Or subject and predicate:
+
+        linkml-store -d cl.ddb -c triple groupby -s '[subject,predicate]'
+
+    """
+    collection = ctx.obj["settings"].collection
+    where_clause = yaml.safe_load(where) if where else None
+    columns = columns.split(",") if columns else None
+    if columns:
+        columns = [col.strip() for col in columns]
+        columns = [(tuple(col.split("+")) if "+" in col else col) for col in columns]
+    logger.info(f"Group by: {columns}")
+    result = collection.group_by(
+        group_by_fields=columns,
+        where_clause=where_clause,
+        agg_map={},
+        limit=limit,
+        **kwargs,
+    )
+    logger.info(f"Group by results: {result}")
+    output_data = render_output(result.rows, output_type)
+    if output:
+        with open(output, "w") as f:
+            f.write(output_data)
+        click.echo(f"Query results saved to {output}")
+    else:
+        click.echo(output_data)
+
+
 def _get_index(index_type=None, **kwargs) -> Indexer:
     if index_type is None or index_type == "simple":
         return SimpleIndexer(name="test", **kwargs)
@@ -519,10 +631,12 @@ def pivot(ctx, where, limit, index, columns, values, output_type, output):
         value_key = tuple([row.get(att) for att in value_atts])
         pivoted[index_key][column_key] = value_key
     pivoted_objs = []
+
     def detuple(t: Tuple) -> Any:
         if len(t) == 1:
             return t[0]
         return str(t)
+
     for index_key, data in pivoted.items():
         obj = {att: key for att, key in zip(index_atts, index_key)}
         for column_key, value_key in data.items():
@@ -531,6 +645,57 @@ def pivot(ctx, where, limit, index, columns, values, output_type, output):
     write_output(pivoted_objs, output_type, target=output)
 
 
+@cli.command()
+@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
+@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
+@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--sample-field", "-I", help="Field to use as the sample identifier")
+@click.option("--classification-field", "-L", help="Field to use as for classification")
+@click.option(
+    "--p-value-threshold",
+    "-P",
+    type=click.FLOAT,
+    default=0.05,
+    show_default=True,
+    help="P-value threshold for enrichment",
+)
+@click.option(
+    "--multiple-testing-correction",
+    "-M",
+    type=click.STRING,
+    default="bh",
+    show_default=True,
+    help="Multiple test correction method",
+)
+@click.argument("samples", type=click.STRING, nargs=-1)
+@click.pass_context
+def enrichment(ctx, where, limit, output_type, output, sample_field, classification_field, samples, **kwargs):
+    from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer
+
+    collection = ctx.obj["settings"].collection
+    where_clause = yaml.safe_load(where) if where else None
+    column_atts = [sample_field, classification_field]
+    results = collection.find(where_clause, select_cols=column_atts, limit=-1)
+    df = results.rows_dataframe
+    ea = EnrichmentAnalyzer(df, sample_key=sample_field, classification_key=classification_field)
+    if not samples:
+        samples = df[sample_field].unique()
+    enrichment_results = []
+    for sample in samples:
+        enriched = ea.find_enriched_categories(sample, **kwargs)
+        for e in enriched:
+            obj = {"sample": sample, **e.model_dump()}
+            enrichment_results.append(obj)
+    output_data = render_output(enrichment_results, output_type)
+    if output:
+        with open(output, "w") as f:
+            f.write(output_data)
+        click.echo(f"Search results saved to {output}")
+    else:
+        click.echo(output_data)
+
+
 @cli.command()
 @click.option("--output-type", "-O", type=format_choice, default=Format.YAML.value, help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
@@ -538,7 +703,7 @@ def pivot(ctx, where, limit, index, columns, values, output_type, output):
 @click.option(
     "--feature-attributes", "-F", type=click.STRING, help="Feature attributes for inference (comma separated)"
 )
-@click.option("--training-collection", type=click.STRING,help="Collection to use for training")
+@click.option("--training-collection", type=click.STRING, help="Collection to use for training")
 @click.option("--inference-config-file", "-Y", type=click.Path(), help="Path to inference configuration file")
 @click.option("--export-model", "-E", type=click.Path(), help="Export model to file")
 @click.option("--load-model", "-L", type=click.Path(), help="Load model from file")
@@ -753,12 +918,28 @@ def indexes(ctx):
 @cli.command()
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option(
+    "--collection-only/--no-collection-only",
+    default=False,
+    show_default=True,
+    help="Only validate specified collection",
+)
+@click.option(
+    "--ensure-referential-integrity/--no-ensure-referential-integrity",
+    default=True,
+    show_default=True,
+    help="Ensure referential integrity",
+)
 @click.pass_context
-def validate(ctx, output_type, output):
+def validate(ctx, output_type, output, collection_only, **kwargs):
     """Validate objects in the specified collection."""
-
-
-
+    if collection_only:
+        collection = ctx.obj["settings"].collection
+        logger.info(f"Validating collection {collection.alias}")
+        validation_results = [json_dumper.to_dict(x) for x in collection.iter_validate_collection(**kwargs)]
+    else:
+        db = ctx.obj["settings"].database
+        validation_results = [json_dumper.to_dict(x) for x in db.validate_database(**kwargs)]
     output_data = render_output(validation_results, output_type)
    if output:
         with open(output, "w") as f:
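
The new `-J/--json-select-query` option on `insert` and `store` is threaded through to `load_objects` as `select_query`, so that only a named sub-element of a parsed document (for example the `graphs` list of an OBO-JSON file) is treated as the objects to load. The `format_utils.load_objects` side is not shown in this hunk; a minimal sketch of the idea, with a hypothetical `select_objects` helper and a plain key lookup, would be:

```python
import json
from typing import Any, Dict, List


def select_objects(doc: Dict[str, Any], select_query: str) -> List[Dict[str, Any]]:
    # Hypothetical helper: pick the sub-element named by select_query out of the parsed document.
    selected = doc[select_query]  # e.g. "graphs" in an OBO-JSON file, "Ontology" in converted OWL-XML
    return selected if isinstance(selected, list) else [selected]


# Usage sketch mirroring: linkml-store -d cl.ddb store -J graphs cl.obo.json
with open("cl.obo.json") as f:
    objects = select_objects(json.load(f), "graphs")
```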

linkml_store/index/implementations/llm_indexer.py
CHANGED

@@ -3,6 +3,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, List, Optional
 
 import numpy as np
+import openai
 
 from linkml_store.api.config import CollectionConfig
 from linkml_store.index.indexer import INDEX_ITEM, Indexer
@@ -11,6 +12,7 @@ from linkml_store.utils.llm_utils import get_token_limit, render_formatted_text
 if TYPE_CHECKING:
     import llm
 
+CHUNK_SIZE = 1000
 
 logger = logging.getLogger(__name__)
 
@@ -25,7 +27,7 @@ class LLMIndexer(Indexer):
     >>> vector = indexer.text_to_vector("hello")
     """
 
-    embedding_model_name: str = "ada-002"
+    embedding_model_name: str = "text-embedding-ada-002"
     _embedding_model: "llm.EmbeddingModel" = None
     cached_embeddings_database: str = None
     cached_embeddings_collection: str = None
@@ -52,7 +54,9 @@ class LLMIndexer(Indexer):
         """
         return self.texts_to_vectors([text], cache=cache, **kwargs)[0]
 
-    def texts_to_vectors(
+    def texts_to_vectors(
+        self, texts: List[str], cache: bool = None, token_limit_penalty=0, **kwargs
+    ) -> List[INDEX_ITEM]:
         """
         Use LLM to embed.
 
@@ -60,18 +64,22 @@ class LLMIndexer(Indexer):
         >>> vectors = indexer.texts_to_vectors(["hello", "goodbye"])
 
         :param texts:
+        :param cache:
+        :param token_limit_penalty:
         :return:
         """
         from tiktoken import encoding_for_model
+
         logging.info(f"Converting {len(texts)} texts to vectors")
         model = self.embedding_model
         # TODO: make this more accurate
-        token_limit = get_token_limit(model.model_id) -
-
+        token_limit = get_token_limit(model.model_id) - token_limit_penalty
+        logging.info(f"Token limit for {model.model_id}: {token_limit}")
+        encoding = encoding_for_model(self.embedding_model_name)
 
         def truncate_text(text: str) -> str:
             # split into tokens every 1000 chars:
-            parts = [text[i : i +
+            parts = [text[i : i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
             truncated = render_formatted_text(
                 lambda x: "".join(x),
                 parts,
@@ -140,5 +148,5 @@ class LLMIndexer(Indexer):
             embeddings_collection.commit()
         else:
             logger.info(f"Embedding {len(texts)} texts")
-            embeddings = model.embed_multi(texts)
+            embeddings = list(model.embed_multi(texts, batch_size=1))
         return [np.array(v, dtype=float) for v in embeddings]
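
The reworked `texts_to_vectors` splits each text into `CHUNK_SIZE`-character pieces and uses a tiktoken encoding plus `get_token_limit`/`render_formatted_text` to keep the embedded text under the model's token budget. A rough standalone sketch of that truncation idea (not the package's `render_formatted_text`; the 8000-token budget here is an assumption):

```python
from tiktoken import encoding_for_model

CHUNK_SIZE = 1000  # characters per chunk, matching the constant added in this release


def truncate_to_token_limit(text: str, model_name: str = "text-embedding-ada-002", token_limit: int = 8000) -> str:
    # Keep whole chunks until adding the next one would exceed the token budget.
    encoding = encoding_for_model(model_name)
    parts = [text[i : i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
    kept, used = [], 0
    for part in parts:
        n_tokens = len(encoding.encode(part))
        if used + n_tokens > token_limit:
            break
        kept.append(part)
        used += n_tokens
    return "".join(kept)
```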
linkml_store/index/indexer.py
CHANGED
@@ -154,8 +154,11 @@ class Indexer(BaseModel):
         return str(obj)
 
     def search(
-        self,
-
+        self,
+        query: str,
+        vectors: List[Tuple[str, INDEX_ITEM]],
+        limit: Optional[int] = None,
+        mmr_relevance_factor: Optional[float] = None,
     ) -> List[Tuple[float, Any]]:
         """
         Use the indexer to search against a database of vectors.
@@ -175,8 +178,8 @@ class Indexer(BaseModel):
         vlist = [v for _, v in vectors]
         idlist = [id for id, _ in vectors]
         sorted_indices = mmr_diversified_search(
-            query_vector, vlist,
-
+            query_vector, vlist, relevance_factor=mmr_relevance_factor, top_n=limit
+        )
         results = []
         # TODO: this is inefficient when limit is high
         for i in range(limit):
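
`Indexer.search` now passes `limit` and `mmr_relevance_factor` through to `mmr_diversified_search` (defined in `linkml_store/utils/vector_utils.py`, also touched in this release). As background, a generic maximal-marginal-relevance ranking over cosine similarities looks roughly like the sketch below; this is illustrative and not the package's implementation:

```python
from typing import List

import numpy as np


def mmr_rank(query: np.ndarray, vectors: List[np.ndarray], relevance_factor: float = 0.8, top_n: int = 10) -> List[int]:
    # Greedy MMR: trade similarity to the query against similarity to items already selected.
    def cos(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

    candidates = list(range(len(vectors)))
    selected: List[int] = []
    while candidates and len(selected) < top_n:
        def mmr_score(i):
            relevance = cos(query, vectors[i])
            redundancy = max((cos(vectors[i], vectors[j]) for j in selected), default=0.0)
            return relevance_factor * relevance - (1 - relevance_factor) * redundancy

        best = max(candidates, key=mmr_score)
        selected.append(best)
        candidates.remove(best)
    return selected
```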

linkml_store/inference/implementations/llm_inference_engine.py
CHANGED

@@ -79,21 +79,24 @@ class LLMInferenceEngine(InferenceEngine):
     def _schema_str(self) -> str:
         db = self.training_data.base_collection.parent
         from linkml_runtime.dumpers import json_dumper
+
         schema_dict = json_dumper.to_dict(db.schema_view.schema)
         return yaml.dump(schema_dict)
 
-    def derive(
+    def derive(
+        self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None
+    ) -> Optional[LLMInference]:
         import llm
 
         model: llm.Model = self.model
-        #model_name = self.config.llm_config.model_name
-        #feature_attributes = self.config.feature_attributes
+        # model_name = self.config.llm_config.model_name
+        # feature_attributes = self.config.feature_attributes
         target_attributes = self.config.target_attributes
         query_text = self.object_to_text(object)
 
         if not target_attributes:
             target_attributes = [k for k, v in object.items() if v is None or v == ""]
-        #if not feature_attributes:
+        # if not feature_attributes:
         #     feature_attributes = [k for k, v in object.items() if v is not None and v != ""]
 
         system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
@@ -107,7 +110,9 @@ class LLMInferenceEngine(InferenceEngine):
             "```yaml\n"
             f"{stub}\n"
             "```\n"
-            "---\nQuery:\n"
+            "---\nQuery:\n"
+            f"## INCOMPLETE OBJECT:\n{query_text}\n"
+            "## OUTPUT:\n"
         )
         logger.info(f"Prompt: {prompt}")
         response = model.prompt(prompt, system=system_prompt)
@@ -130,9 +135,8 @@ class LLMInferenceEngine(InferenceEngine):
                 "\nThis was invalid.\n",
                 "Validation errors:\n",
             ] + [self.object_to_text(e) for e in errs]
-            return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
-        return LLMInference(predicted_object=predicted_object, iterations=iteration+1, query=object)
-
+            return self.derive(object, iteration=iteration + 1, additional_prompt_texts=extra_texts)
+        return LLMInference(predicted_object=predicted_object, iterations=iteration + 1, query=object)
 
     def export_model(
         self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
@@ -149,4 +153,4 @@ class LLMInferenceEngine(InferenceEngine):
 
     @classmethod
     def load_model(cls, file_path: Union[str, Path]) -> "LLMInferenceEngine":
-        raise NotImplementedError("Does not make sense for this engine")
+        raise NotImplementedError("Does not make sense for this engine")
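
The `derive` reformatting keeps the engine's retry pattern intact: the model's YAML answer is parsed and validated, and on validation errors the error text is appended to the prompt and `derive` recurses with `iteration + 1`. Schematically (a simplified standalone sketch, with hypothetical `prompt_model` and `validate` callables rather than the class's own methods):

```python
from typing import Callable, Dict, List, Optional

import yaml


def derive_with_retries(
    obj: Dict,
    prompt_model: Callable[[Dict, List[str]], str],
    validate: Callable[[Dict], List[str]],
    max_iterations: int = 3,
) -> Optional[Dict]:
    # Ask the model, validate the parsed YAML, and re-ask with the errors until it validates.
    extra_texts: List[str] = []
    for _ in range(max_iterations):
        predicted = yaml.safe_load(prompt_model(obj, extra_texts))
        errs = validate(predicted)
        if not errs:
            return predicted
        extra_texts = ["Your previous attempt was invalid.\n", "Validation errors:\n"] + errs
    return None
```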

linkml_store/inference/implementations/rag_inference_engine.py
CHANGED

@@ -111,7 +111,9 @@ class RAGInferenceEngine(InferenceEngine):
     def object_to_text(self, object: OBJECT) -> str:
         return yaml.dump(object)
 
-    def derive(
+    def derive(
+        self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None
+    ) -> Optional[RAGInference]:
         import llm
         from tiktoken import encoding_for_model
 
@@ -131,8 +133,9 @@ class RAGInferenceEngine(InferenceEngine):
         if not self.rag_collection.indexers:
             raise ValueError("RAG collection must have an indexer attached")
         logger.info(f"Searching {self.rag_collection.alias} for examples for: {query_text}")
-        rs = self.rag_collection.search(
-
+        rs = self.rag_collection.search(
+            query_text, limit=num_examples, index_name="llm", mmr_relevance_factor=mmr_relevance_factor
+        )
         examples = rs.rows
         logger.info(f"Found {len(examples)} examples")
         if not examples:
@@ -153,11 +156,11 @@ class RAGInferenceEngine(InferenceEngine):
             input_obj_text = self.object_to_text(input_obj)
             if input_obj_text == query_text:
                 continue
-                #raise ValueError(
+                # raise ValueError(
                 # f"Query object {query_text} is the same as example object {input_obj_text}\n"
                 # "This indicates possible test data leakage\n."
                 # "TODO: allow an option that allows user to treat this as a basic lookup\n"
-                #)
+                # )
             output_obj = select_nested(example, target_attributes)
             prompt_clause = (
                 "---\nExample:\n" f"## INPUT:\n{input_obj_text}\n" f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
@@ -176,9 +179,9 @@ class RAGInferenceEngine(InferenceEngine):
         except KeyError:
             encoding = encoding_for_model("gpt-4")
         token_limit = get_token_limit(model_name)
-        prompt = render_formatted_text(
-
-
+        prompt = render_formatted_text(
+            make_text, values=prompt_clauses, encoding=encoding, token_limit=token_limit, additional_text=system_prompt
+        )
         logger.info(f"Prompt: {prompt}")
         response = model.prompt(prompt, system=system_prompt)
         yaml_str = response.text()
@@ -199,8 +202,8 @@ class RAGInferenceEngine(InferenceEngine):
                 "\nThis was invalid.\n",
                 "Validation errors:\n",
             ] + [self.object_to_text(e) for e in errs]
-            return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
-        return RAGInference(predicted_object=predicted_object, iterations=iteration+1, query=object)
+            return self.derive(object, iteration=iteration + 1, additional_prompt_texts=extra_texts)
+        return RAGInference(predicted_object=predicted_object, iterations=iteration + 1, query=object)
 
     def _parse_yaml_payload(self, yaml_str: str, strict=False) -> Optional[OBJECT]:
         if "```" in yaml_str:
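
The RAG path retrieves the nearest examples from the indexed `rag_collection` (now with `mmr_relevance_factor` forwarded to the search) and formats each one as an INPUT/OUTPUT clause ahead of the query object. A minimal sketch of that prompt assembly, assuming the retrieved rows are plain dicts and splitting each example on the target attributes (the helper name and the input/output split are illustrative, not the engine's exact logic):

```python
from typing import Dict, List

import yaml


def build_rag_prompt(query_obj: Dict, examples: List[Dict], target_attributes: List[str]) -> str:
    # Format retrieved examples as INPUT/OUTPUT pairs, then append the incomplete query object.
    clauses = []
    for example in examples:
        output_obj = {k: v for k, v in example.items() if k in target_attributes}
        input_obj = {k: v for k, v in example.items() if k not in target_attributes}
        clauses.append(
            "---\nExample:\n"
            f"## INPUT:\n{yaml.dump(input_obj)}\n"
            f"## OUTPUT:\n{yaml.dump(output_obj)}\n"
        )
    clauses.append("---\nQuery:\n" f"## INPUT:\n{yaml.dump(query_obj)}\n" "## OUTPUT:\n")
    return "".join(clauses)
```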

linkml_store/inference/implementations/sklearn_inference_engine.py
CHANGED

@@ -94,6 +94,8 @@ class SklearnInferenceEngine(InferenceEngine):
         if not feature_cols:
             feature_cols = df.columns.difference(target_cols).tolist()
             self.config.feature_attributes = feature_cols
+        if not feature_cols:
+            raise ValueError("No features found in the data")
         target_col = target_cols[0]
         logger.info(f"Feature columns: {feature_cols}")
         X = df[feature_cols].copy()
@@ -102,6 +104,8 @@ class SklearnInferenceEngine(InferenceEngine):
 
         # find list of features to skip (categorical with > N categories)
         skip_features = []
+        if not len(X.columns):
+            raise ValueError("No features to train on")
         for col in X.columns:
             unique_values = self._get_unique_values(X[col])
             if len(unique_values) > self.maximum_proportion_distinct_features * len(X[col]):
@@ -115,6 +119,8 @@ class SklearnInferenceEngine(InferenceEngine):
 
         # Encode features
         encoded_features = []
+        if not len(X.columns):
+            raise ValueError(f"No features to train on from after skipping {skip_features}")
         for col in X.columns:
             logger.info(f"Checking whether to encode: {col}")
             col_encoder = self._get_encoder(X[col])
@@ -153,7 +159,7 @@ class SklearnInferenceEngine(InferenceEngine):
         y = y_encoder.fit_transform(y.values.ravel())  # Convert to 1D numpy array
         self.transformed_targets = y_encoder.classes_
 
-        # print(f"Fitting model with features: {X.columns}")
+        # print(f"Fitting model with features: {X.columns}, y={y}, X={X}")
         clf = DecisionTreeClassifier(random_state=42)
         clf.fit(X, y)
         self.classifier = clf
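
The new guards make the sklearn engine fail fast when no usable feature columns remain (for example after high-cardinality categoricals are skipped). The surrounding training flow shown in these hunks, encoding categorical columns, label-encoding the target, and fitting a decision tree, is roughly as sketched below; the encoding choice and column handling here are illustrative, not the engine's exact logic:

```python
from typing import List

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier


def fit_decision_tree(df: pd.DataFrame, feature_cols: List[str], target_col: str) -> DecisionTreeClassifier:
    # Label-encode object-typed feature columns and the target, then fit a tree.
    if not feature_cols:
        raise ValueError("No features found in the data")
    X = df[feature_cols].copy()
    for col in X.columns:
        if X[col].dtype == object:
            X[col] = LabelEncoder().fit_transform(X[col].astype(str))
    y = LabelEncoder().fit_transform(df[target_col].astype(str))
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X, y)
    return clf
```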

linkml_store/inference/inference_config.py
CHANGED

@@ -59,6 +59,7 @@ class Inference(BaseModel, extra="forbid"):
     """
     Result of an inference derivation.
     """
+
     query: Optional[OBJECT] = Field(default=None, description="The query object.")
     predicted_object: OBJECT = Field(..., description="The predicted object.")
     confidence: Optional[float] = Field(default=None, description="The confidence of the prediction.", le=1.0, ge=0.0)