linkml-store 0.2.6__py3-none-any.whl → 0.2.10rc1__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of linkml-store has been flagged; see the registry listing for details.

Files changed (35)
  1. linkml_store/api/client.py +2 -3
  2. linkml_store/api/collection.py +63 -8
  3. linkml_store/api/database.py +20 -3
  4. linkml_store/api/stores/duckdb/duckdb_collection.py +168 -4
  5. linkml_store/api/stores/duckdb/duckdb_database.py +5 -5
  6. linkml_store/api/stores/filesystem/__init__.py +1 -1
  7. linkml_store/api/stores/filesystem/filesystem_database.py +1 -1
  8. linkml_store/api/stores/mongodb/mongodb_collection.py +132 -15
  9. linkml_store/api/stores/mongodb/mongodb_database.py +2 -1
  10. linkml_store/api/stores/neo4j/neo4j_database.py +1 -1
  11. linkml_store/api/stores/solr/solr_collection.py +107 -18
  12. linkml_store/cli.py +201 -21
  13. linkml_store/index/implementations/llm_indexer.py +13 -6
  14. linkml_store/index/indexer.py +9 -5
  15. linkml_store/inference/implementations/llm_inference_engine.py +15 -13
  16. linkml_store/inference/implementations/rag_inference_engine.py +13 -10
  17. linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
  18. linkml_store/inference/inference_config.py +2 -1
  19. linkml_store/inference/inference_engine.py +1 -1
  20. linkml_store/plotting/__init__.py +5 -0
  21. linkml_store/plotting/cli.py +172 -0
  22. linkml_store/plotting/heatmap.py +356 -0
  23. linkml_store/utils/dat_parser.py +95 -0
  24. linkml_store/utils/enrichment_analyzer.py +217 -0
  25. linkml_store/utils/format_utils.py +124 -3
  26. linkml_store/utils/llm_utils.py +4 -2
  27. linkml_store/utils/object_utils.py +9 -3
  28. linkml_store/utils/pandas_utils.py +1 -1
  29. linkml_store/utils/sql_utils.py +1 -1
  30. linkml_store/utils/vector_utils.py +3 -10
  31. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10rc1.dist-info}/METADATA +3 -1
  32. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10rc1.dist-info}/RECORD +35 -30
  33. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10rc1.dist-info}/WHEEL +1 -1
  34. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10rc1.dist-info}/LICENSE +0 -0
  35. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10rc1.dist-info}/entry_points.txt +0 -0
linkml_store/inference/implementations/llm_inference_engine.py
@@ -1,18 +1,16 @@
- import json
  import logging
  from dataclasses import dataclass
  from pathlib import Path
  from typing import ClassVar, List, Optional, TextIO, Union

  import yaml
- from linkml_store.utils.llm_utils import parse_yaml_payload
  from llm import get_key
  from pydantic import BaseModel

- from linkml_store.api.collection import OBJECT, Collection
+ from linkml_store.api.collection import OBJECT
  from linkml_store.inference.inference_config import Inference, InferenceConfig, LLMConfig
  from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization
- from linkml_store.utils.object_utils import select_nested
+ from linkml_store.utils.llm_utils import parse_yaml_payload

  logger = logging.getLogger(__name__)

@@ -79,21 +77,24 @@ class LLMInferenceEngine(InferenceEngine):
  def _schema_str(self) -> str:
  db = self.training_data.base_collection.parent
  from linkml_runtime.dumpers import json_dumper
+
  schema_dict = json_dumper.to_dict(db.schema_view.schema)
  return yaml.dump(schema_dict)

- def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[LLMInference]:
+ def derive(
+ self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None
+ ) -> Optional[LLMInference]:
  import llm

  model: llm.Model = self.model
- #model_name = self.config.llm_config.model_name
- #feature_attributes = self.config.feature_attributes
+ # model_name = self.config.llm_config.model_name
+ # feature_attributes = self.config.feature_attributes
  target_attributes = self.config.target_attributes
  query_text = self.object_to_text(object)

  if not target_attributes:
  target_attributes = [k for k, v in object.items() if v is None or v == ""]
- #if not feature_attributes:
+ # if not feature_attributes:
  # feature_attributes = [k for k, v in object.items() if v is not None and v != ""]

  system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
@@ -107,7 +108,9 @@ class LLMInferenceEngine(InferenceEngine):
  "```yaml\n"
  f"{stub}\n"
  "```\n"
- "---\nQuery:\n" f"## INCOMPLETE OBJECT:\n{query_text}\n" "## OUTPUT:\n"
+ "---\nQuery:\n"
+ f"## INCOMPLETE OBJECT:\n{query_text}\n"
+ "## OUTPUT:\n"
  )
  logger.info(f"Prompt: {prompt}")
  response = model.prompt(prompt, system=system_prompt)
@@ -130,9 +133,8 @@ class LLMInferenceEngine(InferenceEngine):
  "\nThis was invalid.\n",
  "Validation errors:\n",
  ] + [self.object_to_text(e) for e in errs]
- return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
- return LLMInference(predicted_object=predicted_object, iterations=iteration+1, query=object)
-
+ return self.derive(object, iteration=iteration + 1, additional_prompt_texts=extra_texts)
+ return LLMInference(predicted_object=predicted_object, iterations=iteration + 1, query=object)

  def export_model(
  self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
@@ -149,4 +151,4 @@ class LLMInferenceEngine(InferenceEngine):

  @classmethod
  def load_model(cls, file_path: Union[str, Path]) -> "LLMInferenceEngine":
- raise NotImplementedError("Does not make sense for this engine")
+ raise NotImplementedError("Does not make sense for this engine")
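The derive signature above is only reflowed, not changed. A rough usage sketch of how it is called and what comes back (the engine setup is assumed and not shown in this diff; only the derive call and the LLMInference fields come from the code above):

# engine: an already-configured LLMInferenceEngine (construction not shown in this diff)
incomplete = {"genus": "Felis", "species": None}   # keys with None/"" values become the target attributes
inference = engine.derive(incomplete)
if inference is not None:                          # derive returns Optional[LLMInference]
    print(inference.predicted_object)              # object parsed from the model's YAML reply
    print(inference.iterations)                    # attempts made, counting validation-driven retries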
linkml_store/inference/implementations/rag_inference_engine.py
@@ -111,7 +111,9 @@ class RAGInferenceEngine(InferenceEngine):
  def object_to_text(self, object: OBJECT) -> str:
  return yaml.dump(object)

- def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[RAGInference]:
+ def derive(
+ self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None
+ ) -> Optional[RAGInference]:
  import llm
  from tiktoken import encoding_for_model

@@ -131,8 +133,9 @@ class RAGInferenceEngine(InferenceEngine):
  if not self.rag_collection.indexers:
  raise ValueError("RAG collection must have an indexer attached")
  logger.info(f"Searching {self.rag_collection.alias} for examples for: {query_text}")
- rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm",
- mmr_relevance_factor=mmr_relevance_factor)
+ rs = self.rag_collection.search(
+ query_text, limit=num_examples, index_name="llm", mmr_relevance_factor=mmr_relevance_factor
+ )
  examples = rs.rows
  logger.info(f"Found {len(examples)} examples")
  if not examples:
@@ -153,11 +156,11 @@ class RAGInferenceEngine(InferenceEngine):
  input_obj_text = self.object_to_text(input_obj)
  if input_obj_text == query_text:
  continue
- #raise ValueError(
+ # raise ValueError(
  # f"Query object {query_text} is the same as example object {input_obj_text}\n"
  # "This indicates possible test data leakage\n."
  # "TODO: allow an option that allows user to treat this as a basic lookup\n"
- #)
+ # )
  output_obj = select_nested(example, target_attributes)
  prompt_clause = (
  "---\nExample:\n" f"## INPUT:\n{input_obj_text}\n" f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
@@ -176,9 +179,9 @@ class RAGInferenceEngine(InferenceEngine):
  except KeyError:
  encoding = encoding_for_model("gpt-4")
  token_limit = get_token_limit(model_name)
- prompt = render_formatted_text(make_text, values=prompt_clauses,
- encoding=encoding, token_limit=token_limit,
- additional_text=system_prompt)
+ prompt = render_formatted_text(
+ make_text, values=prompt_clauses, encoding=encoding, token_limit=token_limit, additional_text=system_prompt
+ )
  logger.info(f"Prompt: {prompt}")
  response = model.prompt(prompt, system=system_prompt)
  yaml_str = response.text()
@@ -199,8 +202,8 @@ class RAGInferenceEngine(InferenceEngine):
  "\nThis was invalid.\n",
  "Validation errors:\n",
  ] + [self.object_to_text(e) for e in errs]
- return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
- return RAGInference(predicted_object=predicted_object, iterations=iteration+1, query=object)
+ return self.derive(object, iteration=iteration + 1, additional_prompt_texts=extra_texts)
+ return RAGInference(predicted_object=predicted_object, iterations=iteration + 1, query=object)

  def _parse_yaml_payload(self, yaml_str: str, strict=False) -> Optional[OBJECT]:
  if "```" in yaml_str:
linkml_store/inference/implementations/sklearn_inference_engine.py
@@ -94,6 +94,8 @@ class SklearnInferenceEngine(InferenceEngine):
  if not feature_cols:
  feature_cols = df.columns.difference(target_cols).tolist()
  self.config.feature_attributes = feature_cols
+ if not feature_cols:
+ raise ValueError("No features found in the data")
  target_col = target_cols[0]
  logger.info(f"Feature columns: {feature_cols}")
  X = df[feature_cols].copy()
@@ -102,6 +104,8 @@ class SklearnInferenceEngine(InferenceEngine):

  # find list of features to skip (categorical with > N categories)
  skip_features = []
+ if not len(X.columns):
+ raise ValueError("No features to train on")
  for col in X.columns:
  unique_values = self._get_unique_values(X[col])
  if len(unique_values) > self.maximum_proportion_distinct_features * len(X[col]):
@@ -115,6 +119,8 @@ class SklearnInferenceEngine(InferenceEngine):

  # Encode features
  encoded_features = []
+ if not len(X.columns):
+ raise ValueError(f"No features to train on from after skipping {skip_features}")
  for col in X.columns:
  logger.info(f"Checking whether to encode: {col}")
  col_encoder = self._get_encoder(X[col])
@@ -153,7 +159,7 @@ class SklearnInferenceEngine(InferenceEngine):
  y = y_encoder.fit_transform(y.values.ravel()) # Convert to 1D numpy array
  self.transformed_targets = y_encoder.classes_

- # print(f"Fitting model with features: {X.columns}")
+ # print(f"Fitting model with features: {X.columns}, y={y}, X={X}")
  clf = DecisionTreeClassifier(random_state=42)
  clf.fit(X, y)
  self.classifier = clf
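The three new guards fail fast with a clear message instead of letting an empty feature matrix reach DecisionTreeClassifier.fit. A small illustration of the first condition (the toy DataFrame is an assumption; the column-difference expression is the one used in the hunk):

import pandas as pd

# A frame that only contains the target column, so there is nothing to train on.
df = pd.DataFrame({"label": ["a", "b", "a"]})
target_cols = ["label"]

feature_cols = df.columns.difference(target_cols).tolist()
print(feature_cols)  # [] -> the engine now raises ValueError("No features found in the data")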
linkml_store/inference/inference_config.py
@@ -1,5 +1,5 @@
  import logging
- from typing import List, Optional, Tuple, Any
+ from typing import Any, List, Optional, Tuple

  from pydantic import BaseModel, ConfigDict, Field

@@ -59,6 +59,7 @@ class Inference(BaseModel, extra="forbid"):
  """
  Result of an inference derivation.
  """
+
  query: Optional[OBJECT] = Field(default=None, description="The query object.")
  predicted_object: OBJECT = Field(..., description="The predicted object.")
  confidence: Optional[float] = Field(default=None, description="The confidence of the prediction.", le=1.0, ge=0.0)
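The added blank line is cosmetic. For reference, the Inference result model shown above can be instantiated directly; a minimal sketch (the values are made up, the field names and the 0-1 bound on confidence come from this hunk):

from linkml_store.inference.inference_config import Inference

result = Inference(
    query={"genus": "Felis", "species": None},
    predicted_object={"genus": "Felis", "species": "catus"},
    confidence=0.9,  # validated against ge=0.0, le=1.0
)
# extra="forbid" on the model means unexpected keyword arguments raise a pydantic ValidationError.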
linkml_store/inference/inference_engine.py
@@ -4,7 +4,7 @@ from abc import ABC
  from dataclasses import dataclass
  from enum import Enum
  from pathlib import Path
- from typing import Optional, TextIO, Tuple, Union, Any
+ from typing import Any, Optional, TextIO, Tuple, Union

  import pandas as pd
  from pydantic import BaseModel, ConfigDict
linkml_store/plotting/__init__.py (new file)
@@ -0,0 +1,5 @@
+ """
+ Visualization and plotting functions for LinkML data.
+ """
+
+ __version__ = "0.1.0"
linkml_store/plotting/cli.py (new file)
@@ -0,0 +1,172 @@
+ """
+ Command-line interface for the plotting package.
+ """
+
+ import logging
+ from pathlib import Path
+ from typing import Optional, Union
+
+ import click
+
+ from linkml_store.plotting.heatmap import heatmap_from_file, export_heatmap_data
+ from linkml_store.utils.format_utils import Format
+
+ logger = logging.getLogger(__name__)
+
+
+ @click.group()
+ def plot_cli():
+ """Plotting utilities for LinkML data."""
+ pass
+
+
+ @plot_cli.command()
+ @click.argument("input_file", required=False)
+ @click.option("--x-column", "-x", required=True, help="Column to use for x-axis")
+ @click.option("--y-column", "-y", required=True, help="Column to use for y-axis")
+ @click.option("--value-column", "-v", help="Column containing values (if not provided, counts will be used)")
+ @click.option("--title", "-t", help="Title for the heatmap")
+ @click.option("--width", "-w", type=int, default=10, show_default=True, help="Width of the figure in inches")
+ @click.option("--height", "-h", type=int, default=8, show_default=True, help="Height of the figure in inches")
+ @click.option("--cmap", "-c", default="YlGnBu", show_default=True, help="Colormap to use")
+ @click.option("--output", "-o", required=True, help="Output file path")
+ @click.option("--format", "-f", help="Input file format")
+ @click.option("--dpi", type=int, default=300, show_default=True, help="DPI for output image")
+ @click.option("--square/--no-square", default=False, show_default=True, help="Make cells square")
+ @click.option("--annotate/--no-annotate", default=True, show_default=True, help="Annotate cells with values")
+ @click.option("--font-size", type=int, default=10, show_default=True, help="Font size for annotations and labels")
+ @click.option("--robust/--no-robust", default=False, show_default=True, help="Use robust quantiles for colormap scaling")
+ @click.option("--remove-duplicates/--no-remove-duplicates", default=True, show_default=True,
+ help="Remove duplicate x,y combinations (default) or keep all occurrences")
+ @click.option("--cluster", type=click.Choice(["none", "both", "x", "y"]), default="none", show_default=True,
+ help="Cluster axes: none (default), both, x-axis only, or y-axis only")
+ @click.option("--cluster-method", type=click.Choice(["complete", "average", "single", "ward"]), default="complete", show_default=True,
+ help="Linkage method for hierarchical clustering")
+ @click.option("--cluster-metric", type=click.Choice(["euclidean", "correlation", "cosine", "cityblock"]), default="euclidean", show_default=True,
+ help="Distance metric for clustering")
+ @click.option("--export-data", "-e", help="Export the heatmap data to this file")
+ @click.option("--export-format", "-E", type=click.Choice([f.value for f in Format]), default="csv", show_default=True,
+ help="Format for exported data")
+ def heatmap(
+ input_file: Optional[str],
+ x_column: str,
+ y_column: str,
+ value_column: Optional[str],
+ title: Optional[str],
+ width: int,
+ height: int,
+ cmap: str,
+ output: str,
+ format: Optional[str],
+ dpi: int,
+ square: bool,
+ annotate: bool,
+ font_size: int,
+ robust: bool,
+ remove_duplicates: bool,
+ cluster: str,
+ cluster_method: str,
+ cluster_metric: str,
+ export_data: Optional[str],
+ export_format: Union[str, Format],
+ ):
+ """
+ Create a heatmap from a tabular data file.
+
+ Examples:
+ # From a file
+ linkml-store plot heatmap data.csv -x species -y country -o heatmap.png
+
+ # From stdin
+ cat data.csv | linkml-store plot heatmap -x species -y country -o heatmap.png
+
+ This will create a heatmap showing the frequency counts of species by country.
+ If you want to use a specific value column instead of counts:
+
+ linkml-store plot heatmap data.csv -x species -y country -v population -o heatmap.png
+ """
+ # Handle file path - if None, use stdin
+ if input_file is None:
+ input_file = "-" # format_utils treats "-" as stdin
+
+ # Convert 'none' to False for clustering parameter
+ use_cluster = False if cluster == "none" else cluster
+
+ # Create heatmap visualization
+ fig, ax = heatmap_from_file(
+ file_path=input_file,
+ x_column=x_column,
+ y_column=y_column,
+ value_column=value_column,
+ title=title,
+ figsize=(width, height),
+ cmap=cmap,
+ output_file=output,
+ format=format,
+ dpi=dpi,
+ square=square,
+ annot=annotate,
+ font_size=font_size,
+ robust=robust,
+ remove_duplicates=remove_duplicates,
+ cluster=use_cluster,
+ cluster_method=cluster_method,
+ cluster_metric=cluster_metric,
+ )
+
+ # Export data if requested
+ if export_data:
+ # For export, reuse the data already loaded for the heatmap instead of loading again
+ # This avoids the "I/O operation on closed file" error when input_file is stdin
+ import pandas as pd
+ from matplotlib.axes import Axes
+
+ # Extract the data directly from the plot
+ if hasattr(ax, 'get_figure') and hasattr(ax, 'get_children'):
+ # Extract the heatmap data from the plot itself
+ heatmap_data = {}
+ for child in ax.get_children():
+ if isinstance(child, plt.matplotlib.collections.QuadMesh):
+ # Get the colormap data
+ data_values = child.get_array()
+ rows = ax.get_yticks()
+ cols = ax.get_xticks()
+ row_labels = [item.get_text() for item in ax.get_yticklabels()]
+ col_labels = [item.get_text() for item in ax.get_xticklabels()]
+
+ # Create a dataframe from the plot data
+ heatmap_df = pd.DataFrame(
+ index=[label for label in row_labels if label],
+ columns=[label for label in col_labels if label]
+ )
+
+ # Fill in the values (if we can)
+ if len(data_values) == len(row_labels) * len(col_labels):
+ for i, row in enumerate(row_labels):
+ for j, col in enumerate(col_labels):
+ if row and col: # Skip empty labels
+ idx = i * len(col_labels) + j
+ if idx < len(data_values):
+ heatmap_df.at[row, col] = data_values[idx]
+
+ # Reset index to make the y_column a regular column
+ result_df = heatmap_df.reset_index()
+ result_df.rename(columns={'index': y_column}, inplace=True)
+
+ # Export the data
+ from linkml_store.utils.format_utils import write_output
+ records = result_df.to_dict(orient='records')
+ write_output(records, format=export_format, target=export_data)
+ click.echo(f"Heatmap data exported to {export_data}")
+ break
+ else:
+ # If we couldn't extract data from the plot, inform the user
+ click.echo("Warning: Could not export data from the plot")
+ else:
+ click.echo("Warning: Could not export data from the plot")
+
+ click.echo(f"Heatmap created at {output}")
+
+
+ if __name__ == "__main__":
+ plot_cli()
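The heatmap command delegates all plotting to heatmap_from_file, so the same figure can be produced programmatically. A sketch using only the parameter names that appear in the call above (the data file and column names are assumptions):

from linkml_store.plotting.heatmap import heatmap_from_file

# Mirrors the CLI call above; "-" as file_path reads stdin, value_column=None means frequency counts.
fig, ax = heatmap_from_file(
    file_path="data.csv",
    x_column="species",
    y_column="country",
    value_column=None,
    output_file="heatmap.png",
    cmap="YlGnBu",
    figsize=(10, 8),
)

Note that the export branch above references plt.matplotlib.collections.QuadMesh but the module never imports matplotlib.pyplot as plt, so the --export-data path would raise a NameError as written.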