linkml-store 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.

Note: this release of linkml-store has been flagged as potentially problematic.

Files changed (27):
  1. linkml_store/api/client.py +35 -8
  2. linkml_store/api/collection.py +40 -5
  3. linkml_store/api/config.py +20 -3
  4. linkml_store/api/database.py +24 -3
  5. linkml_store/api/stores/duckdb/duckdb_collection.py +3 -0
  6. linkml_store/api/stores/mongodb/mongodb_collection.py +4 -0
  7. linkml_store/cli.py +149 -13
  8. linkml_store/inference/__init__.py +13 -0
  9. linkml_store/inference/evaluation.py +189 -0
  10. linkml_store/inference/implementations/__init__.py +0 -0
  11. linkml_store/inference/implementations/rag_inference_engine.py +145 -0
  12. linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
  13. linkml_store/inference/implementations/sklearn_inference_engine.py +308 -0
  14. linkml_store/inference/inference_config.py +62 -0
  15. linkml_store/inference/inference_engine.py +200 -0
  16. linkml_store/inference/inference_engine_registry.py +74 -0
  17. linkml_store/utils/format_utils.py +27 -90
  18. linkml_store/utils/llm_utils.py +96 -0
  19. linkml_store/utils/object_utils.py +103 -2
  20. linkml_store/utils/pandas_utils.py +55 -2
  21. linkml_store/utils/sklearn_utils.py +193 -0
  22. linkml_store/utils/stats_utils.py +53 -0
  23. {linkml_store-0.1.13.dist-info → linkml_store-0.2.0.dist-info}/METADATA +28 -2
  24. {linkml_store-0.1.13.dist-info → linkml_store-0.2.0.dist-info}/RECORD +27 -15
  25. {linkml_store-0.1.13.dist-info → linkml_store-0.2.0.dist-info}/LICENSE +0 -0
  26. {linkml_store-0.1.13.dist-info → linkml_store-0.2.0.dist-info}/WHEEL +0 -0
  27. {linkml_store-0.1.13.dist-info → linkml_store-0.2.0.dist-info}/entry_points.txt +0 -0
linkml_store/inference/evaluation.py (new file)
@@ -0,0 +1,189 @@
+ import logging
+ from collections.abc import Callable
+ from typing import Any, List, Optional
+
+ import numpy as np
+ import pandas as pd
+ from pydantic import BaseModel
+
+ from linkml_store.inference import InferenceEngine
+ from linkml_store.utils.object_utils import select_nested
+
+ logger = logging.getLogger(__name__)
+
+
+ def score_match(target: Optional[Any], candidate: Optional[Any], match_function: Optional[Callable] = None) -> float:
+     """
+     Compute a similarity score between two objects.
+
+     >>> score_match("a", "a")
+     1.0
+     >>> score_match("a", "b")
+     0.0
+     >>> score_match("a", None)
+     0.0
+     >>> score_match(None, "a")
+     0.0
+     >>> score_match(None, None)
+     1.0
+     >>> score_match(["a", "b"], ["a", "b"])
+     1.0
+     >>> score_match(["a", "b"], ["b", "a"])
+     1.0
+     >>> round(score_match(["a"], ["b", "a"]), 2)
+     0.67
+     >>> score_match({"a": 1}, {"a": 1})
+     1.0
+     >>> score_match({"a": 1}, {"a": 2})
+     0.0
+     >>> score_match({"a": 1, "b": None}, {"a": 1})
+     1.0
+     >>> score_match([{"a": 1, "b": 2}, {"a": 3, "b": 4}], [{"a": 1, "b": 2}, {"a": 3, "b": 4}])
+     1.0
+     >>> score_match([{"a": 1, "b": 4}, {"a": 3, "b": 2}], [{"a": 1, "b": 2}, {"a": 3, "b": 4}])
+     0.5
+     >>> def char_match(x, y):
+     ...     return len(set(x).intersection(set(y))) / len(set(x).union(set(y)))
+     >>> score_match("abcd", "abc", char_match)
+     0.75
+     >>> score_match(["abcd", "efgh"], ["ac", "gh"], char_match)
+     0.5
+
+     :param target: the expected object (scalar, list, dict, or None)
+     :param candidate: the predicted object to compare against the target
+     :param match_function: optional custom scorer for leaf values; defaults to exact equality
+     :return: a score between 0.0 (no match) and 1.0 (perfect match)
+     """
+     if target == candidate:
+         return 1.0
+     if target is None or candidate is None:
+         return 0.0
+     if isinstance(target, (set, list)) and isinstance(candidate, (set, list)):
+         # Build an all-by-all score matrix, then average each element's
+         # best match, taken from both directions, over the combined length.
+         score_matrix = np.array([[score_match(t, c, match_function) for c in candidate] for t in target])
+         best_matches0 = np.max(score_matrix, axis=0)
+         best_matches1 = np.max(score_matrix, axis=1)
+         return (np.sum(best_matches0) + np.sum(best_matches1)) / (len(target) + len(candidate))
+     if isinstance(target, dict) and isinstance(candidate, dict):
+         keys = set(target.keys()).union(candidate.keys())
+         scores = [score_match(target.get(k), candidate.get(k), match_function) for k in keys]
+         return np.mean(scores)
+     if match_function:
+         return match_function(target, candidate)
+     return 0.0
+
+
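To make the list-scoring branch above concrete, here is the arithmetic behind the 0.67 doctest, as a small sketch:

from linkml_store.inference.evaluation import score_match

# The score matrix for target ["a"] vs candidate ["b", "a"] is [[0.0, 1.0]].
# Column maxima are [0.0, 1.0]; row maxima are [1.0].
# All best matches are averaged together: (0.0 + 1.0 + 1.0) / (1 + 2) = 0.666...
print(round(score_match(["a"], ["b", "a"]), 2))  # 0.67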
+ class Outcome(BaseModel):
+     true_positive_count: float
+     total_count: int
+
+     @property
+     def accuracy(self) -> float:
+         return self.true_positive_count / self.total_count
+
+
+ def evaluate_predictor(
+     predictor: InferenceEngine,
+     target_attributes: List[str],
+     feature_attributes: Optional[List[str]] = None,
+     test_data: Optional[pd.DataFrame] = None,
+     evaluation_count: Optional[int] = 10,
+     match_function: Optional[Callable] = None,
+ ) -> Outcome:
+     """
+     Evaluate a predictor by comparing its predictions to the expected values in the test data.
+
+     :param predictor: the inference engine to evaluate
+     :param target_attributes: attributes the predictor is expected to derive
+     :param feature_attributes: attributes to present as input; defaults to all non-target attributes
+     :param test_data: held-out rows; defaults to the predictor's own testing split
+     :param evaluation_count: maximum number of rows to evaluate, or None for all
+     :param match_function: optional custom scorer passed through to score_match
+     :return: an Outcome with the summed match scores and the number of rows evaluated
+     """
+     n = 0
+     tp = 0
+     if test_data is None:
+         test_data = predictor.testing_data.as_dataframe()
+     for row in test_data.to_dict(orient="records"):
+         expected_obj = select_nested(row, target_attributes)
+         if feature_attributes:
+             test_obj = select_nested(row, feature_attributes)
+         else:
+             test_obj = {k: v for k, v in row.items() if k not in target_attributes}
+         result = predictor.derive(test_obj)
+         # derive may return None if no prediction could be made; score as 0
+         if result is not None:
+             logger.info(f"Predicted: {result.predicted_object} Expected: {expected_obj}")
+             tp += score_match(result.predicted_object, expected_obj, match_function)
+         n += 1
+         if evaluation_count is not None and n >= evaluation_count:
+             break
+     return Outcome(true_positive_count=tp, total_count=n)
+
+
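A hedged, self-contained sketch of evaluate_predictor in use. ConstantEngine is a hypothetical stand-in that exposes only the derive() interface; any real InferenceEngine from this package could be dropped in instead, and the rows are illustrative:

import pandas as pd

from linkml_store.inference.evaluation import evaluate_predictor, score_text_overlap
from linkml_store.inference.inference_config import Inference

class ConstantEngine:  # hypothetical stand-in with a derive() method
    def derive(self, obj):
        return Inference(predicted_object={"capital": "Montevideo"})

test_df = pd.DataFrame([
    {"name": "Uruguay", "capital": "Montevideo"},
    {"name": "France", "capital": "Paris"},
])
outcome = evaluate_predictor(
    ConstantEngine(),
    target_attributes=["capital"],
    feature_attributes=["name"],
    test_data=test_df,
    match_function=score_text_overlap,  # partial credit for near-miss strings
)
print(outcome.accuracy)  # 1.0 for Uruguay, ~0.1 for France -> ~0.55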
+ def score_text_overlap(str1: Any, str2: Any) -> float:
+     """
+     Compute the overlap score between two strings: the length of the longest
+     common substring divided by the length of the longer string.
+
+     :param str1: the first string
+     :param str2: the second string
+     :return: a score between 0.0 and 1.0
+     """
+     if str1 == str2:
+         return 1.0
+     if not str1 or not str2:
+         return 0.0
+     overlap, _ = find_longest_overlap(str1, str2)
+     return len(overlap) / max(len(str1), len(str2))
+
+
+ def find_longest_overlap(str1: str, str2: str) -> tuple:
+     """
+     Find the longest overlapping substring between two strings.
+
+     Args:
+         str1 (str): The first string
+         str2 (str): The second string
+
+     Returns:
+         tuple: A tuple containing the longest overlapping substring and its length
+
+     Examples:
+         >>> find_longest_overlap("hello world", "world of programming")
+         ('world', 5)
+         >>> find_longest_overlap("abcdefg", "defghi")
+         ('defg', 4)
+         >>> find_longest_overlap("python", "java")
+         ('', 0)
+         >>> find_longest_overlap("", "test")
+         ('', 0)
+         >>> find_longest_overlap("aabbcc", "ddeeff")
+         ('', 0)
+         >>> find_longest_overlap("programming", "PROGRAMMING")
+         ('', 0)
+     """
+     if not str1 or not str2:
+         return "", 0
+
+     # Dynamic-programming table of matching-substring lengths
+     m, n = len(str1), len(str2)
+     dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+     # Track the maximum length and its ending position in str1
+     max_length = 0
+     end_pos = 0
+
+     # Fill the dp table
+     for i in range(1, m + 1):
+         for j in range(1, n + 1):
+             if str1[i - 1] == str2[j - 1]:
+                 dp[i][j] = dp[i - 1][j - 1] + 1
+                 if dp[i][j] > max_length:
+                     max_length = dp[i][j]
+                     end_pos = i
+
+     # Extract the longest common substring
+     start_pos = end_pos - max_length
+     longest_substring = str1[start_pos:end_pos]
+
+     return longest_substring, max_length
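Together, score_match and score_text_overlap give a lenient comparator for free-text fields. A small sketch of the combination (values illustrative):

from linkml_store.inference.evaluation import score_match, score_text_overlap

# Exact equality short-circuits to 1.0 before the match_function is consulted.
print(score_match({"capital": "Montevideo"}, {"capital": "Montevideo"}))  # 1.0

# A near-miss earns partial credit: the longest common substring of
# "Montevido" and "Montevideo" is "Montevid" (8 chars), so 8 / 10 = 0.8.
print(score_match({"capital": "Montevido"}, {"capital": "Montevideo"}, match_function=score_text_overlap))  # 0.8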
linkml_store/inference/implementations/rag_inference_engine.py (new file)
@@ -0,0 +1,145 @@
+ import logging
+ from dataclasses import dataclass
+ from typing import Any, Optional
+
+ import yaml
+ from llm import get_key
+
+ from linkml_store.api.collection import OBJECT, Collection
+ from linkml_store.inference.inference_config import Inference, InferenceConfig, LLMConfig
+ from linkml_store.inference.inference_engine import InferenceEngine
+ from linkml_store.utils.object_utils import select_nested
+
+ logger = logging.getLogger(__name__)
+
+ SYSTEM_PROMPT = """
+ You are a {llm_config.role}. Your task is to infer the YAML
+ object output given the YAML object input. I will provide you
+ with a collection of examples that illustrate both the desired
+ structure of the response and the kind of content expected.
+
+ You should return ONLY valid YAML in your response.
+ """
+
+
+ # def select_object(obj: OBJECT, key_paths: List[str]) -> OBJECT:
+ #     return {k: object_path_get(obj, k, None) for k in key_paths}
+
+
+ @dataclass
+ class RAGInferenceEngine(InferenceEngine):
+     """
+     Retrieval Augmented Generation (RAG) based inference engine.
+
+     >>> from linkml_store.api.client import Client
+     >>> from linkml_store.utils.format_utils import Format
+     >>> from linkml_store.inference.inference_config import LLMConfig
+     >>> client = Client()
+     >>> db = client.attach_database("duckdb", alias="test")
+     >>> db.import_database("tests/input/countries/countries.jsonl", Format.JSONL, collection_name="countries")
+     >>> db.list_collection_names()
+     ['countries']
+     >>> collection = db.get_collection("countries")
+     >>> features = ["name"]
+     >>> targets = ["code", "capital", "continent", "languages"]
+     >>> llm_config = LLMConfig(model_name="gpt-4o-mini")
+     >>> config = InferenceConfig(target_attributes=targets, feature_attributes=features, llm_config=llm_config)
+     >>> ie = RAGInferenceEngine(config=config)
+     >>> ie.load_and_split_data(collection)
+     >>> ie.initialize_model()
+     >>> prediction = ie.derive({"name": "Uruguay"})
+     >>> prediction.predicted_object
+     {'capital': 'Montevideo', 'code': 'UY', 'continent': 'South America', 'languages': ['Spanish']}
+     """
+
+     classifier: Any = None
+     encoders: dict = None
+     _model: "llm.Model" = None  # noqa: F821
+
+     rag_collection: Collection = None
+
+     def __post_init__(self):
+         if not self.config:
+             self.config = InferenceConfig()
+         if not self.config.llm_config:
+             self.config.llm_config = LLMConfig()
+
+     @property
+     def model(self) -> "llm.Model":  # noqa: F821
+         import llm
+
+         if self._model is None:
+             self._model = llm.get_model(self.config.llm_config.model_name)
+             if self._model.needs_key:
+                 key = get_key(None, key_alias=self._model.needs_key)
+                 self._model.key = key
+
+         return self._model
+
+     def initialize_model(self, **kwargs):
+         rag_collection = self.training_data.collection
+         rag_collection.attach_indexer("llm", auto_index=False)
+         self.rag_collection = rag_collection
+
+     def object_to_text(self, object: OBJECT) -> str:
+         return yaml.dump(object)
+
+     def derive(self, object: OBJECT) -> Optional[Inference]:
+         import llm
+         from tiktoken import encoding_for_model
+
+         from linkml_store.utils.llm_utils import get_token_limit, render_formatted_text
+
+         model: llm.Model = self.model
+         model_name = self.config.llm_config.model_name
+         feature_attributes = self.config.feature_attributes
+         target_attributes = self.config.target_attributes
+         num_examples = self.config.llm_config.number_of_few_shot_examples or 5
+         query_text = self.object_to_text(object)
+         if not self.rag_collection.indexers:
+             raise ValueError("RAG collection must have an indexer attached")
+         rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm")
+         examples = rs.rows
+         if not examples:
+             raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")
+         prompt_clauses = []
+         for example in examples:
+             input_obj = select_nested(example, feature_attributes)
+             output_obj = select_nested(example, target_attributes)
+             prompt_clause = (
+                 "---\nExample:\n"
+                 f"## INPUT:\n{self.object_to_text(input_obj)}\n"
+                 f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
+             )
+             prompt_clauses.append(prompt_clause)
+         query_obj = select_nested(object, feature_attributes)
+         query_text = self.object_to_text(query_obj)
+         prompt_end = f"---\nQuery:\n## INPUT:\n{query_text}\n## OUTPUT:\n"
+         system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
+
+         def make_text(texts):
+             # render only the example clauses that fit the token budget
+             return "\n".join(texts) + prompt_end
+
+         try:
+             encoding = encoding_for_model(model_name)
+         except KeyError:
+             encoding = encoding_for_model("gpt-4")
+         token_limit = get_token_limit(model_name)
+         prompt = render_formatted_text(make_text, prompt_clauses, encoding, token_limit)
+         logger.info(f"Prompt: {prompt}")
+         response = model.prompt(prompt, system=system_prompt)
+         yaml_str = response.text()
+         logger.info(f"Response: {yaml_str}")
+         try:
+             predicted_object = yaml.safe_load(yaml_str)
+             return Inference(predicted_object=predicted_object)
+         except yaml.YAMLError as e:
+             logger.error(f"Error parsing response: {yaml_str}\n{e}")
+             return None
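For orientation, the few-shot prompt that derive() assembles has this shape (content illustrative, following the countries example in the docstring; each section is YAML rendered by object_to_text):

---
Example:
## INPUT:
name: France

## OUTPUT:
capital: Paris
code: FR
continent: Europe
languages:
- French

---
Query:
## INPUT:
name: Uruguay

## OUTPUT:

render_formatted_text (from the llm_utils module added in this release) presumably trims trailing example clauses when the full prompt would exceed the model's token limit.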
linkml_store/inference/implementations/rule_based_inference_engine.py (new file)
@@ -0,0 +1,169 @@
+ import logging
+ from copy import copy
+ from dataclasses import dataclass
+ from io import StringIO
+ from pathlib import Path
+ from typing import Any, ClassVar, Dict, List, Optional, Union
+
+ import yaml
+ from linkml_map.utils.eval_utils import eval_expr
+ from linkml_runtime import SchemaView
+ from linkml_runtime.linkml_model.meta import AnonymousClassExpression, ClassRule
+ from linkml_runtime.utils.formatutils import underscore
+ from pydantic import BaseModel
+
+ from linkml_store.api.collection import OBJECT, Collection
+ from linkml_store.inference.inference_config import Inference, InferenceConfig
+ from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization
+
+ logger = logging.getLogger(__name__)
+
+
+ def expression_matches(ce: AnonymousClassExpression, object: OBJECT) -> bool:
+     """
+     Check if a class expression matches an object.
+
+     :param ce: the class expression
+     :param object: the object to test
+     :return: True if the class expression matches the object
+     """
+     if ce.any_of:
+         if not any(expression_matches(subce, object) for subce in ce.any_of):
+             return False
+     if ce.all_of:
+         if not all(expression_matches(subce, object) for subce in ce.all_of):
+             return False
+     if ce.none_of:
+         if any(expression_matches(subce, object) for subce in ce.none_of):
+             return False
+     if ce.slot_conditions:
+         for slot in ce.slot_conditions.values():
+             slot_name = slot.name
+             v = object.get(slot_name, None)
+             if slot.equals_string is not None:
+                 if slot.equals_string != str(v):
+                     return False
+             if slot.equals_integer is not None:
+                 if slot.equals_integer != v:
+                     return False
+             if slot.equals_expression is not None:
+                 eval_v = eval_expr(slot.equals_expression, **object)
+                 if v != eval_v:
+                     return False
+     return True
+
+
+ def apply_rule(rule: ClassRule, object: OBJECT):
+     """
+     Apply a rule to an object.
+
+     Mutates (and returns) the object.
+
+     :param rule: the rule to apply
+     :param object: the object to apply the rule to
+     """
+     for condition in rule.preconditions:
+         if expression_matches(condition, object):
+             for postcondition in rule.postconditions:
+                 all_of = [x for x in postcondition.all_of] + [postcondition]
+                 for pc in all_of:
+                     sc = pc.slot_condition
+                     if sc:
+                         if sc.equals_string is not None:
+                             object[sc.name] = sc.equals_string
+                         if sc.equals_integer is not None:
+                             object[sc.name] = sc.equals_integer
+                         if sc.equals_expression:
+                             object[sc.name] = eval_expr(sc.equals_expression, **object)
+     return object
+
+
+
81
+ @dataclass
82
+ class RuleBasedInferenceEngine(InferenceEngine):
83
+ """
84
+ TODO
85
+
86
+ """
87
+
88
+ class_rules: Optional[List[ClassRule]] = None
89
+ slot_rules: Optional[Dict[str, List[ClassRule]]] = None
90
+ slot_expressions: Optional[Dict[str, str]] = None
91
+
92
+ PERSIST_COLS: ClassVar = ["config", "class_rules", "slot_rules", "slot_expressions"]
93
+
94
+ def initialize_model(self, **kwargs):
95
+ td = self.training_data
96
+ collection: Collection = td.collection
97
+ cd = collection.class_definition()
98
+ sv: SchemaView = collection.parent.schema_view
99
+ class_rules = cd.rules
100
+ if class_rules:
101
+ self.class_rules = class_rules
102
+ for slot in sv.class_induced_slots(cd.name):
103
+ if slot.equals_expression:
104
+ self.slot_expressions[slot.name] = slot.equals_expression
105
+
106
+ def derive(self, object: OBJECT) -> Optional[Inference]:
107
+ object = copy(object)
108
+ if self.class_rules:
109
+ for rule in self.class_rules:
110
+ apply_rule(rule, object)
111
+ object = {underscore(k): v for k, v in object.items()}
112
+ if self.slot_expressions:
113
+ for slot, expr in self.slot_expressions.items():
114
+ v = eval_expr(expr, **object)
115
+ if v is not None:
116
+ object[slot] = v
117
+ if self.config and self.config.target_attributes:
118
+ predicted_object = {k: object.get(k, None) for k in self.config.target_attributes}
119
+ else:
120
+ predicted_object = object
121
+ if all(v is None for v in predicted_object.values()):
122
+ return None
123
+ return Inference(predicted_object=predicted_object)
124
+
125
+ def import_model_from(self, inference_engine: InferenceEngine, **kwargs):
126
+ io = StringIO()
127
+ inference_engine.export_model(io, model_serialization=ModelSerialization.LINKML_EXPRESSION)
128
+ config = inference_engine.config
129
+ if len(config.target_attributes) != 1:
130
+ raise ValueError("Can only import models with a single target attribute")
131
+ target_attribute = config.target_attributes[0]
132
+ if self.slot_expressions is None:
133
+ self.slot_expressions = {}
134
+ self.slot_expressions[target_attribute] = io.getvalue()
135
+ if not self.config:
136
+ self.config = inference_engine.config
137
+
138
+ def save_model(self, output: Union[str, Path]) -> None:
139
+ """
140
+ Save the trained model and related data to a file.
141
+
142
+ :param output: Path to save the model
143
+ """
144
+
145
+ def _serialize_value(v: Any) -> Any:
146
+ if isinstance(v, BaseModel):
147
+ return v.model_dump(exclude_unset=True)
148
+ return v
149
+
150
+ model_data = {k: _serialize_value(getattr(self, k)) for k in self.PERSIST_COLS}
151
+ with open(output, "w", encoding="utf-8") as f:
152
+ yaml.dump(model_data, f)
153
+
154
+ @classmethod
155
+ def load_model(cls, file_path: Union[str, Path]) -> "RuleBasedInferenceEngine":
156
+ model_data = yaml.safe_load(open(file_path))
157
+
158
+ if model_data["config"]:
159
+ config = InferenceConfig(**model_data["config"])
160
+ else:
161
+ config = None
162
+ engine = cls(config=config)
163
+ for k, v in model_data.items():
164
+ if k == "config":
165
+ continue
166
+ setattr(engine, k, v)
167
+
168
+ logger.info(f"Model loaded from {file_path}")
169
+ return engine
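A minimal end-to-end sketch of this engine, using only the public surface shown above. The bmi expression is hypothetical and assumes eval_expr from linkml_map supports basic arithmetic:

from linkml_store.inference.inference_config import InferenceConfig
from linkml_store.inference.implementations.rule_based_inference_engine import (
    RuleBasedInferenceEngine,
)

# Configure an engine that derives a single target via a slot expression
engine = RuleBasedInferenceEngine(config=InferenceConfig(target_attributes=["bmi"]))
engine.slot_expressions = {"bmi": "weight_kg / (height_m * height_m)"}

result = engine.derive({"weight_kg": 70.0, "height_m": 1.75})
print(result.predicted_object)  # {'bmi': 22.857...}

# Round-trip the model through its YAML serialization
engine.save_model("bmi_model.yaml")
restored = RuleBasedInferenceEngine.load_model("bmi_model.yaml")
assert restored.slot_expressions == engine.slot_expressions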