linkml-store 0.1.14__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of linkml-store has been flagged as possibly problematic by the registry.
- {linkml_store-0.1.14 → linkml_store-0.2.0}/PKG-INFO +4 -1
- {linkml_store-0.1.14 → linkml_store-0.2.0}/README.md +3 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/pyproject.toml +1 -1
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +3 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/cli.py +24 -15
- linkml_store-0.2.0/src/linkml_store/inference/evaluation.py +189 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/inference/implementations/rag_inference_engine.py +13 -13
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/inference/implementations/rule_based_inference_engine.py +15 -4
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/inference/implementations/sklearn_inference_engine.py +19 -1
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/inference/inference_engine.py +44 -17
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/format_utils.py +6 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/llm_utils.py +1 -0
- linkml_store-0.2.0/src/linkml_store/utils/object_utils.py +182 -0
- linkml_store-0.1.14/src/linkml_store/utils/object_utils.py +0 -83
- {linkml_store-0.1.14 → linkml_store-0.2.0}/LICENSE +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/client.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/collection.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/config.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/queries.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/duckdb/duckdb_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/mongodb/mongodb_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/neo4j/neo4j_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/solr/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/types.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/constants.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/graphs/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/graphs/graph_map.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/index/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/index/implementations/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/index/implementations/llm_indexer.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/index/indexer.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/inference/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/inference/implementations/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/inference/inference_config.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/inference/inference_engine_registry.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/change_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/file_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/io.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/mongodb_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/neo4j_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/pandas_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/patch_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/query_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/schema_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/sklearn_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/sql_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/stats_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/webapi/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/webapi/html/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/webapi/html/base.html.j2 +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/webapi/main.py +0 -0
{linkml_store-0.1.14 → linkml_store-0.2.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: linkml-store
-Version: 0.1.14
+Version: 0.2.0
 Summary: linkml-store
 License: MIT
 Author: Author 1
@@ -70,6 +70,8 @@ common query, index, and storage operations.
 
 For full documentation, see [https://linkml.io/linkml-store/](https://linkml.io/linkml-store/)
 
+See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for a high level overview.
+
 __Warning__ LinkML-Store is still undergoing changes and refactoring,
 APIs and command line options are subject to change!
 
@@ -196,3 +198,4 @@ make app
 
 See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for more details
 
+
{linkml_store-0.1.14 → linkml_store-0.2.0}/README.md
@@ -7,6 +7,8 @@ common query, index, and storage operations.
 
 For full documentation, see [https://linkml.io/linkml-store/](https://linkml.io/linkml-store/)
 
+See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for a high level overview.
+
 __Warning__ LinkML-Store is still undergoing changes and refactoring,
 APIs and command line options are subject to change!
 
@@ -132,3 +134,4 @@ make app
 ## Background
 
 See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for more details
+
{linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/api/stores/duckdb/duckdb_collection.py
RENAMED
@@ -36,6 +36,9 @@ class DuckDBCollection(Collection):
         logger.info(f"Inserting into: {self.alias} // T={table.name}")
         engine = self.parent.engine
         col_names = [c.name for c in table.columns]
+        bad_objs = [obj for obj in objs if not isinstance(obj, dict)]
+        if bad_objs:
+            logger.error(f"Bad objects: {bad_objs}")
         objs = [{k: obj.get(k, None) for k in col_names} for obj in objs]
         with engine.connect() as conn:
             with conn.begin():
{linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/cli.py
@@ -7,6 +7,7 @@ from typing import Optional
 import click
 import yaml
 from linkml_runtime.dumpers import json_dumper
+from linkml_runtime.utils.formatutils import underscore
 from pydantic import BaseModel
 
 from linkml_store import Client
@@ -17,6 +18,7 @@ from linkml_store.index import get_indexer
 from linkml_store.index.implementations.simple_indexer import SimpleIndexer
 from linkml_store.index.indexer import Indexer
 from linkml_store.inference import get_inference_engine
+from linkml_store.inference.evaluation import evaluate_predictor, score_text_overlap
 from linkml_store.inference.inference_config import InferenceConfig
 from linkml_store.inference.inference_engine import ModelSerialization
 from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
@@ -130,7 +132,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
         logger.setLevel(logging.ERROR)
     ctx.ensure_object(dict)
     if input:
-        stem = Path(input).stem
+        stem = underscore(Path(input).stem)
         database = "duckdb"
         collection = stem
        config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}})
@@ -496,12 +498,14 @@ def describe(ctx, where, output_type, output, limit):
 @click.option(
     "--predictor-type", "-t", default="sklearn", show_default=True, type=click.STRING, help="Type of predictor"
 )
+@click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
 @click.option("--query", "-q", type=click.STRING, help="query term")
 @click.pass_context
 def infer(
     ctx,
     inference_config_file,
     query,
+    evaluation_count,
     training_test_data_split,
     predictor_type,
     target_attribute,
@@ -546,24 +550,24 @@ def infer(
     query_obj = None
     collection = ctx.obj["settings"].collection
     atts = collection.class_definition().attributes.keys()
+    if feature_attributes:
+        features = feature_attributes.split(",")
+        features = [f.strip() for f in features]
+    else:
+        if query_obj:
+            features = query_obj.keys()
+        else:
+            features = None
+    if target_attribute:
+        target_attributes = list(target_attribute)
+    else:
+        target_attributes = [att for att in atts if att not in features]
     if model_format:
         model_format = ModelSerialization(model_format)
     if load_model:
         predictor = get_inference_engine(predictor_type)
         predictor = type(predictor).load_model(load_model)
     else:
-        if feature_attributes:
-            features = feature_attributes.split(",")
-            features = [f.strip() for f in features]
-        else:
-            if query_obj:
-                features = query_obj.keys()
-            else:
-                features = None
-        if target_attribute:
-            target_attributes = list(target_attribute)
-        else:
-            target_attributes = [att for att in atts if att not in features]
         if inference_config_file:
             config = InferenceConfig.from_file(inference_config_file)
         else:
@@ -577,8 +581,13 @@ def infer(
         logger.info(f"Exporting model to {export_model} in {model_format}")
         predictor.export_model(export_model, model_format)
     if not query_obj:
-        if not export_model:
-            raise ValueError("Query must be specified if not exporting model")
+        if not export_model and not evaluation_count:
+            raise ValueError("Query or evaluate must be specified if not exporting model")
+        if evaluation_count:
+            outcome = evaluate_predictor(
+                predictor, target_attributes, evaluation_count=evaluation_count, match_function=score_text_overlap
+            )
+            print(f"Outcome: {outcome} // accuracy: {outcome.accuracy}")
     if query_obj:
         result = predictor.derive(query_obj)
         dumped_obj = result.model_dump(exclude_none=True)
linkml_store-0.2.0/src/linkml_store/inference/evaluation.py
@@ -0,0 +1,189 @@
+import logging
+from collections.abc import Callable
+from typing import Any, List, Optional
+
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel
+
+from linkml_store.inference import InferenceEngine
+from linkml_store.utils.object_utils import select_nested
+
+logger = logging.getLogger(__name__)
+
+
+def score_match(target: Optional[Any], candidate: Optional[Any], match_function: Optional[Callable] = None) -> float:
+    """
+    Compute a score for a match between two objects
+
+    >>> score_match("a", "a")
+    1.0
+    >>> score_match("a", "b")
+    0.0
+    >>> score_match("a", None)
+    0.0
+    >>> score_match(None, "a")
+    0.0
+    >>> score_match(None, None)
+    1.0
+    >>> score_match(["a", "b"], ["a", "b"])
+    1.0
+    >>> score_match(["a", "b"], ["b", "a"])
+    1.0
+    >>> round(score_match(["a"], ["b", "a"]), 2)
+    0.67
+    >>> score_match({"a": 1}, {"a": 1})
+    1.0
+    >>> score_match({"a": 1}, {"a": 2})
+    0.0
+    >>> score_match({"a": 1, "b": None}, {"a": 1})
+    1.0
+    >>> score_match([{"a": 1, "b": 2}, {"a": 3, "b": 4}], [{"a": 1, "b": 2}, {"a": 3, "b": 4}])
+    1.0
+    >>> score_match([{"a": 1, "b": 4}, {"a": 3, "b": 2}], [{"a": 1, "b": 2}, {"a": 3, "b": 4}])
+    0.5
+    >>> def char_match(x, y):
+    ...     return len(set(x).intersection(set(y))) / len(set(x).union(set(y)))
+    >>> score_match("abcd", "abc", char_match)
+    0.75
+    >>> score_match(["abcd", "efgh"], ["ac", "gh"], char_match)
+    0.5
+
+
+    :param target:
+    :param candidate:
+    :param match_function:
+    :return:
+    """
+    if target == candidate:
+        return 1.0
+    if target is None or candidate is None:
+        return 0.0
+    if isinstance(target, (set, list)) and isinstance(candidate, (set, list)):
+        # create an all by all matrix using numpy
+        # for each pair of elements, compute the score
+        # return the average score
+        score_matrix = np.array([[score_match(t, c, match_function) for c in candidate] for t in target])
+        best_matches0 = np.max(score_matrix, axis=0)
+        best_matches1 = np.max(score_matrix, axis=1)
+        return (np.sum(best_matches0) + np.sum(best_matches1)) / (len(target) + len(candidate))
+    if isinstance(target, dict) and isinstance(candidate, dict):
+        keys = set(target.keys()).union(candidate.keys())
+        scores = [score_match(target.get(k), candidate.get(k), match_function) for k in keys]
+        return np.mean(scores)
+    if match_function:
+        return match_function(target, candidate)
+    return 0.0
+
+
+class Outcome(BaseModel):
+    true_positive_count: float
+    total_count: int
+
+    @property
+    def accuracy(self) -> float:
+        return self.true_positive_count / self.total_count
+
+
+def evaluate_predictor(
+    predictor: InferenceEngine,
+    target_attributes: List[str],
+    feature_attributes: Optional[List[str]] = None,
+    test_data: pd.DataFrame = None,
+    evaluation_count: Optional[int] = 10,
+    match_function: Optional[Callable] = None,
+) -> Outcome:
+    """
+    Evaluate a predictor by comparing its predictions to the expected values in the testing data.
+
+    :param predictor:
+    :param target_attributes:
+    :param feature_attributes:
+    :param evaluation_count:
+    :return:
+    """
+    n = 0
+    tp = 0
+    if test_data is None:
+        test_data = predictor.testing_data.as_dataframe()
+    for row in test_data.to_dict(orient="records"):
+        expected_obj = select_nested(row, target_attributes)
+        if feature_attributes:
+            test_obj = {k: v for k, v in row.items() if k not in target_attributes}
+        else:
+            test_obj = row
+        result = predictor.derive(test_obj)
+        logger.info(f"Predicted: {result.predicted_object} Expected: {expected_obj}")
+        tp += score_match(result.predicted_object, expected_obj, match_function)
+        n += 1
+        if evaluation_count is not None and n >= evaluation_count:
+            break
+    return Outcome(true_positive_count=tp, total_count=n)
+
+
+def score_text_overlap(str1: Any, str2: Any) -> float:
+    """
+    Compute the overlap score between two strings.
+
+    :param str1:
+    :param str2:
+    :return:
+    """
+    if str1 == str2:
+        return 1.0
+    if not str1 or not str2:
+        return 0.0
+    overlap, length = find_longest_overlap(str1, str2)
+    return len(overlap) / max(len(str1), len(str2))
+
+
+def find_longest_overlap(str1: str, str2: str):
+    """
+    Find the longest overlapping substring between two strings.
+
+    Args:
+        str1 (str): The first string
+        str2 (str): The second string
+
+    Returns:
+        tuple: A tuple containing the longest overlapping substring and its length
+
+    Examples:
+        >>> find_longest_overlap("hello world", "world of programming")
+        ('world', 5)
+        >>> find_longest_overlap("abcdefg", "defghi")
+        ('defg', 4)
+        >>> find_longest_overlap("python", "java")
+        ('', 0)
+        >>> find_longest_overlap("", "test")
+        ('', 0)
+        >>> find_longest_overlap("aabbcc", "ddeeff")
+        ('', 0)
+        >>> find_longest_overlap("programming", "PROGRAMMING")
+        ('', 0)
+    """
+    if not str1 or not str2:
+        return "", 0
+
+    # Create a table to store lengths of matching substrings
+    m, n = len(str1), len(str2)
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+    # Variables to store the maximum length and ending position
+    max_length = 0
+    end_pos = 0
+
+    # Fill the dp table
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            if str1[i - 1] == str2[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1] + 1
+                if dp[i][j] > max_length:
+                    max_length = dp[i][j]
+                    end_pos = i
+
+    # Extract the longest common substring
+    start_pos = end_pos - max_length
+    longest_substring = str1[start_pos:end_pos]
+
+    return longest_substring, max_length
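The new `infer --evaluation-count` path in cli.py calls `evaluate_predictor` with `score_text_overlap` as the match function. Below is a minimal, self-contained sketch of calling the helper directly; the `EchoPredictor` class and the `name`/`occupation` columns are invented for illustration and are not part of this release:

    import pandas as pd

    from linkml_store.inference.evaluation import evaluate_predictor, score_text_overlap
    from linkml_store.inference.inference_config import Inference


    class EchoPredictor:
        """Toy stand-in for a trained engine: derive() always predicts the same value."""

        def derive(self, obj):
            return Inference(predicted_object={"occupation": "engineer"})


    # Hypothetical test rows; a real run would use predictor.testing_data populated
    # by load_and_split_data() instead of passing test_data explicitly.
    test_df = pd.DataFrame(
        [
            {"name": "a", "occupation": "engineer"},
            {"name": "b", "occupation": "teacher"},
        ]
    )
    outcome = evaluate_predictor(
        EchoPredictor(),
        ["occupation"],
        test_data=test_df,
        evaluation_count=2,
        match_function=score_text_overlap,
    )
    print(outcome.accuracy)  # exact matches score 1.0; partial text overlap scores fractionally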
{linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/inference/implementations/rag_inference_engine.py
@@ -8,6 +8,7 @@ from llm import get_key
 from linkml_store.api.collection import OBJECT, Collection
 from linkml_store.inference.inference_config import Inference, InferenceConfig, LLMConfig
 from linkml_store.inference.inference_engine import InferenceEngine
+from linkml_store.utils.object_utils import select_nested
 
 logger = logging.getLogger(__name__)
 
@@ -22,6 +23,11 @@ You should return ONLY valid YAML in your response.
 """
 
 
+# def select_object(obj: OBJECT, key_paths: List[str]) -> OBJECT:
+#     return {k: obj.get(k, None) for k in keys}
+#     return {k: object_path_get(obj, k, None) for k in key_paths}
+
+
 @dataclass
 class RAGInferenceEngine(InferenceEngine):
     """
@@ -75,16 +81,7 @@ class RAGInferenceEngine(InferenceEngine):
         return self._model
 
     def initialize_model(self, **kwargs):
-
-        s = td.slice
-        if not s[0] and not s[1]:
-            rag_collection = td.collection
-        else:
-            base_collection = td.collection
-            objs = base_collection.find({}, offset=s[0], limit=s[1] - s[0]).rows
-            db = base_collection.parent
-            rag_collection = db.get_collection(f"{base_collection.alias}__rag_{s[0]}_{s[1]}", create_if_not_exists=True)
-            rag_collection.insert(objs)
+        rag_collection = self.training_data.collection
         rag_collection.attach_indexer("llm", auto_index=False)
         self.rag_collection = rag_collection
 
@@ -111,15 +108,18 @@ class RAGInferenceEngine(InferenceEngine):
             raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")
         prompt_clauses = []
         for example in examples:
-            input_obj = {k: example.get(k, None) for k in feature_attributes}
-
+            # input_obj = {k: example.get(k, None) for k in feature_attributes}
+            input_obj = select_nested(example, feature_attributes)
+            # output_obj = {k: example.get(k, None) for k in target_attributes}
+            output_obj = select_nested(example, target_attributes)
             prompt_clause = (
                 "---\nExample:\n"
                 f"## INPUT:\n{self.object_to_text(input_obj)}\n"
                 f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
             )
            prompt_clauses.append(prompt_clause)
-        query_obj = {k: object.get(k, None) for k in feature_attributes}
+        # query_obj = {k: object.get(k, None) for k in feature_attributes}
+        query_obj = select_nested(object, feature_attributes)
         query_text = self.object_to_text(query_obj)
         prompt_end = "---\nQuery:\n" f"## INPUT:\n{query_text}\n" "## OUTPUT:\n"
         system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
{linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/inference/implementations/rule_based_inference_engine.py
@@ -13,7 +13,7 @@ from linkml_runtime.utils.formatutils import underscore
 from pydantic import BaseModel
 
 from linkml_store.api.collection import OBJECT, Collection
-from linkml_store.inference.inference_config import Inference
+from linkml_store.inference.inference_config import Inference, InferenceConfig
 from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization
 
 logger = logging.getLogger(__name__)
@@ -111,11 +111,16 @@ class RuleBasedInferenceEngine(InferenceEngine):
         object = {underscore(k): v for k, v in object.items()}
         if self.slot_expressions:
             for slot, expr in self.slot_expressions.items():
-                print(f"EVAL {object}")
                 v = eval_expr(expr, **object)
                 if v is not None:
                     object[slot] = v
-
+        if self.config and self.config.target_attributes:
+            predicted_object = {k: object.get(k, None) for k in self.config.target_attributes}
+        else:
+            predicted_object = object
+        if all(v is None for v in predicted_object.values()):
+            return None
+        return Inference(predicted_object=predicted_object)
 
     def import_model_from(self, inference_engine: InferenceEngine, **kwargs):
         io = StringIO()
@@ -127,6 +132,8 @@ class RuleBasedInferenceEngine(InferenceEngine):
         if self.slot_expressions is None:
             self.slot_expressions = {}
         self.slot_expressions[target_attribute] = io.getvalue()
+        if not self.config:
+            self.config = inference_engine.config
 
     def save_model(self, output: Union[str, Path]) -> None:
         """
@@ -148,7 +155,11 @@ class RuleBasedInferenceEngine(InferenceEngine):
     def load_model(cls, file_path: Union[str, Path]) -> "RuleBasedInferenceEngine":
         model_data = yaml.safe_load(open(file_path))
 
-
+        if model_data["config"]:
+            config = InferenceConfig(**model_data["config"])
+        else:
+            config = None
+        engine = cls(config=config)
         for k, v in model_data.items():
             if k == "config":
                 continue
{linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/inference/implementations/sklearn_inference_engine.py
@@ -174,6 +174,7 @@ class SklearnInferenceEngine(InferenceEngine):
             if col in self.encoders:
                 encoder = self.encoders[col]
                 if isinstance(encoder, OneHotEncoder):
+                    print(f"Encoding: {col} v={object[col]} df={new_X[[col]]} encoder={encoder}")
                     encoded = encoder.transform(new_X[[col]])
                     feature_names = encoder.get_feature_names_out([col])
                     for i, name in enumerate(feature_names):
@@ -216,7 +217,24 @@ class SklearnInferenceEngine(InferenceEngine):
         return Inference(predicted_object=predicted_object, confidence=self.confidence)
 
     def _normalize(self, object: OBJECT) -> OBJECT:
-
+        """
+        Normalize the input object to ensure it has all the expected attributes.
+
+        Also remove any numpy/pandas oddities
+
+        :param object:
+        :return:
+        """
+        np_map = {np.nan: None}
+
+        def _tr(x: Any):
+            # TODO: figure a more elegant way to do this
+            try:
+                return np_map.get(x, x)
+            except TypeError:
+                return x
+
+        return {k: _tr(object.get(k, None)) for k in self.config.feature_attributes}
 
     def export_model(
         self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
{linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/inference/inference_engine.py
@@ -1,4 +1,5 @@
 import logging
+import random
 from abc import ABC
 from dataclasses import dataclass
 from enum import Enum
@@ -6,7 +7,7 @@ from pathlib import Path
 from typing import Optional, TextIO, Tuple, Union
 
 import pandas as pd
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict
 
 from linkml_store.api.collection import OBJECT, Collection
 from linkml_store.inference.inference_config import Inference, InferenceConfig
@@ -59,9 +60,25 @@ class ModelSerialization(str, Enum):
 class CollectionSlice(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-
-
-
+    name: Optional[str] = None
+    base_collection: Optional[Collection] = None
+    # _dataframe: Optional[pd.DataFrame] = None
+    # slice: Tuple[Optional[int], Optional[int]] = Field(default=(None, None))
+    indices: Optional[Tuple[int, ...]] = None
+    _collection: Optional[Collection] = None
+
+    @property
+    def collection(self) -> Collection:
+        if not self._collection:
+            rows = self.base_collection.find({}, limit=-1).rows
+            # subset based on indices
+            subset = [rows[i] for i in self.indices]
+            db = self.base_collection.parent
+            subset_name = f"{self.base_collection.alias}__rag_{self.name}"
+            subset_collection = db.get_collection(subset_name, create_if_not_exists=True)
+            subset_collection.insert(subset)
+            self._collection = subset_collection
+        return self._collection
 
     def as_dataframe(self, flattened=False) -> pd.DataFrame:
         """
@@ -69,17 +86,11 @@ class CollectionSlice(BaseModel):
 
         :return:
         """
-
-
-            return
-        elif self.collection is not None:
-            rs = self.collection.find({}, offset=self.slice[0], limit=self.slice[1] - self.slice[0])
-            if flattened:
-                return nested_objects_to_dataframe(rs.rows)
-            else:
-                return rs.rows_dataframe
+        rs = self.collection.find({}, limit=-1)
+        if flattened:
+            return nested_objects_to_dataframe(rs.rows)
         else:
-
+            return rs.rows_dataframe
 
 
 @dataclass
@@ -96,7 +107,7 @@ class InferenceEngine(ABC):
     training_data: Optional[CollectionSlice] = None
     testing_data: Optional[CollectionSlice] = None
 
-    def load_and_split_data(self, collection: Collection, split: Optional[Tuple[float, float]] = None):
+    def load_and_split_data(self, collection: Collection, split: Optional[Tuple[float, float]] = None, randomize=True):
         """
         Load the data and split it into training and testing sets.
 
@@ -109,8 +120,24 @@ class InferenceEngine(ABC):
             split = (0.7, 0.3)
         logger.info(f"Loading and splitting data from collection {collection.alias}")
         size = collection.size()
-
-
+        indices = range(size)
+        if randomize:
+            train_indices = random.sample(indices, int(size * split[0]))
+            test_indices = set(indices) - set(train_indices)
+        else:
+            train_indices = indices[: int(size * split[0])]
+            test_indices = indices[int(size * split[0]) :]
+        self.training_data = CollectionSlice(name="train", base_collection=collection, indices=train_indices)
+        self.testing_data = CollectionSlice(name="test", base_collection=collection, indices=test_indices)
+        # all_data = collection.find({}, limit=size).rows
+        # all_data_df = nested_objects_to_dataframe(all_data)
+        # all_data_df = collection.find({}, limit=size).rows_dataframe
+        # randomize/shuffle order of rows in dataframe
+        # all_data_df = all_data_df.sample(frac=1).reset_index(drop=True)
+        # self.training_data = CollectionSlice(dataframe=all_data_df[: int(size * split[0])])
+        # self.testing_data = CollectionSlice(dataframe=all_data_df[int(size * split[0]) : size])
+        # self.training_data = CollectionSlice(base_collection=collection, slice=(0, int(size * split[0])))
+        # self.testing_data = CollectionSlice(base_collection=collection, slice=(int(size * split[0]), size))
 
     def initialize_model(self, **kwargs):
         """
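Pulling out just the index-splitting logic from `load_and_split_data` above makes the new `randomize` behaviour easy to see; this is a plain-Python illustration of the same expressions with a toy size, not an additional API:

    import random

    size, split = 10, (0.7, 0.3)
    indices = range(size)
    # randomize=True branch: sample 70% of the row indices for training, the rest for testing
    train_indices = random.sample(indices, int(size * split[0]))
    test_indices = set(indices) - set(train_indices)
    print(sorted(train_indices), sorted(test_indices))  # 7 train indices, 3 test indices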
{linkml_store-0.1.14 → linkml_store-0.2.0}/src/linkml_store/utils/format_utils.py
@@ -47,6 +47,7 @@ class Format(Enum):
         ".jsonl": cls.JSONL,
         ".yaml": cls.YAML,
         ".yml": cls.YAML,
+        ".yamll": cls.YAMLL,
         ".tsv": cls.TSV,
         ".csv": cls.CSV,
         ".py": cls.PYTHON,
@@ -98,6 +99,9 @@ def process_file(
     """
     Process a single file and return a list of objects.
     """
+    if format == Format.YAMLL:
+        format = Format.YAML
+        expected_type = list
     if format == Format.JSON:
         objs = json.load(f)
     elif format == Format.JSONL:
@@ -105,6 +109,8 @@ def process_file(
     elif format == Format.YAML:
         if expected_type and expected_type == list:  # noqa E721
             objs = list(yaml.safe_load_all(f))
+            # allow YAML with a `---` with no object before it
+            objs = [obj for obj in objs if obj is not None]
         else:
             objs = yaml.safe_load(f)
     elif format in [Format.TSV, Format.CSV]:
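The new `.yamll` suffix is handled by re-using the multi-document YAML path and then dropping empty documents, so a file that starts with a bare `---` still loads cleanly. A small sketch of the same parsing behaviour using PyYAML directly (the example document contents are arbitrary):

    import io

    import yaml

    # Multi-document stream with a leading "---" and no object before it;
    # the empty first document parses as None and is filtered out, as in process_file above.
    text = "---\n---\nname: a\n---\nname: b\n"
    objs = [obj for obj in yaml.safe_load_all(io.StringIO(text)) if obj is not None]
    print(objs)  # [{'name': 'a'}, {'name': 'b'}]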
linkml_store-0.2.0/src/linkml_store/utils/object_utils.py
@@ -0,0 +1,182 @@
+import json
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Union
+
+from pydantic import BaseModel
+
+
+def object_path_update(
+    obj: Union[BaseModel, Dict[str, Any]], path: str, value: Any
+) -> Union[BaseModel, Dict[str, Any]]:
+    """
+    Updates a nested object based on a path description and a value. The path to the
+    desired field is given in dot and bracket notation (e.g., 'a[0].b.c[1]').
+
+    :param obj: The dictionary object to be updated.
+    :type obj: Dict[str, Any]
+    :param path: The path string indicating where to place the value within the object.
+    :type path: str
+    :param value: The value to be set at the specified path.
+    :type value: Any
+    :return: None. This function modifies the object in-place.
+    :rtype: None
+
+    **Example**::
+
+        >>> data = {}
+        >>> object_path_update(data, 'persons[0].foo.bar', 1)
+        {'persons': [{'foo': {'bar': 1}}]}
+    """
+    if isinstance(obj, BaseModel):
+        typ = type(obj)
+        obj = obj.model_dump(exclude_none=True)
+        obj = object_path_update(obj, path, value)
+        return typ(**obj)
+    obj = deepcopy(obj)
+    ret_obj = obj
+    parts = path.split(".")
+    for part in parts[:-1]:
+        if "[" in part:
+            key, index = part[:-1].split("[")
+            index = int(index)
+            # obj = obj.setdefault(key, [{} for _ in range(index+1)])
+            obj = obj.setdefault(key, [])
+            while len(obj) <= index:
+                obj.append({})
+            obj = obj[index]
+        else:
+            if part in obj and obj[part] is None:
+                del obj[part]
+            obj = obj.setdefault(part, {})
+    last_part = parts[-1]
+    if "[" in last_part:
+        key, index = last_part[:-1].split("[")
+        index = int(index)
+        if key not in obj or not isinstance(obj[key], list):
+            obj[key] = [{} for _ in range(index + 1)]
+        obj[key][index] = value
+    else:
+        obj[last_part] = value
+    return ret_obj
+
+
+def object_path_get(obj: Union[BaseModel, Dict[str, Any]], path: str, default_value=None) -> Any:
+    """
+    Retrieves a value from a nested object based on a path description. The path to the
+    desired field is given in dot and bracket notation (e.g., 'a[0].b.c[1]').
+
+    :param obj: The dictionary object to be updated.
+    :type obj: Dict[str, Any]
+    :param path: The path string indicating where to place the value within the object.
+    :type path: str
+    :return: The value at the specified path.
+    :rtype: Any
+
+    **Example**::
+
+        >>> data = {'persons': [{'foo': {'bar': 1}}]}
+        >>> object_path_get(data, 'persons[0].foo.bar')
+        1
+        >>> object_path_get(data, 'persons[0].foo')
+        {'bar': 1}
+        >>> object_path_get({}, 'not there', "NA")
+        'NA'
+    """
+    if isinstance(obj, BaseModel):
+        obj = obj.dict()
+    parts = path.split(".")
+    for part in parts:
+        if "[" in part:
+            key, index = part[:-1].split("[")
+            index = int(index)
+            obj = obj[key][index]
+        else:
+            obj = obj.get(part, default_value)
+    return obj
+
+
+def parse_update_expression(expr: str) -> Union[tuple[str, Any], None]:
+    """
+    Parse a string expression of the form 'path.to.field=value' into a path and a value.
+
+    :param expr:
+    :return:
+    """
+    try:
+        path, val = expr.split("=", 1)
+        val = json.loads(val)
+    except ValueError:
+        return None
+    return path, val
+
+
+def clean_empties(value: Union[Dict, List]) -> Any:
+    if isinstance(value, dict):
+        value = {k: v for k, v in ((k, clean_empties(v)) for k, v in value.items()) if v is not None}
+    elif isinstance(value, list):
+        value = [v for v in (clean_empties(v) for v in value) if v is not None]
+    return value
+
+
+def select_nested(data: dict, paths: List[Union[str, List[str]]], current_path=None) -> Optional[dict]:
+    """
+    Select nested attributes from a complex dictionary based on selector strings.
+
+    Args:
+        data (dict): The input nested dictionary.
+        selectors (list): A list of selector strings.
+
+    Returns:
+        dict: A new dictionary with the same structure, but only the selected attributes.
+
+    Example:
+        >>> data = {
+        ...     "person": {
+        ...         "name": "John Doe",
+        ...         "age": 30,
+        ...         "address": {
+        ...             "street": "123 Main St",
+        ...             "city": "Anytown",
+        ...             "country": "USA"
+        ...         },
+        ...         "phones": [
+        ...             {"type": "home", "number": "555-1234"},
+        ...             {"type": "work", "number": "555-5678"}
+        ...         ]
+        ...     },
+        ...     "company": {
+        ...         "name": "Acme Inc",
+        ...         "location": "New York"
+        ...     }
+        ... }
+        >>> select_nested(data, ["person.address.street", "person.address.city"])
+        {'person': {'address': {'street': '123 Main St', 'city': 'Anytown'}}}
+        >>> select_nested(data, ["person.phones.number", "person.phones.type"])
+        {'person': {'phones': [{'type': 'home', 'number': '555-1234'}, {'type': 'work', 'number': '555-5678'}]}}
+        >>> select_nested(data, ["person"])
+        {'person': {'name': 'John Doe', 'age': 30, 'address': {'street': '123 Main St', 'city': 'Anytown',
+        'country': 'USA'}, 'phones': [{'type': 'home', 'number': '555-1234'}, {'type': 'work', 'number': '555-5678'}]}}
+        >>> select_nested(data, ["person.phones.type"])
+        {'person': {'phones': [{'type': 'home'}, {'type': 'work'}]}}
+    """
+    if current_path is None:
+        current_path = []
+    matching_paths = []
+    for path in paths:
+        if isinstance(path, str):
+            path = path.split(".")
+        if path == current_path:
+            return data
+        if path[: len(current_path)] == current_path:
+            matching_paths.append(path)
+    if not matching_paths:
+        return None
+    if isinstance(data, dict):
+        new_obj = {k: select_nested(v, matching_paths, current_path + [k]) for k, v in data.items()}
+        new_obj = {k: v for k, v in new_obj.items() if v is not None}
+        return new_obj
+    if isinstance(data, list):
+        new_obj = [select_nested(v, matching_paths, current_path + []) for i, v in enumerate(data)]
+        new_obj = [v for v in new_obj if v is not None]
+        return new_obj
+    return data
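These helpers are what the inference changes above lean on (`select_nested` is now used by both the RAG engine and `evaluate_predictor`). A short usage sketch that follows directly from the doctests; the example paths and values are illustrative only:

    from linkml_store.utils.object_utils import object_path_get, object_path_update, select_nested

    doc = object_path_update({}, "persons[0].name", "Alice")
    doc = object_path_update(doc, "persons[0].address.city", "Anytown")
    print(doc)  # {'persons': [{'name': 'Alice', 'address': {'city': 'Anytown'}}]}
    print(object_path_get(doc, "persons[0].address.city"))  # Anytown
    print(select_nested(doc, ["persons.name"]))  # {'persons': [{'name': 'Alice'}]}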
linkml_store-0.1.14/src/linkml_store/utils/object_utils.py
@@ -1,83 +0,0 @@
-import json
-from copy import deepcopy
-from typing import Any, Dict, List, Union
-
-from pydantic import BaseModel
-
-
-def object_path_update(
-    obj: Union[BaseModel, Dict[str, Any]], path: str, value: Any
-) -> Union[BaseModel, Dict[str, Any]]:
-    """
-    Updates a nested object based on a path description and a value. The path to the
-    desired field is given in dot and bracket notation (e.g., 'a[0].b.c[1]').
-
-    :param obj: The dictionary object to be updated.
-    :type obj: Dict[str, Any]
-    :param path: The path string indicating where to place the value within the object.
-    :type path: str
-    :param value: The value to be set at the specified path.
-    :type value: Any
-    :return: None. This function modifies the object in-place.
-    :rtype: None
-
-    **Example**::
-
-        >>> data = {}
-        >>> object_path_update(data, 'persons[0].foo.bar', 1)
-        {'persons': [{'foo': {'bar': 1}}]}
-    """
-    if isinstance(obj, BaseModel):
-        typ = type(obj)
-        obj = obj.model_dump(exclude_none=True)
-        obj = object_path_update(obj, path, value)
-        return typ(**obj)
-    obj = deepcopy(obj)
-    ret_obj = obj
-    parts = path.split(".")
-    for part in parts[:-1]:
-        if "[" in part:
-            key, index = part[:-1].split("[")
-            index = int(index)
-            # obj = obj.setdefault(key, [{} for _ in range(index+1)])
-            obj = obj.setdefault(key, [])
-            while len(obj) <= index:
-                obj.append({})
-            obj = obj[index]
-        else:
-            if part in obj and obj[part] is None:
-                del obj[part]
-            obj = obj.setdefault(part, {})
-    last_part = parts[-1]
-    if "[" in last_part:
-        key, index = last_part[:-1].split("[")
-        index = int(index)
-        if key not in obj or not isinstance(obj[key], list):
-            obj[key] = [{} for _ in range(index + 1)]
-        obj[key][index] = value
-    else:
-        obj[last_part] = value
-    return ret_obj
-
-
-def parse_update_expression(expr: str) -> Union[tuple[str, Any], None]:
-    """
-    Parse a string expression of the form 'path.to.field=value' into a path and a value.
-
-    :param expr:
-    :return:
-    """
-    try:
-        path, val = expr.split("=", 1)
-        val = json.loads(val)
-    except ValueError:
-        return None
-    return path, val
-
-
-def clean_empties(value: Union[Dict, List]) -> Any:
-    if isinstance(value, dict):
-        value = {k: v for k, v in ((k, clean_empties(v)) for k, v in value.items()) if v is not None}
-    elif isinstance(value, list):
-        value = [v for v in (clean_empties(v) for v in value) if v is not None]
-    return value
All other files are unchanged between linkml_store-0.1.14 and linkml_store-0.2.0 (see the +0 -0 entries in the file list above).