PyPI - linkml-store - Versions diffs - 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl - Mend

linkml-store 0.2.0py3-none-any.whl → 0.2.2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of linkml-store might be problematic. Click here for more details.

Files changed (19) hide show

linkml_store/api/collection.py +50 -6
linkml_store/api/database.py +7 -1
linkml_store/api/queries.py +3 -1
linkml_store/api/stores/duckdb/duckdb_collection.py +5 -2
linkml_store/cli.py +58 -13
linkml_store/index/implementations/llm_indexer.py +20 -2
linkml_store/index/indexer.py +70 -16
linkml_store/inference/evaluation.py +9 -3
linkml_store/inference/implementations/rag_inference_engine.py +151 -34
linkml_store/inference/implementations/sklearn_inference_engine.py +1 -1
linkml_store/inference/inference_config.py +5 -2
linkml_store/inference/inference_engine.py +20 -13
linkml_store/utils/llm_utils.py +1 -0
linkml_store/utils/vector_utils.py +165 -0
{linkml_store-0.2.0.dist-info → linkml_store-0.2.2.dist-info}/METADATA +6 -1
{linkml_store-0.2.0.dist-info → linkml_store-0.2.2.dist-info}/RECORD +19 -18
{linkml_store-0.2.0.dist-info → linkml_store-0.2.2.dist-info}/LICENSE +0 -0
{linkml_store-0.2.0.dist-info → linkml_store-0.2.2.dist-info}/WHEEL +0 -0
{linkml_store-0.2.0.dist-info → linkml_store-0.2.2.dist-info}/entry_points.txt +0 -0

linkml_store/api/collection.py CHANGED Viewed

@@ -226,6 +226,18 @@ class Collection(Generic[DatabaseType]):
         self._initialized = True
         patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
         self._broadcast(patches, **kwargs)
+        self._post_modification_hook(**kwargs)
+    def _post_delete_hook(self, **kwargs):
+        self._post_modification_hook(**kwargs)
+    def _post_modification_hook(self, **kwargs):
+        for indexer in self.indexers.values():
+            ix_collection_name = self.get_index_collection_name(indexer)
+            ix_collection = self.parent.get_collection(ix_collection_name)
+            # Currently updating the source triggers complete reindexing
+            # TODO: make this more efficient by only deleting modified
+            ix_collection.delete_where({})
     def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
         """
@@ -458,6 +470,7 @@ class Collection(Generic[DatabaseType]):
         where: Optional[Any] = None,
         index_name: Optional[str] = None,
         limit: Optional[int] = None,
+        mmr_relevance_factor: Optional[float] = None,
         **kwargs,
     ) -> QueryResult:
         """
@@ -476,7 +489,7 @@ class Collection(Generic[DatabaseType]):
         Now let's index, using the simple trigram-based index
         >>> index = get_indexer("simple")
-        >>> collection.attach_indexer(index)
+        >>> _ = collection.attach_indexer(index)
         Now let's find all objects:
@@ -514,12 +527,15 @@ class Collection(Generic[DatabaseType]):
         if ix_coll.size() == 0:
             logger.info(f"Index {index_name} is empty; indexing all objects")
             all_objs = self.find(limit=-1).rows
-            self.index_objects(all_objs, index_name, replace=True, **kwargs)
+            if all_objs:
+                # print(f"Index {index_name} is empty; indexing all objects {len(all_objs)}")
+                self.index_objects(all_objs, index_name, replace=True, **kwargs)
+                assert ix_coll.size() > 0
         qr = ix_coll.find(where=where, limit=-1, **kwargs)
         index_col = ix.index_field
         # TODO: optimize this for large indexes
         vector_pairs = [(row, np.array(row[index_col], dtype=float)) for row in qr.rows]
-        results = ix.search(query, vector_pairs, limit=limit)
+        results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
         for r in results:
             del r[1][index_col]
         new_qr = QueryResult(num_rows=len(results))
@@ -648,7 +664,31 @@ class Collection(Generic[DatabaseType]):
         """
         return self.find({}, limit=1).num_rows
-    def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs):
+    def rows_iter(self) -> Iterable[OBJECT]:
+        """
+        Return an iterator over the objects in the collection.
+        :return:
+        """
+        yield from self.find({}, limit=-1).rows
+    def rows(self) -> List[OBJECT]:
+        """
+        Return a list of objects in the collection.
+        :return:
+        """
+        return list(self.rows_iter())
+    def ranked_rows(self) -> List[Tuple[float, OBJECT]]:
+        """
+        Return a list of objects in the collection, with scores.
+        """
+        return [(n, obj) for n, obj in enumerate(self.rows_iter())]
+    def attach_indexer(
+        self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs
+    ) -> Indexer:
         """
         Attach an index to the collection.
@@ -669,8 +709,8 @@ class Collection(Generic[DatabaseType]):
         >>> full_index.name = "full"
         >>> name_index = get_indexer("simple", text_template="{name}")
         >>> name_index.name = "name"
-        >>> collection.attach_indexer(full_index)
-        >>> collection.attach_indexer(name_index)
+        >>> _ = collection.attach_indexer(full_index)
+        >>> _ = collection.attach_indexer(name_index)
         Now let's find objects using the full index, using the string "France".
         We expect the country France to be the top hit, but the score will
@@ -713,6 +753,10 @@ class Collection(Generic[DatabaseType]):
             all_objs = self.find(limit=-1).rows
             logger.info(f"Auto-indexing {len(all_objs)} objects")
             self.index_objects(all_objs, index_name, replace=True, **kwargs)
+        return index
+    def get_index_collection_name(self, indexer: Indexer) -> str:
+        return self._index_collection_name(indexer.name)
     def _index_collection_name(self, index_name: str) -> str:
         """

linkml_store/api/database.py CHANGED Viewed

@@ -268,7 +268,7 @@ class Database(ABC, Generic[CollectionType]):
         metadata: Optional[CollectionConfig] = None,
         recreate_if_exists=False,
         **kwargs,
-    ) -> CollectionType:
+    ) -> Collection:
         """
         Create a new collection in the current database.
@@ -760,6 +760,12 @@ class Database(ABC, Generic[CollectionType]):
         """
         Export a database to a file or location.
+        >>> from linkml_store.api.client import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> db.import_database("tests/input/iris.csv", Format.CSV, collection_name="iris")
+        >>> db.export_database("/tmp/iris.yaml", Format.YAML)
         :param location: location of the file
         :param target_format: target format
         :param kwargs: additional arguments

linkml_store/api/queries.py CHANGED Viewed

@@ -40,7 +40,9 @@ class FacetCountResult(BaseModel):
 class QueryResult(BaseModel):
     """
-    A query result
+    A query result.
+    TODO: make this a subclass of Collection
     """
     query: Optional[Query] = None

linkml_store/api/stores/duckdb/duckdb_collection.py CHANGED Viewed

@@ -50,8 +50,9 @@ class DuckDBCollection(Collection):
         if not isinstance(objs, list):
             objs = [objs]
         cd = self.class_definition()
-        if not cd:
+        if not cd or not cd.attributes:
             cd = self.induce_class_definition_from_objects(objs)
+        assert cd.attributes
         table = self._sqla_table(cd)
         engine = self.parent.engine
         with engine.connect() as conn:
@@ -61,7 +62,8 @@ class DuckDBCollection(Collection):
                 stmt = stmt.compile(engine)
                 conn.execute(stmt)
                 conn.commit()
-        return
+        self._post_delete_hook()
+        return None
     def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
         logger.info(f"Deleting from {self.target_class_name} where: {where}")
@@ -87,6 +89,7 @@ class DuckDBCollection(Collection):
             if deleted_rows_count == 0 and not missing_ok:
                 raise ValueError(f"No rows found for {where}")
             conn.commit()
+            self._post_delete_hook()
             return deleted_rows_count if deleted_rows_count > -1 else None
     def query_facets(

linkml_store/cli.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import logging
 import sys
 import warnings
+from collections import defaultdict
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Tuple, Any
 import click
 import yaml
@@ -76,6 +77,8 @@ class ContextSettings(BaseModel):
         if name is None:
             # if len(self.database.list_collections()) > 1:
             #    raise ValueError("Collection must be specified if there are multiple collections.")
+            if not self.database:
+                return None
             if not self.database.list_collections():
                 return None
             name = list(self.database.list_collections())[0]
@@ -218,7 +221,10 @@ def insert(ctx, files, object, format):
 @click.option("--object", "-i", multiple=True, help="Input object as YAML")
 @click.pass_context
 def store(ctx, files, object, format):
-    """Store objects from files (JSON, YAML, TSV) into the specified collection."""
+    """Store objects from files (JSON, YAML, TSV) into the database.
+    Note: this is similar to insert, but a collection does not need to be specified
+    """
     settings = ctx.obj["settings"]
     db = settings.database
     if not files and not object:
@@ -410,14 +416,6 @@ def list_collections(ctx, **kwargs):
 def fq(ctx, where, limit, columns, output_type, wide, output):
     """
     Query facets from the specified collection.
-    :param ctx:
-    :param where:
-    :param limit:
-    :param columns:
-    :param output_type:
-    :param output:
-    :return:
     """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None
@@ -483,6 +481,41 @@ def describe(ctx, where, output_type, output, limit):
     write_output(df.describe(include="all").transpose(), output_type, target=output)
+@cli.command()
+@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
+@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
+@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--index", "-I", help="Attributes to index on in pivot")
+@click.option("--columns", "-A", help="Attributes to use as columns in pivot")
+@click.option("--values", "-V", help="Attributes to use as values in pivot")
+@click.pass_context
+def pivot(ctx, where, limit, index, columns, values, output_type, output):
+    collection = ctx.obj["settings"].collection
+    where_clause = yaml.safe_load(where) if where else None
+    column_atts = columns.split(",") if columns else None
+    value_atts = values.split(",") if values else None
+    index_atts = index.split(",") if index else None
+    results = collection.find(where_clause, limit=limit)
+    pivoted = defaultdict(dict)
+    for row in results.rows:
+        index_key = tuple([row.get(att) for att in index_atts])
+        column_key = tuple([row.get(att) for att in column_atts])
+        value_key = tuple([row.get(att) for att in value_atts])
+        pivoted[index_key][column_key] = value_key
+    pivoted_objs = []
+    def detuple(t: Tuple) -> Any:
+        if len(t) == 1:
+            return t[0]
+        return str(t)
+    for index_key, data in pivoted.items():
+        obj = {att: key for att, key in zip(index_atts, index_key)}
+        for column_key, value_key in data.items():
+            obj[detuple(column_key)] = detuple(value_key)
+        pivoted_objs.append(obj)
+    write_output(pivoted_objs, output_type, target=output)
 @cli.command()
 @click.option("--output-type", "-O", type=format_choice, default=Format.YAML.value, help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
@@ -499,6 +532,7 @@ def describe(ctx, where, output_type, output, limit):
     "--predictor-type", "-t", default="sklearn", show_default=True, type=click.STRING, help="Type of predictor"
 )
 @click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
+@click.option("--evaluation-match-function", help="Name of function to use for matching objects in eval")
 @click.option("--query", "-q", type=click.STRING, help="query term")
 @click.pass_context
 def infer(
@@ -506,6 +540,7 @@ def infer(
     inference_config_file,
     query,
     evaluation_count,
+    evaluation_match_function,
     training_test_data_split,
     predictor_type,
     target_attribute,
@@ -549,7 +584,10 @@ def infer(
     else:
         query_obj = None
     collection = ctx.obj["settings"].collection
-    atts = collection.class_definition().attributes.keys()
+    if collection:
+        atts = collection.class_definition().attributes.keys()
+    else:
+        atts = []
     if feature_attributes:
         features = feature_attributes.split(",")
         features = [f.strip() for f in features]
@@ -575,7 +613,8 @@ def infer(
         if training_test_data_split:
             config.train_test_split = training_test_data_split
         predictor = get_inference_engine(predictor_type, config=config)
-        predictor.load_and_split_data(collection)
+        if collection:
+            predictor.load_and_split_data(collection)
         predictor.initialize_model()
     if export_model:
         logger.info(f"Exporting model to {export_model} in {model_format}")
@@ -584,8 +623,14 @@ def infer(
         if not export_model and not evaluation_count:
             raise ValueError("Query or evaluate must be specified if not exporting model")
     if evaluation_count:
+        if evaluation_match_function == "score_text_overlap":
+            match_function_fn = score_text_overlap
+        elif evaluation_match_function is not None:
+            raise ValueError(f"Unknown match function: {evaluation_match_function}")
+        else:
+            match_function_fn = None
         outcome = evaluate_predictor(
-            predictor, target_attributes, evaluation_count=evaluation_count, match_function=score_text_overlap
+            predictor, target_attributes, evaluation_count=evaluation_count, match_function=match_function_fn
         )
         print(f"Outcome: {outcome} // accuracy: {outcome.accuracy}")
     if query_obj:

linkml_store/index/implementations/llm_indexer.py CHANGED Viewed

@@ -1,11 +1,13 @@
 import logging
 from pathlib import Path
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Optional
 import numpy as np
+from tiktoken import encoding_for_model
 from linkml_store.api.config import CollectionConfig
 from linkml_store.index.indexer import INDEX_ITEM, Indexer
+from linkml_store.utils.llm_utils import get_token_limit, render_formatted_text
 if TYPE_CHECKING:
     import llm
@@ -29,6 +31,7 @@ class LLMIndexer(Indexer):
     cached_embeddings_database: str = None
     cached_embeddings_collection: str = None
     cache_queries: bool = False
+    truncation_method: Optional[str] = None
     @property
     def embedding_model(self):
@@ -62,6 +65,21 @@ class LLMIndexer(Indexer):
         """
         logging.info(f"Converting {len(texts)} texts to vectors")
         model = self.embedding_model
+        token_limit = get_token_limit(model.model_id)
+        encoding = encoding_for_model("gpt-4o")
+        def truncate_text(text: str) -> str:
+            # split into chunks of 1000 chars:
+            parts = [text[i : i + 1000] for i in range(0, len(text), 1000)]
+            return render_formatted_text(
+                lambda x: "".join(x),
+                parts,
+                encoding,
+                token_limit,
+            )
+        texts = [truncate_text(text) for text in texts]
         if self.cached_embeddings_database and (cache is None or cache or self.cache_queries):
             model_id = model.model_id
             if not model_id:
@@ -88,7 +106,7 @@ class LLMIndexer(Indexer):
                 embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
             else:
                 embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
-            texts = list(texts)
             embeddings = list([None] * len(texts))
             uncached_texts = []
             n = 0

linkml_store/index/indexer.py CHANGED Viewed

@@ -3,6 +3,7 @@ from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple
 import numpy as np
+from linkml_store.utils.vector_utils import pairwise_cosine_similarity, mmr_diversified_search
 from pydantic import BaseModel
 INDEX_ITEM = np.ndarray
@@ -19,23 +20,57 @@ class TemplateSyntaxEnum(str, Enum):
     fstring = "fstring"
-def cosine_similarity(vector1, vector2) -> float:
+class Indexer(BaseModel):
     """
-    Calculate the cosine similarity between two vectors
+    An indexer operates on a collection in order to search for objects.
-    :param vector1:
-    :param vector2:
-    :return:
-    """
-    dot_product = np.dot(vector1, vector2)
-    norm1 = np.linalg.norm(vector1)
-    norm2 = np.linalg.norm(vector2)
-    return dot_product / (norm1 * norm2)
+    You should use a subclass of this; this can be looked up dynamically:
+    >>> from linkml_store.index import get_indexer
+    >>> indexer = get_indexer("simple")
+    You can customize how objects are indexed by passing in a text template.
+    For example, if your collection has objects with "name" and "profession" attributes,
+    you can index them as "{name} {profession}".
+    >>> indexer = get_indexer("simple", text_template="{name} :: {profession}")
+    By default, python fstrings are assumed.
+    We can test this works using the :ref:`object_to_text` method (normally
+    you would never need to call this directly, but it's useful for testing):
+    >>> obj = {"name": "John", "profession": "doctor"}
+    >>> indexer.object_to_text(obj)
+    'John :: doctor'
+    You can also use Jinja2 templates; this gives more flexibility and logic,
+    e.g. conditional formatting:
+    >>> tmpl = "{{name}}{% if profession %} :: {{profession}}{% endif %}"
+    >>> indexer = get_indexer("simple", text_template=tmpl, text_template_syntax=TemplateSyntaxEnum.jinja2)
+    >>> indexer.object_to_text(obj)
+    'John :: doctor'
+    >>> indexer.object_to_text({"name": "John"})
+    'John'
+    You can also specify which attributes to index:
+    >>> indexer = get_indexer("simple", index_attributes=["name"])
+    >>> indexer.object_to_text(obj)
+    'John'
+    The purpose of an indexer is to translate a collection of objects into a collection of objects
+    such as vectors for purposes such as search. Unless you are implementing your own indexer, you
+    generally don't need to use the methods that return vectors, but we can examine their behavior
+    to get a sense of how they work.
+    >>> vectors = indexer.objects_to_vectors([{"name": "Aardvark"}, {"name": "Aardwolf"}, {"name": "Zesty"}])
+    >>> assert pairwise_cosine_similarity(vectors[0], vectors[1]) > pairwise_cosine_similarity(vectors[0], vectors[2])
+    Note you should consult the documentation for the specific indexer you are using for more details on
+    how text is converted to vectors.
-class Indexer(BaseModel):
-    """
-    An indexer operates on a collection in order to search for objects.
     """
     name: Optional[str] = None
@@ -119,10 +154,13 @@ class Indexer(BaseModel):
         return str(obj)
     def search(
-        self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None
+        self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None,
+            mmr_relevance_factor: Optional[float] = None
     ) -> List[Tuple[float, Any]]:
         """
-        Search the index for a query string
+        Use the indexer to search against a database of vectors.
+        Note: this is a low-level method, typically you would use the :ref:`search` method on a :ref:`Collection`.
         :param query: The query string to search for
         :param vectors: A list of indexed items, where each item is a tuple of (id, vector)
@@ -133,13 +171,29 @@ class Indexer(BaseModel):
         # Convert the query string to a vector
         query_vector = self.text_to_vector(query, cache=False)
+        if mmr_relevance_factor is not None:
+            vlist = [v for _, v in vectors]
+            idlist = [id for id, _ in vectors]
+            sorted_indices = mmr_diversified_search(
+                query_vector, vlist,
+                relevance_factor=mmr_relevance_factor, top_n=limit)
+            results = []
+            # TODO: this is inefficient when limit is high
+            for i in range(limit):
+                if i >= len(sorted_indices):
+                    break
+                pos = sorted_indices[i]
+                score = pairwise_cosine_similarity(query_vector, vlist[pos])
+                results.append((score, idlist[pos]))
+            return results
         distances = []
         # Iterate over each indexed item
         for item_id, item_vector in vectors:
             # Calculate the Euclidean distance between the query vector and the item vector
             # distance = 1-np.linalg.norm(query_vector - item_vector)
-            distance = cosine_similarity(query_vector, item_vector)
+            distance = pairwise_cosine_similarity(query_vector, item_vector)
             distances.append((distance, item_id))
         # Sort the distances in ascending order

linkml_store/inference/evaluation.py CHANGED Viewed

@@ -20,6 +20,8 @@ def score_match(target: Optional[Any], candidate: Optional[Any], match_function:
     1.0
     >>> score_match("a", "b")
     0.0
+    >>> score_match("abcd", "abcde")
+    0.0
     >>> score_match("a", None)
     0.0
     >>> score_match(None, "a")
@@ -52,7 +54,7 @@ def score_match(target: Optional[Any], candidate: Optional[Any], match_function:
     :param target:
     :param candidate:
-    :param match_function:
+    :param match_function: defaults to struct
     :return:
     """
     if target == candidate:
@@ -99,7 +101,8 @@ def evaluate_predictor(
     :param predictor:
     :param target_attributes:
     :param feature_attributes:
-    :param evaluation_count:
+    :param evaluation_count: max iterations
+    :param match_function: function to use for matching
     :return:
     """
     n = 0
@@ -113,8 +116,8 @@ def evaluate_predictor(
         else:
             test_obj = row
         result = predictor.derive(test_obj)
-        logger.info(f"Predicted: {result.predicted_object} Expected: {expected_obj}")
         tp += score_match(result.predicted_object, expected_obj, match_function)
+        logger.info(f"TP={tp} MF={match_function} Predicted: {result.predicted_object} Expected: {expected_obj}")
         n += 1
         if evaluation_count is not None and n >= evaluation_count:
             break
@@ -125,6 +128,9 @@ def score_text_overlap(str1: Any, str2: Any) -> float:
     """
     Compute the overlap score between two strings.
+    >>> score_text_overlap("abc", "bcde")
+    0.5
     :param str1:
     :param str2:
     :return:

linkml_store/inference/implementations/rag_inference_engine.py CHANGED Viewed

@@ -1,17 +1,24 @@
+import json
 import logging
 from dataclasses import dataclass
-from typing import Any, Optional
+from pathlib import Path
+from typing import ClassVar, List, Optional, TextIO, Union
 import yaml
 from llm import get_key
+from pydantic import BaseModel
 from linkml_store.api.collection import OBJECT, Collection
 from linkml_store.inference.inference_config import Inference, InferenceConfig, LLMConfig
-from linkml_store.inference.inference_engine import InferenceEngine
+from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization
 from linkml_store.utils.object_utils import select_nested
 logger = logging.getLogger(__name__)
+MAX_ITERATIONS = 5
+DEFAULT_NUM_EXAMPLES = 20
+DEFAULT_MMR_RELEVANCE_FACTOR = 0.8
 SYSTEM_PROMPT = """
 You are a {llm_config.role}, your task is to inference the YAML
 object output given the YAML object input. I will provide you
@@ -23,9 +30,14 @@ You should return ONLY valid YAML in your response.
 """
-# def select_object(obj: OBJECT, key_paths: List[str]) -> OBJECT:
-# return {k: obj.get(k, None) for k in keys}
-# return {k: object_path_get(obj, k, None) for k in key_paths}
+class TrainedModel(BaseModel, extra="forbid"):
+    rag_collection_rows: List[OBJECT]
+    index_rows: List[OBJECT]
+    config: Optional[InferenceConfig] = None
+class RAGInference(Inference):
+    iterations: int = 0
 @dataclass
@@ -54,14 +66,23 @@ class RAGInferenceEngine(InferenceEngine):
     >>> prediction.predicted_object
     {'capital': 'Montevideo', 'code': 'UY', 'continent': 'South America', 'languages': ['Spanish']}
+    The "model" can be saved for later use:
+    >>> ie.export_model("tests/output/countries.rag_model.json")
+    Note in this case the model is not the underlying LLM, but the "RAG Model" which is the vectorized
+    representation of training set objects.
     """
-    classifier: Any = None
-    encoders: dict = None
     _model: "llm.Model" = None  # noqa: F821
     rag_collection: Collection = None
+    PERSIST_COLS: ClassVar[List[str]] = [
+        "config",
+    ]
     def __post_init__(self):
         if not self.config:
             self.config = InferenceConfig()
@@ -81,14 +102,16 @@ class RAGInferenceEngine(InferenceEngine):
         return self._model
     def initialize_model(self, **kwargs):
-        rag_collection = self.training_data.collection
-        rag_collection.attach_indexer("llm", auto_index=False)
-        self.rag_collection = rag_collection
+        logger.info(f"Initializing model {self.model}")
+        if self.training_data:
+            rag_collection = self.training_data.collection
+            rag_collection.attach_indexer("llm", auto_index=False)
+            self.rag_collection = rag_collection
     def object_to_text(self, object: OBJECT) -> str:
         return yaml.dump(object)
-    def derive(self, object: OBJECT) -> Optional[Inference]:
+    def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[RAGInference]:
         import llm
         from tiktoken import encoding_for_model
@@ -98,48 +121,142 @@ class RAGInferenceEngine(InferenceEngine):
         model_name = self.config.llm_config.model_name
         feature_attributes = self.config.feature_attributes
         target_attributes = self.config.target_attributes
-        num_examples = self.config.llm_config.number_of_few_shot_examples or 5
+        num_examples = self.config.llm_config.number_of_few_shot_examples or DEFAULT_NUM_EXAMPLES
         query_text = self.object_to_text(object)
-        if not self.rag_collection.indexers:
-            raise ValueError("RAG collection must have an indexer attached")
-        rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm")
-        examples = rs.rows
-        if not examples:
-            raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")
+        mmr_relevance_factor = DEFAULT_MMR_RELEVANCE_FACTOR
+        if not self.rag_collection:
+            # TODO: zero-shot mode
+            examples = []
+        else:
+            if not self.rag_collection.indexers:
+                raise ValueError("RAG collection must have an indexer attached")
+            rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm",
+                                            mmr_relevance_factor=mmr_relevance_factor)
+            examples = rs.rows
+            if not examples:
+                raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")
         prompt_clauses = []
+        query_obj = select_nested(object, feature_attributes)
+        query_text = self.object_to_text(query_obj)
         for example in examples:
-            # input_obj = {k: example.get(k, None) for k in feature_attributes}
             input_obj = select_nested(example, feature_attributes)
-            # output_obj = {k: example.get(k, None) for k in target_attributes}
+            input_obj_text = self.object_to_text(input_obj)
+            if input_obj_text == query_text:
+                raise ValueError(
+                    f"Query object {query_text} is the same as example object {input_obj_text}\n"
+                    "This indicates possible test data leakage\n."
+                    "TODO: allow an option that allows user to treat this as a basic lookup\n"
+                )
             output_obj = select_nested(example, target_attributes)
             prompt_clause = (
-                "---\nExample:\n"
-                f"## INPUT:\n{self.object_to_text(input_obj)}\n"
-                f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
+                "---\nExample:\n" f"## INPUT:\n{input_obj_text}\n" f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
             )
             prompt_clauses.append(prompt_clause)
-        # query_obj = {k: object.get(k, None) for k in feature_attributes}
-        query_obj = select_nested(object, feature_attributes)
-        query_text = self.object_to_text(query_obj)
-        prompt_end = "---\nQuery:\n" f"## INPUT:\n{query_text}\n" "## OUTPUT:\n"
         system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
+        system_prompt += "\n".join(additional_prompt_texts or [])
+        prompt_end = "---\nQuery:\n" f"## INPUT:\n{query_text}\n" "## OUTPUT:\n"
-        def make_text(texts):
-            return "\n".join(prompt_clauses) + prompt_end
+        def make_text(texts: List[str]):
+            return "\n".join(texts) + prompt_end
         try:
             encoding = encoding_for_model(model_name)
         except KeyError:
             encoding = encoding_for_model("gpt-4")
         token_limit = get_token_limit(model_name)
-        prompt = render_formatted_text(make_text, prompt_clauses, encoding, token_limit)
+        prompt = render_formatted_text(make_text, values=prompt_clauses,
+                                       encoding=encoding, token_limit=token_limit,
+                                       additional_text=system_prompt)
         logger.info(f"Prompt: {prompt}")
         response = model.prompt(prompt, system_prompt)
         yaml_str = response.text()
         logger.info(f"Response: {yaml_str}")
+        predicted_object = self._parse_yaml_payload(yaml_str, strict=True)
+        if self.config.validate_results:
+            base_collection = self.training_data.base_collection
+            errs = list(base_collection.iter_validate_collection([predicted_object]))
+            if errs:
+                print(f"{iteration} // FAILED TO VALIDATE: {yaml_str}")
+                print(f"PARSED: {predicted_object}")
+                print(f"ERRORS: {errs}")
+                if iteration > MAX_ITERATIONS:
+                    raise ValueError(f"Validation errors: {errs}")
+                extra_texts = [
+                    "Make sure results conform to the schema. Previously you provided:\n",
+                    yaml_str,
+                    "\nThis was invalid.\n",
+                    "Validation errors:\n",
+                ] + [self.object_to_text(e) for e in errs]
+                return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
+        return RAGInference(predicted_object=predicted_object, iterations=iteration+1, query=object)
+    def _parse_yaml_payload(self, yaml_str: str, strict=False) -> Optional[OBJECT]:
+        if "```" in yaml_str:
+            yaml_str = yaml_str.split("```")[1].strip()
+            if yaml_str.startswith("yaml"):
+                yaml_str = yaml_str[4:].strip()
         try:
-            predicted_object = yaml.safe_load(yaml_str)
-            return Inference(predicted_object=predicted_object)
-        except yaml.parser.ParserError as e:
-            logger.error(f"Error parsing response: {yaml_str}\n{e}")
+            return yaml.safe_load(yaml_str)
+        except Exception as e:
+            if strict:
+                raise e
+            logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
             return None
+    def export_model(
+        self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
+    ):
+        self.save_model(output)
+    def save_model(self, output: Union[str, Path]) -> None:
+        """
+        Save the trained model and related data to a file.
+        :param output: Path to save the model
+        """
+        # trigger index
+        _qr = self.rag_collection.search("*", limit=1)
+        assert len(_qr.ranked_rows) > 0
+        rows = self.rag_collection.find(limit=-1).rows
+        indexers = self.rag_collection.indexers
+        assert len(indexers) == 1
+        ix = self.rag_collection.indexers["llm"]
+        ix_coll = self.rag_collection.parent.get_collection(self.rag_collection.get_index_collection_name(ix))
+        ix_rows = ix_coll.find(limit=-1).rows
+        assert len(ix_rows) > 0
+        tm = TrainedModel(rag_collection_rows=rows, index_rows=ix_rows, config=self.config)
+        # tm = TrainedModel(rag_collection_rows=rows, index_rows=ix_rows)
+        with open(output, "w", encoding="utf-8") as f:
+            json.dump(tm.model_dump(), f)
+    @classmethod
+    def load_model(cls, file_path: Union[str, Path]) -> "RAGInferenceEngine":
+        """
+        Load a trained model and related data from a file.
+        :param file_path: Path to the saved model
+        :return: RAGInferenceEngine instance with loaded model
+        """
+        with open(file_path, "r", encoding="utf-8") as f:
+            model_data = json.load(f)
+        tm = TrainedModel(**model_data)
+        from linkml_store.api import Client
+        client = Client()
+        db = client.attach_database("duckdb", alias="training")
+        db.store({"data": tm.rag_collection_rows})
+        collection = db.get_collection("data")
+        ix = collection.attach_indexer("llm", auto_index=False)
+        assert ix.name
+        ix_coll_name = collection.get_index_collection_name(ix)
+        assert ix_coll_name
+        ix_coll = db.get_collection(ix_coll_name, create_if_not_exists=True)
+        ix_coll.insert(tm.index_rows)
+        ie = cls(config=tm.config)
+        ie.rag_collection = collection
+        return ie

linkml_store/inference/implementations/sklearn_inference_engine.py CHANGED Viewed

@@ -153,7 +153,7 @@ class SklearnInferenceEngine(InferenceEngine):
             y = y_encoder.fit_transform(y.values.ravel())  # Convert to 1D numpy array
             self.transformed_targets = y_encoder.classes_
-        logger.info(f"Fitting model with features: {X.columns}")
+        # print(f"Fitting model with features: {X.columns}")
         clf = DecisionTreeClassifier(random_state=42)
         clf.fit(X, y)
         self.classifier = clf

linkml_store/inference/inference_config.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import logging
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Any
 from pydantic import BaseModel, ConfigDict, Field
@@ -35,6 +35,8 @@ class InferenceConfig(BaseModel, extra="forbid"):
     feature_attributes: Optional[List[str]] = None
     train_test_split: Optional[Tuple[float, float]] = None
     llm_config: Optional[LLMConfig] = None
+    random_seed: Optional[int] = None
+    validate_results: Optional[bool] = None
     @classmethod
     def from_file(cls, file_path: str, format: Optional[Format] = None) -> "InferenceConfig":
@@ -57,6 +59,7 @@ class Inference(BaseModel, extra="forbid"):
     """
     Result of an inference derivation.
     """
+    query: Optional[OBJECT] = Field(default=None, description="The query object.")
     predicted_object: OBJECT = Field(..., description="The predicted object.")
     confidence: Optional[float] = Field(default=None, description="The confidence of the prediction.", le=1.0, ge=0.0)
+    explanation: Optional[Any] = Field(default=None, description="Explanation of the prediction.")

linkml_store/inference/inference_engine.py CHANGED Viewed

@@ -29,6 +29,7 @@ class ModelSerialization(str, Enum):
     PNG = "png"
     LINKML_EXPRESSION = "linkml_expression"
     RULE_BASED = "rulebased"
+    RAG_INDEX = "rag_index"
     @classmethod
     def from_filepath(cls, file_path: str) -> Optional["ModelSerialization"]:
@@ -58,7 +59,7 @@ class ModelSerialization(str, Enum):
 class CollectionSlice(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
+    model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")
     name: Optional[str] = None
     base_collection: Optional[Collection] = None
@@ -69,17 +70,26 @@ class CollectionSlice(BaseModel):
     @property
     def collection(self) -> Collection:
+        if not self._collection and not self.indices:
+            return self.base_collection
         if not self._collection:
             rows = self.base_collection.find({}, limit=-1).rows
-            # subset based on indices
             subset = [rows[i] for i in self.indices]
             db = self.base_collection.parent
-            subset_name = f"{self.base_collection.alias}__rag_{self.name}"
+            subset_name = self.slice_alias
             subset_collection = db.get_collection(subset_name, create_if_not_exists=True)
+            # ensure the collection has the same schema type as the base collection;
+            # this ensures that column/attribute types are preserved
+            subset_collection.metadata.type = self.base_collection.target_class_name
+            subset_collection.delete_where({})
             subset_collection.insert(subset)
             self._collection = subset_collection
         return self._collection
+    @property
+    def slice_alias(self) -> str:
+        return f"{self.base_collection.alias}__rag_{self.name}"
     def as_dataframe(self, flattened=False) -> pd.DataFrame:
         """
         Return the slice of the collection as a dataframe.
@@ -113,31 +123,28 @@ class InferenceEngine(ABC):
         :param collection:
         :param split:
+        :param randomize:
         :return:
         """
+        local_random = random.Random(self.config.random_seed) if self.config.random_seed else random.Random()
         split = split or self.config.train_test_split
         if not split:
             split = (0.7, 0.3)
+        if split[0] == 1.0:
+            self.training_data = CollectionSlice(name="train", base_collection=collection, indices=None)
+            self.testing_data = None
+            return
         logger.info(f"Loading and splitting data from collection {collection.alias}")
         size = collection.size()
         indices = range(size)
         if randomize:
-            train_indices = random.sample(indices, int(size * split[0]))
+            train_indices = local_random.sample(indices, int(size * split[0]))
             test_indices = set(indices) - set(train_indices)
         else:
             train_indices = indices[: int(size * split[0])]
             test_indices = indices[int(size * split[0]) :]
         self.training_data = CollectionSlice(name="train", base_collection=collection, indices=train_indices)
         self.testing_data = CollectionSlice(name="test", base_collection=collection, indices=test_indices)
-        # all_data = collection.find({}, limit=size).rows
-        # all_data_df = nested_objects_to_dataframe(all_data)
-        # all_data_df = collection.find({}, limit=size).rows_dataframe
-        # randomize/shuffle order of rows in dataframe
-        # all_data_df = all_data_df.sample(frac=1).reset_index(drop=True)
-        # self.training_data = CollectionSlice(dataframe=all_data_df[: int(size * split[0])])
-        # self.testing_data = CollectionSlice(dataframe=all_data_df[int(size * split[0]) : size])
-        # self.training_data = CollectionSlice(base_collection=collection, slice=(0, int(size * split[0])))
-        # self.testing_data = CollectionSlice(base_collection=collection, slice=(int(size * split[0]), size))
     def initialize_model(self, **kwargs):
         """

linkml_store/utils/llm_utils.py CHANGED Viewed

@@ -20,6 +20,7 @@ MODEL_TOKEN_MAPPING = {
     "gpt-3.5-turbo-instruct": 4096,
     "text-ada-001": 2049,
     "ada": 2049,
+    "ada-002": 8192,
     "text-babbage-001": 2040,
     "babbage": 2049,
     "text-curie-001": 2049,

linkml_store/utils/vector_utils.py ADDED Viewed

@@ -0,0 +1,165 @@
+import logging
+from typing import List, Tuple
+import numpy as np
+logger = logging.getLogger(__name__)
+LOL = List[List[float]]
+def pairwise_cosine_similarity(vector1: np.ndarray, vector2: np.ndarray) -> float:
+    """
+    Calculate the cosine similarity between two vectors.
+    The result is a built-in ``float`` (not a NumPy scalar), so it prints
+    the same way under every NumPy version. If either vector has zero norm
+    the similarity is undefined; 0.0 is returned instead of ``nan``.
+    >>> v100 = np.array([1, 0, 0])
+    >>> v010 = np.array([0, 1, 0])
+    >>> v001 = np.array([0, 0, 1])
+    >>> v011 = np.array([0, 1, 1])
+    >>> pairwise_cosine_similarity(v100, v010)
+    0.0
+    >>> pairwise_cosine_similarity(v100, v001)
+    0.0
+    >>> pairwise_cosine_similarity(v010, v001)
+    0.0
+    >>> pairwise_cosine_similarity(v100, v100)
+    1.0
+    >>> f"{pairwise_cosine_similarity(v010, v011):0.3f}"
+    '0.707'
+    >>> pairwise_cosine_similarity(v100, np.array([0, 0, 0]))
+    0.0
+    :param vector1: first vector (1-D array-like)
+    :param vector2: second vector, same length as ``vector1``
+    :return: cosine of the angle between the vectors, or 0.0 if either has zero norm
+    """
+    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
+    if denominator == 0:
+        # a zero vector has no direction; avoid the 0/0 -> nan (and RuntimeWarning)
+        return 0.0
+    # float() unwraps the NumPy scalar so callers and doctests see a plain float
+    return float(np.dot(vector1, vector2) / denominator)
+def _normalize_rows(matrix: np.ndarray) -> np.ndarray:
+    """Scale each row of a 2-D array to unit length; all-zero rows are left as zeros."""
+    norms = np.linalg.norm(matrix, axis=1)[:, np.newaxis]
+    # dividing a zero row by 1 keeps it zero instead of producing nan
+    norms[norms == 0] = 1.0
+    return matrix / norms
+def compute_cosine_similarity_matrix(list1: LOL, list2: LOL) -> np.ndarray:
+    """
+    Compute cosine similarity between two lists of vectors.
+    Result is a 2-D matrix sim[ROW][COL] where ROW indexes list1 and COL indexes list2,
+    i.e. its shape is (len(list1), len(list2)).
+    A vector with zero norm has similarity 0.0 to every other vector (rather than nan).
+    :param list1: vectors that become the rows of the result; all of one dimension
+    :param list2: vectors that become the columns of the result; same dimension as list1
+    :return: matrix of pairwise cosine similarities
+    """
+    # Convert lists to numpy arrays
+    matrix1 = np.array(list1)
+    matrix2 = np.array(list2)
+    # The dot product of unit-length rows is the cosine similarity
+    return np.dot(_normalize_rows(matrix1), _normalize_rows(matrix2).T)
+def top_matches(cosine_similarity_matrix: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Pick the best-scoring column for every row of a similarity matrix.
+    :param cosine_similarity_matrix: 2-D array of similarity scores
+    :return: pair of 1-D arrays: the column index of each row's maximum,
+        and the maximum value of each row
+    """
+    # axis=1 reduces across the columns, giving one entry per row
+    return (
+        np.argmax(cosine_similarity_matrix, axis=1),
+        np.max(cosine_similarity_matrix, axis=1),
+    )
+def top_n_matches(
+    cosine_similarity_matrix: np.ndarray, n: int = 10
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Find the top n matches for each row in the cosine similarity matrix.
+    :param cosine_similarity_matrix: 2-D array of similarity scores
+    :param n: number of matches to keep per row; rows with fewer than n columns
+        yield all of their columns
+    :return: pair of 2-D arrays (column indices, similarity values), each row
+        ordered from most to least similar
+    """
+    # Indices that sort each row in descending order, truncated to the first n
+    top_n_indices = np.argsort(-cosine_similarity_matrix, axis=1)[:, :n]
+    # Gather the values through those indices instead of sorting a second time
+    top_n_values = np.take_along_axis(cosine_similarity_matrix, top_n_indices, axis=1)
+    return top_n_indices, top_n_values
+def mmr_diversified_search(
+    query_vector: np.ndarray, document_vectors: List[np.ndarray], relevance_factor=0.5, top_n=None
+) -> List[int]:
+    """
+    Perform diversified search using Maximal Marginal Relevance (MMR).
+    Documents are picked greedily. Each step selects the unselected document that maximizes
+    ``relevance_factor * sim(doc, query) - (1 - relevance_factor) * max(sim(doc, selected))``
+    where ``sim`` is cosine similarity. Ties go to the lowest index.
+    Documents whose score is undefined (zero-norm document or query) are never selected,
+    so fewer than ``top_n`` indices may be returned.
+    :param query_vector: The vector representing the query.
+    :param document_vectors: The vectors representing the documents (all of one dimension).
+    :param relevance_factor: The balance parameter between relevance and diversity.
+    :param top_n: The number of results to return. If None, return all.
+    :return: A list of indices representing the diversified order of documents.
+    """
+    docs = np.asarray(document_vectors, dtype=float)
+    n_docs = len(docs)
+    if top_n is None:
+        # If no specific number of results is specified, return all
+        top_n = n_docs
+    # There is nothing to gain from asking for more results than documents
+    top_n = min(top_n, n_docs)
+    if top_n <= 0:
+        return []
+    query = np.asarray(query_vector, dtype=float)
+    # Norms are computed once and reused for every similarity below
+    doc_norms = np.linalg.norm(docs, axis=1)
+    # Zero norms give nan similarities; those entries are filtered out when scoring
+    with np.errstate(divide="ignore", invalid="ignore"):
+        similarities = docs @ query / (doc_norms * np.linalg.norm(query))
+    available = np.ones(n_docs, dtype=bool)
+    # Running maximum of each document's similarity to the documents selected so far
+    max_sim_to_selected = None
+    result_indices: List[int] = []
+    for _ in range(top_n):
+        scores = relevance_factor * similarities
+        if max_sim_to_selected is not None:
+            # Penalize based on similarity to already selected documents
+            scores = scores - (1 - relevance_factor) * max_sim_to_selected
+        # Already selected and undefined entries can never win
+        scores = np.where(available & ~np.isnan(scores), scores, -np.inf)
+        best_index = int(np.argmax(scores))
+        if scores[best_index] == -np.inf:
+            # No remaining candidate has a usable score; further iterations cannot succeed
+            logger.warning(f"No best index found over {len(document_vectors)} documents.")
+            break
+        result_indices.append(best_index)
+        available[best_index] = False
+        with np.errstate(divide="ignore", invalid="ignore"):
+            sim_to_best = docs @ docs[best_index] / (doc_norms * doc_norms[best_index])
+        if max_sim_to_selected is None:
+            max_sim_to_selected = sim_to_best
+        else:
+            max_sim_to_selected = np.maximum(max_sim_to_selected, sim_to_best)
+    return result_indices

{linkml_store-0.2.0.dist-info → linkml_store-0.2.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: linkml-store
-Version: 0.2.0
+Version: 0.2.2
 Summary: linkml-store
 License: MIT
 Author: Author 1
@@ -18,6 +18,7 @@ Provides-Extra: chromadb
 Provides-Extra: fastapi
 Provides-Extra: frictionless
 Provides-Extra: h5py
+Provides-Extra: ibis
 Provides-Extra: llm
 Provides-Extra: map
 Provides-Extra: mongodb
@@ -34,7 +35,9 @@ Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: duckdb-engine (>=0.11.2)
 Requires-Dist: fastapi ; extra == "fastapi"
 Requires-Dist: frictionless ; extra == "frictionless"
+Requires-Dist: gcsfs ; extra == "ibis"
 Requires-Dist: h5py ; extra == "h5py"
+Requires-Dist: ibis-framework[duckdb,examples] (>=9.3.0) ; extra == "ibis"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
 Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
@@ -43,6 +46,7 @@ Requires-Dist: linkml_map ; extra == "map"
 Requires-Dist: linkml_renderer ; extra == "renderer"
 Requires-Dist: llm ; extra == "llm"
 Requires-Dist: matplotlib ; extra == "analytics"
+Requires-Dist: multipledispatch ; extra == "ibis"
 Requires-Dist: neo4j ; extra == "neo4j"
 Requires-Dist: networkx ; extra == "neo4j"
 Requires-Dist: pandas (>=2.2.1) ; extra == "analytics"
@@ -52,6 +56,7 @@ Requires-Dist: pyarrow ; extra == "pyarrow"
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pymongo ; extra == "mongodb"
 Requires-Dist: pystow (>=0.5.4,<0.6.0)
+Requires-Dist: ruff (>=0.6.2) ; extra == "tests"
 Requires-Dist: scikit-learn ; extra == "scipy"
 Requires-Dist: scipy ; extra == "scipy"
 Requires-Dist: seaborn ; extra == "analytics"

{linkml_store-0.2.0.dist-info → linkml_store-0.2.2.dist-info}/RECORD RENAMED Viewed

@@ -1,16 +1,16 @@
 linkml_store/__init__.py,sha256=jlU6WOUAn8cKIhzbTULmBTWpW9gZdEt7q_RI6KZN1bY,118
 linkml_store/api/__init__.py,sha256=3CelcFEFz0y3MkQAzhQ9JxHIt1zFk6nYZxSmYTo8YZE,226
 linkml_store/api/client.py,sha256=3klBXenQVbLjNQF3WmYfjASt3zvKOfWaCNp5aJM81Ec,12034
-linkml_store/api/collection.py,sha256=7JndC6A9r3OVbR9aB6d_bdaYN53XU4FpppUterygOaE,37800
+linkml_store/api/collection.py,sha256=YVmfqdZaWfLAw3yzho-GEknsAiV1h5Z3O6csB_8CTY0,39407
 linkml_store/api/config.py,sha256=71pxQ5jM-ETxJWU7CzmKjsH6IEJUMP5sml381u9TYVk,5654
-linkml_store/api/database.py,sha256=QVvUuLQPCxB4cvsS7rXqPSfoHkhcMzP9vUcsjkuEYds,29051
-linkml_store/api/queries.py,sha256=w0qnNeCH6pC9WTGoEQYd300MF6o0G3atz2YxN3WecAs,2028
+linkml_store/api/database.py,sha256=nvae8jnOZsQIFCsl_lRBnKcvrpJg4A10ujIKGeMyUS8,29350
+linkml_store/api/queries.py,sha256=tx9fgGY5fC_2ZbIvg4BqTK_MXJwA_DI4mxr8HdQ6Vos,2075
 linkml_store/api/stores/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 linkml_store/api/stores/chromadb/__init__.py,sha256=e9BkOPuPnVQKA5PRKDulag59yGNHDP3U2_DnPSrFAKM,132
 linkml_store/api/stores/chromadb/chromadb_collection.py,sha256=RQUZx5oeotkzNihg-dlSevkiTiKY1d9x0bS63HF80W4,4270
 linkml_store/api/stores/chromadb/chromadb_database.py,sha256=dZA3LQE8-ZMhJQOzsUFyxehnKpFF7adR182aggfkaFY,3205
 linkml_store/api/stores/duckdb/__init__.py,sha256=rbQSDgNg-fdvi6-pHGYkJTST4p1qXUZBf9sFSsO3KPk,387
-linkml_store/api/stores/duckdb/duckdb_collection.py,sha256=yXnJpEiGK4lMyNuJykuvlKOqaV9ntqv0m0NZMOw0auk,6911
+linkml_store/api/stores/duckdb/duckdb_collection.py,sha256=Rkbm_uIVIRj5576lEolsyY_3Um1h8Lf3RHn8Fy3LIgU,7036
 linkml_store/api/stores/duckdb/duckdb_database.py,sha256=GH9bcOfHpNp6r-Eu1C3W0xuYcLsqGFDH1Sh4weifGaQ,9923
 linkml_store/api/stores/duckdb/mappings.py,sha256=tDce3W1Apwammhf4LS6cRJ0m4NiJ0eB7vOI_4U5ETY8,148
 linkml_store/api/stores/filesystem/__init__.py,sha256=KjvCjdttwqMHNeGyL-gr59zRz0--HFEWWUNNCJ5hITs,347
@@ -30,30 +30,30 @@ linkml_store/api/stores/solr/solr_collection.py,sha256=ZlxC3JbVaHfSA4HuTeJTsp6qe
 linkml_store/api/stores/solr/solr_database.py,sha256=TFjqbY7jAkdrhAchbNg0E-mChSP7ogNwFExslbvX7Yo,2877
 linkml_store/api/stores/solr/solr_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 linkml_store/api/types.py,sha256=3aIQtDFMvsSmjuN5qrR2vNK5sHa6yzD_rEOPA6tHwvg,176
-linkml_store/cli.py,sha256=NIEU5dEkEKz3a2q4mpkdXxHX1mANd2z9oFIkNVz-wsw,27048
+linkml_store/cli.py,sha256=wl8BhnPcSU6Lt-jsvN1o6086PpUAfu43n5GI6w9SGxw,29384
 linkml_store/constants.py,sha256=x4ZmDsfE9rZcL5WpA93uTKrRWzCD6GodYXviVzIvR38,112
 linkml_store/graphs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 linkml_store/graphs/graph_map.py,sha256=bYRxv8n1YPnFqE9d6JKNmRawb8EAhsPlHhBue0gvtZE,712
 linkml_store/index/__init__.py,sha256=6SQzDe-WZSSqbGNsbCDfyPTyz0s9ISDKw1dm9xgQuT4,1396
 linkml_store/index/implementations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-linkml_store/index/implementations/llm_indexer.py,sha256=LI5f8SLF_rJY5W6wZPLaUqpyoq-VDW_KqlCBNDNm_po,4827
+linkml_store/index/implementations/llm_indexer.py,sha256=y1xvfUm_rl4UEiWJbsUsEnTCma98XRB9C1XOnuaAv5o,5474
 linkml_store/index/implementations/simple_indexer.py,sha256=KnkFJtXTHnwjhD_D6ZK2rFhBID1dgCedcOVPEWAY2NU,1282
-linkml_store/index/indexer.py,sha256=K-TDPzdTyGFo6iG4XI_A_3IpwDbKeiTIbdr85NIL5r8,4918
+linkml_store/index/indexer.py,sha256=e5dsjh2wjOTDRsfClKJAFTbcK1UC7BOGkUCOfDg9omI,7635
 linkml_store/inference/__init__.py,sha256=b8NAFNZjOYU_8gOvxdyCyoiHOOl5Ai2ckKs1tv7ZkkY,342
-linkml_store/inference/evaluation.py,sha256=qvsmGDBKTZBDKhpbPDe_AkcJ2LtQ8e-oUYCUGfI6IAE,5799
+linkml_store/inference/evaluation.py,sha256=YDFYaEu2QLSfFq4oyARrnKfTiPLtNF8irhhspgVDfdY,6013
 linkml_store/inference/implementations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-linkml_store/inference/implementations/rag_inference_engine.py,sha256=MH50-6i30Y5oKgIx47-yDjsPCojYC6-lujtHFBDqIxs,5833
+linkml_store/inference/implementations/rag_inference_engine.py,sha256=mN7YQI-BeZglsAnZnNIuAj-Nxg1su5efNaohooEmNmM,10622
 linkml_store/inference/implementations/rule_based_inference_engine.py,sha256=0IEY_fsHJPJy6QKbYQU_qE87RRnPOXQxPuJKXCQG8jU,6250
-linkml_store/inference/implementations/sklearn_inference_engine.py,sha256=HRhwnlpDJOijxvhLmdTSOq1S9xjBVCrgRT1C8uS0XZQ,13196
-linkml_store/inference/inference_config.py,sha256=xgl3VmueErLIOnQQn4HdC2STJNY6yKoPasWmym4ltHQ,2014
-linkml_store/inference/inference_engine.py,sha256=D1JlkihyNbZp7PYe5lplUbTJgyP7jL4vnxcpBio-KUs,6987
+linkml_store/inference/implementations/sklearn_inference_engine.py,sha256=Sdi7CoRK3qoLJu3prgLy1Ck_zQ1gHWRKFybHe7XQ4_g,13192
+linkml_store/inference/inference_config.py,sha256=EFGdigxWsfTPREbgqyJVRShN0JktCEmFLLoECrLfXSg,2282
+linkml_store/inference/inference_engine.py,sha256=l2UB6cA0rW7a9qyiv8JF5Nzj8nRHGX_yqMYbiDnY1Qc,7055
 linkml_store/inference/inference_engine_registry.py,sha256=6o66gvBYBwdeAKm62zqqvfaBlcopVP_cla3L6uXGsHA,3015
 linkml_store/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 linkml_store/utils/change_utils.py,sha256=O2rvSvgTKB60reLLz9mX5OWykAA_m93bwnUh5ZWa0EY,471
 linkml_store/utils/file_utils.py,sha256=rQ7-XpmI6_Kx_dhEnI98muFRr0MmgI_kZ_9cgJBf_0I,1411
 linkml_store/utils/format_utils.py,sha256=airJ2_tFsr0dTIbSHT5y0TZbDrvBBV4_qThFPFY5k8U,10925
 linkml_store/utils/io.py,sha256=JHUrWDtlZC2jtN_PQZ4ypdGIyYlftZEN3JaCvEPs44w,884
-linkml_store/utils/llm_utils.py,sha256=Wb4h_E8vrJZDAYHhOdMCSMcz-xxVia4nfuFqiYitZ98,2864
+linkml_store/utils/llm_utils.py,sha256=3jRFUtEywoKdomKb3aCH1GdI9hQJOQo8Udb3Jy4M-Xw,2885
 linkml_store/utils/mongodb_utils.py,sha256=Rl1YmMKs1IXwSsJIViSDChbi0Oer5cBnMmjka2TeQS8,4665
 linkml_store/utils/neo4j_utils.py,sha256=y3KPmDZ8mQmePgg0lUeKkeKqzEr2rV226xxEtHc5pRg,1266
 linkml_store/utils/object_utils.py,sha256=Vib-5Ip2DlRVKLZpU-008ZZI813-vfKVSCY0TksRenM,6293
@@ -64,6 +64,7 @@ linkml_store/utils/schema_utils.py,sha256=iJiZxo5NGr7v87h4DV6V9DrDOZHSswMRuf0N4V
 linkml_store/utils/sklearn_utils.py,sha256=itPpcrsbbyOazdjmivaaZ1lyZeytm0a0hJ2AS8ziUgg,7590
 linkml_store/utils/sql_utils.py,sha256=T41w_vsc3SauTJQkDMwid_nOtKW1YOKyUuaxEf470hk,5938
 linkml_store/utils/stats_utils.py,sha256=4KqBb1bqDgAmq-1fJLLu5B2paPgoZZc3A-gnyVam4bI,1799
+linkml_store/utils/vector_utils.py,sha256=Q1RlpDzavJAM9-H2m2XNU5BNUcfZkpIWeEZii2hK0PQ,5449
 linkml_store/webapi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 linkml_store/webapi/html/__init__.py,sha256=hwp5eeBJKH65Bvv1x9Z4vsT1tLSYtb9Dq4I9r1kL1q0,69
 linkml_store/webapi/html/base.html.j2,sha256=hoiV2uaSxxrQp7VuAZBOHueH7czyJMYcPBRN6dZFYhk,693
@@ -72,8 +73,8 @@ linkml_store/webapi/html/database_details.html.j2,sha256=qtXdavbZb0mohiObI9dvJtk
 linkml_store/webapi/html/databases.html.j2,sha256=a9BCWQYfPeFhdUd31CWhB0yWhTIFXQayO08JgjyqKoc,294
 linkml_store/webapi/html/generic.html.j2,sha256=KtLaO2HUEF2Opq-OwHKgRKetNWe8IWc6JuIkxRPsywk,1018
 linkml_store/webapi/main.py,sha256=B0Da575kKR7X88N9ykm99Dem8FyBAW9f-w3A_JwUzfw,29165
-linkml_store-0.2.0.dist-info/LICENSE,sha256=77mDOslUnalYnuq9xQYZKtIoNEzcH9mIjvWHOKjamnE,1086
-linkml_store-0.2.0.dist-info/METADATA,sha256=v_KjIlu-gTOHunF0ASPHRP_utQv-ry1piX3RpfPWX1k,6743
-linkml_store-0.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-linkml_store-0.2.0.dist-info/entry_points.txt,sha256=gWxVsHqx-t-UKWFHFzawQTvs4is4vC1rCF5AeKyqWWk,101
-linkml_store-0.2.0.dist-info/RECORD,,
+linkml_store-0.2.2.dist-info/LICENSE,sha256=77mDOslUnalYnuq9xQYZKtIoNEzcH9mIjvWHOKjamnE,1086
+linkml_store-0.2.2.dist-info/METADATA,sha256=_zde_tfX6AAw1ZvM1LnYOmzkQbiz6f3rQhVyBKODdnE,6977
+linkml_store-0.2.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+linkml_store-0.2.2.dist-info/entry_points.txt,sha256=gWxVsHqx-t-UKWFHFzawQTvs4is4vC1rCF5AeKyqWWk,101
+linkml_store-0.2.2.dist-info/RECORD,,

{linkml_store-0.2.0.dist-info → linkml_store-0.2.2.dist-info}/LICENSE RENAMED Viewed

File without changes

{linkml_store-0.2.0.dist-info → linkml_store-0.2.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{linkml_store-0.2.0.dist-info → linkml_store-0.2.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

linkml-store 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

Potentially problematic release.

linkml-store 0.2.0py3-none-any.whl → 0.2.2py3-none-any.whl