linkml-store 0.2.1.tar.gz → 0.2.2.tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Note: this release of linkml-store has been flagged as potentially problematic.
- {linkml_store-0.2.1 → linkml_store-0.2.2}/PKG-INFO +1 -1
- {linkml_store-0.2.1 → linkml_store-0.2.2}/pyproject.toml +1 -1
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/collection.py +2 -1
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/cli.py +37 -9
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/index/indexer.py +21 -17
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/implementations/rag_inference_engine.py +38 -8
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/inference_config.py +4 -2
- linkml_store-0.2.2/src/linkml_store/utils/vector_utils.py +165 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/LICENSE +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/README.md +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/client.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/config.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/queries.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/duckdb/duckdb_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/mongodb/mongodb_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/neo4j/neo4j_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/solr/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/types.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/constants.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/graphs/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/graphs/graph_map.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/index/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/index/implementations/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/index/implementations/llm_indexer.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/evaluation.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/implementations/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/implementations/rule_based_inference_engine.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/implementations/sklearn_inference_engine.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/inference_engine.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/inference_engine_registry.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/change_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/file_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/format_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/io.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/llm_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/mongodb_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/neo4j_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/object_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/pandas_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/patch_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/query_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/schema_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/sklearn_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/sql_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/stats_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/html/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/html/base.html.j2 +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/main.py +0 -0
src/linkml_store/api/collection.py (+2 -1)

@@ -470,6 +470,7 @@ class Collection(Generic[DatabaseType]):
         where: Optional[Any] = None,
         index_name: Optional[str] = None,
         limit: Optional[int] = None,
+        mmr_relevance_factor: Optional[float] = None,
         **kwargs,
     ) -> QueryResult:
         """
@@ -534,7 +535,7 @@ class Collection(Generic[DatabaseType]):
         index_col = ix.index_field
         # TODO: optimize this for large indexes
         vector_pairs = [(row, np.array(row[index_col], dtype=float)) for row in qr.rows]
-        results = ix.search(query, vector_pairs, limit=limit)
+        results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
         for r in results:
             del r[1][index_col]
         new_qr = QueryResult(num_rows=len(results))
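Together these hunks thread the new MMR knob from Collection.search down to the attached indexer. A minimal usage sketch (the collection setup follows the linkml-store tutorial; the alias, example data, and the ranked_rows accessor are assumptions for illustration, not part of this diff):

    from linkml_store import Client

    client = Client()
    db = client.attach_database("duckdb", alias="demo")
    collection = db.create_collection("Country")
    collection.insert([{"name": "France"}, {"name": "Germany"}, {"name": "Japan"}])
    collection.attach_indexer("simple")

    # mmr_relevance_factor=1.0 approximates plain relevance ranking;
    # lower values trade relevance for diversity among the returned rows
    qr = collection.search("European country", limit=2, mmr_relevance_factor=0.8)
    for score, row in qr.ranked_rows:
        print(score, row)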
src/linkml_store/cli.py (+37 -9)

@@ -1,8 +1,9 @@
 import logging
 import sys
 import warnings
+from collections import defaultdict
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Tuple, Any

 import click
 import yaml
@@ -415,14 +416,6 @@ def list_collections(ctx, **kwargs):
 def fq(ctx, where, limit, columns, output_type, wide, output):
     """
     Query facets from the specified collection.
-
-    :param ctx:
-    :param where:
-    :param limit:
-    :param columns:
-    :param output_type:
-    :param output:
-    :return:
     """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None
@@ -488,6 +481,41 @@ def describe(ctx, where, output_type, output, limit):
     write_output(df.describe(include="all").transpose(), output_type, target=output)


+@cli.command()
+@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
+@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
+@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--index", "-I", help="Attributes to index on in pivot")
+@click.option("--columns", "-A", help="Attributes to use as columns in pivot")
+@click.option("--values", "-V", help="Attributes to use as values in pivot")
+@click.pass_context
+def pivot(ctx, where, limit, index, columns, values, output_type, output):
+    collection = ctx.obj["settings"].collection
+    where_clause = yaml.safe_load(where) if where else None
+    column_atts = columns.split(",") if columns else None
+    value_atts = values.split(",") if values else None
+    index_atts = index.split(",") if index else None
+    results = collection.find(where_clause, limit=limit)
+    pivoted = defaultdict(dict)
+    for row in results.rows:
+        index_key = tuple([row.get(att) for att in index_atts])
+        column_key = tuple([row.get(att) for att in column_atts])
+        value_key = tuple([row.get(att) for att in value_atts])
+        pivoted[index_key][column_key] = value_key
+    pivoted_objs = []
+    def detuple(t: Tuple) -> Any:
+        if len(t) == 1:
+            return t[0]
+        return str(t)
+    for index_key, data in pivoted.items():
+        obj = {att: key for att, key in zip(index_atts, index_key)}
+        for column_key, value_key in data.items():
+            obj[detuple(column_key)] = detuple(value_key)
+        pivoted_objs.append(obj)
+    write_output(pivoted_objs, output_type, target=output)
+
+
 @cli.command()
 @click.option("--output-type", "-O", type=format_choice, default=Format.YAML.value, help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
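The new pivot command (invoked as something like `linkml-store -d demo.db -c mydata pivot -I year -A country -V sales`; the flags come from the options above, while the database and attribute names are made up) groups rows into a two-level dict keyed by (index, column) tuples. A self-contained sketch of the same transformation on hypothetical rows:

    from collections import defaultdict
    from typing import Any, Tuple

    rows = [
        {"year": 2023, "country": "FR", "sales": 10},
        {"year": 2023, "country": "DE", "sales": 12},
        {"year": 2024, "country": "FR", "sales": 11},
    ]
    index_atts, column_atts, value_atts = ["year"], ["country"], ["sales"]

    def detuple(t: Tuple) -> Any:
        # single-element tuples collapse to their value; otherwise stringify
        return t[0] if len(t) == 1 else str(t)

    pivoted = defaultdict(dict)
    for row in rows:
        index_key = tuple(row.get(a) for a in index_atts)
        column_key = tuple(row.get(a) for a in column_atts)
        pivoted[index_key][column_key] = tuple(row.get(a) for a in value_atts)

    pivoted_objs = []
    for index_key, data in pivoted.items():
        obj = dict(zip(index_atts, index_key))
        for column_key, value_key in data.items():
            obj[detuple(column_key)] = detuple(value_key)
        pivoted_objs.append(obj)

    print(pivoted_objs)
    # [{'year': 2023, 'FR': 10, 'DE': 12}, {'year': 2024, 'FR': 11}]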
src/linkml_store/index/indexer.py (+21 -17)

@@ -3,6 +3,7 @@ from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple

 import numpy as np
+from linkml_store.utils.vector_utils import pairwise_cosine_similarity, mmr_diversified_search
 from pydantic import BaseModel

 INDEX_ITEM = np.ndarray
@@ -19,20 +20,6 @@ class TemplateSyntaxEnum(str, Enum):
     fstring = "fstring"


-def cosine_similarity(vector1, vector2) -> float:
-    """
-    Calculate the cosine similarity between two vectors
-
-    :param vector1:
-    :param vector2:
-    :return:
-    """
-    dot_product = np.dot(vector1, vector2)
-    norm1 = np.linalg.norm(vector1)
-    norm2 = np.linalg.norm(vector2)
-    return dot_product / (norm1 * norm2)
-
-
 class Indexer(BaseModel):
     """
     An indexer operates on a collection in order to search for objects.
@@ -79,7 +66,7 @@ class Indexer(BaseModel):
     to get a sense of how they work.

     >>> vectors = indexer.objects_to_vectors([{"name": "Aardvark"}, {"name": "Aardwolf"}, {"name": "Zesty"}])
-    >>> assert
+    >>> assert pairwise_cosine_similarity(vectors[0], vectors[1]) > pairwise_cosine_similarity(vectors[0], vectors[2])

     Note you should consult the documentation for the specific indexer you are using for more details on
     how text is converted to vectors.
@@ -167,7 +154,8 @@ class Indexer(BaseModel):
         return str(obj)

     def search(
-        self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None
+        self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None,
+        mmr_relevance_factor: Optional[float] = None
     ) -> List[Tuple[float, Any]]:
         """
         Use the indexer to search against a database of vectors.
@@ -183,13 +171,29 @@ class Indexer(BaseModel):
         # Convert the query string to a vector
         query_vector = self.text_to_vector(query, cache=False)

+        if mmr_relevance_factor is not None:
+            vlist = [v for _, v in vectors]
+            idlist = [id for id, _ in vectors]
+            sorted_indices = mmr_diversified_search(
+                query_vector, vlist,
+                relevance_factor=mmr_relevance_factor, top_n=limit)
+            results = []
+            # TODO: this is inefficient when limit is high
+            for i in range(limit):
+                if i >= len(sorted_indices):
+                    break
+                pos = sorted_indices[i]
+                score = pairwise_cosine_similarity(query_vector, vlist[pos])
+                results.append((score, idlist[pos]))
+            return results
+
         distances = []

         # Iterate over each indexed item
         for item_id, item_vector in vectors:
             # Calculate the Euclidean distance between the query vector and the item vector
             # distance = 1-np.linalg.norm(query_vector - item_vector)
-            distance =
+            distance = pairwise_cosine_similarity(query_vector, item_vector)
             distances.append((distance, item_id))

         # Sort the distances in ascending order
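Indexer.search now has two paths: the original cosine-similarity ranking, and an MMR re-ranking taken whenever mmr_relevance_factor is set (scores in the MMR path are still cosine similarities; only the order changes). A sketch mirroring the doctest above (SimpleIndexer and its name argument are assumptions drawn from elsewhere in the package, not shown in this diff):

    from linkml_store.index.implementations.simple_indexer import SimpleIndexer

    ix = SimpleIndexer(name="simple")
    names = ["Aardvark", "Aardwolf", "Zesty"]
    vectors = ix.objects_to_vectors([{"name": n} for n in names])
    pairs = list(zip(names, vectors))

    # default path: rank purely by cosine similarity to the query
    print(ix.search("Aardvark", pairs, limit=2))
    # MMR path: order comes from mmr_diversified_search, so the
    # near-duplicate "Aardwolf" can be displaced by a more diverse match
    print(ix.search("Aardvark", pairs, limit=2, mmr_relevance_factor=0.5))

Note that the MMR branch iterates over range(limit), so it expects an explicit limit.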
src/linkml_store/inference/implementations/rag_inference_engine.py (+38 -8)

@@ -15,6 +15,10 @@ from linkml_store.utils.object_utils import select_nested

 logger = logging.getLogger(__name__)

+MAX_ITERATIONS = 5
+DEFAULT_NUM_EXAMPLES = 20
+DEFAULT_MMR_RELEVANCE_FACTOR = 0.8
+
 SYSTEM_PROMPT = """
 You are a {llm_config.role}, your task is to inference the YAML
 object output given the YAML object input. I will provide you
@@ -32,6 +36,10 @@ class TrainedModel(BaseModel, extra="forbid"):
     config: Optional[InferenceConfig] = None


+class RAGInference(Inference):
+    iterations: int = 0
+
+
 @dataclass
 class RAGInferenceEngine(InferenceEngine):
     """
@@ -103,7 +111,7 @@ class RAGInferenceEngine(InferenceEngine):
     def object_to_text(self, object: OBJECT) -> str:
         return yaml.dump(object)

-    def derive(self, object: OBJECT) -> Optional[
+    def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[RAGInference]:
         import llm
         from tiktoken import encoding_for_model

@@ -113,15 +121,17 @@ class RAGInferenceEngine(InferenceEngine):
         model_name = self.config.llm_config.model_name
         feature_attributes = self.config.feature_attributes
         target_attributes = self.config.target_attributes
-        num_examples = self.config.llm_config.number_of_few_shot_examples or
+        num_examples = self.config.llm_config.number_of_few_shot_examples or DEFAULT_NUM_EXAMPLES
         query_text = self.object_to_text(object)
+        mmr_relevance_factor = DEFAULT_MMR_RELEVANCE_FACTOR
         if not self.rag_collection:
             # TODO: zero-shot mode
             examples = []
         else:
             if not self.rag_collection.indexers:
                 raise ValueError("RAG collection must have an indexer attached")
-            rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm"
+            rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm",
+                                            mmr_relevance_factor=mmr_relevance_factor)
             examples = rs.rows
             if not examples:
                 raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")
@@ -143,23 +153,43 @@ class RAGInferenceEngine(InferenceEngine):
         )
         prompt_clauses.append(prompt_clause)

-        prompt_end = "---\nQuery:\n" f"## INPUT:\n{query_text}\n" "## OUTPUT:\n"
         system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
+        system_prompt += "\n".join(additional_prompt_texts or [])
+        prompt_end = "---\nQuery:\n" f"## INPUT:\n{query_text}\n" "## OUTPUT:\n"

-        def make_text(texts):
-            return "\n".join(
+        def make_text(texts: List[str]):
+            return "\n".join(texts) + prompt_end

         try:
             encoding = encoding_for_model(model_name)
         except KeyError:
             encoding = encoding_for_model("gpt-4")
         token_limit = get_token_limit(model_name)
-        prompt = render_formatted_text(make_text, prompt_clauses,
+        prompt = render_formatted_text(make_text, values=prompt_clauses,
+                                       encoding=encoding, token_limit=token_limit,
+                                       additional_text=system_prompt)
         logger.info(f"Prompt: {prompt}")
         response = model.prompt(prompt, system_prompt)
         yaml_str = response.text()
         logger.info(f"Response: {yaml_str}")
-
+        predicted_object = self._parse_yaml_payload(yaml_str, strict=True)
+        if self.config.validate_results:
+            base_collection = self.training_data.base_collection
+            errs = list(base_collection.iter_validate_collection([predicted_object]))
+            if errs:
+                print(f"{iteration} // FAILED TO VALIDATE: {yaml_str}")
+                print(f"PARSED: {predicted_object}")
+                print(f"ERRORS: {errs}")
+                if iteration > MAX_ITERATIONS:
+                    raise ValueError(f"Validation errors: {errs}")
+                extra_texts = [
+                    "Make sure results conform to the schema. Previously you provided:\n",
+                    yaml_str,
+                    "\nThis was invalid.\n",
+                    "Validation errors:\n",
+                ] + [self.object_to_text(e) for e in errs]
+                return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
+        return RAGInference(predicted_object=predicted_object, iterations=iteration+1, query=object)

     def _parse_yaml_payload(self, yaml_str: str, strict=False) -> Optional[OBJECT]:
         if "```" in yaml_str:
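With validate_results enabled, derive() now parses the model's YAML, validates the result against the training collection's schema, and on failure re-prompts with the previous answer and the validation errors appended to the system prompt, giving up after MAX_ITERATIONS attempts; the returned RAGInference records how many iterations were used. A hedged configuration sketch (attribute names and model name are made up; it assumes LLMConfig lives alongside InferenceConfig, as the next file suggests):

    from linkml_store.inference.inference_config import InferenceConfig, LLMConfig

    config = InferenceConfig(
        feature_attributes=["country"],
        target_attributes=["capital"],
        llm_config=LLMConfig(model_name="gpt-4o-mini"),
        validate_results=True,  # new in 0.2.2: retry on schema violations
    )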
src/linkml_store/inference/inference_config.py (+4 -2)

@@ -1,5 +1,5 @@
 import logging
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Any

 from pydantic import BaseModel, ConfigDict, Field

@@ -36,6 +36,7 @@ class InferenceConfig(BaseModel, extra="forbid"):
     train_test_split: Optional[Tuple[float, float]] = None
     llm_config: Optional[LLMConfig] = None
     random_seed: Optional[int] = None
+    validate_results: Optional[bool] = None

     @classmethod
     def from_file(cls, file_path: str, format: Optional[Format] = None) -> "InferenceConfig":
@@ -58,6 +59,7 @@ class Inference(BaseModel, extra="forbid"):
     """
     Result of an inference derivation.
     """
-
+    query: Optional[OBJECT] = Field(default=None, description="The query object.")
     predicted_object: OBJECT = Field(..., description="The predicted object.")
     confidence: Optional[float] = Field(default=None, description="The confidence of the prediction.", le=1.0, ge=0.0)
+    explanation: Optional[Any] = Field(default=None, description="Explanation of the prediction.")
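The Inference result model now carries the originating query and an optional explanation alongside the prediction. An illustrative instantiation (all values made up):

    from linkml_store.inference.inference_config import Inference

    result = Inference(
        query={"country": "France"},
        predicted_object={"capital": "Paris"},
        confidence=0.9,
        explanation="derived from retrieved few-shot examples",
    )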
linkml_store-0.2.2/src/linkml_store/utils/vector_utils.py (new file, +165 -0)

@@ -0,0 +1,165 @@
+import logging
+from typing import List, Tuple
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+LOL = List[List[float]]
+
+def pairwise_cosine_similarity(vector1: np.array, vector2: np.array) -> float:
+    """
+    Calculate the cosine similarity between two vectors.
+
+    >>> v100 = np.array([1, 0, 0])
+    >>> v010 = np.array([0, 1, 0])
+    >>> v001 = np.array([0, 0, 1])
+    >>> v011 = np.array([0, 1, 1])
+    >>> pairwise_cosine_similarity(v100, v010)
+    0.0
+    >>> pairwise_cosine_similarity(v100, v001)
+    0.0
+    >>> pairwise_cosine_similarity(v010, v001)
+    0.0
+    >>> pairwise_cosine_similarity(v100, v100)
+    1.0
+    >>> f"{pairwise_cosine_similarity(v010, v011):0.3f}"
+    '0.707'
+
+    :param vector1:
+    :param vector2:
+    :return:
+    """
+    dot_product = np.dot(vector1, vector2)
+    norm1 = np.linalg.norm(vector1)
+    norm2 = np.linalg.norm(vector2)
+    return dot_product / (norm1 * norm2)
+
+
+def compute_cosine_similarity_matrix(list1: LOL, list2: LOL) -> np.ndarray:
+    """
+    Compute cosine similarity between two lists of vectors.
+
+    Result is a two column vector sim[ROW][COL] where ROW is from list1 and COL is from list2.
+
+    :param list1:
+    :param list2:
+    :return:
+    """
+    # Convert lists to numpy arrays
+    matrix1 = np.array(list1)
+    matrix2 = np.array(list2)
+
+    # Normalize the vectors in both matrices
+    matrix1_norm = matrix1 / np.linalg.norm(matrix1, axis=1)[:, np.newaxis]
+    matrix2_norm = matrix2 / np.linalg.norm(matrix2, axis=1)[:, np.newaxis]
+
+    # Compute dot products (resulting in cosine similarity values)
+    cosine_similarity_matrix = np.dot(matrix1_norm, matrix2_norm.T)
+
+    return cosine_similarity_matrix
+
+
+def top_matches(cosine_similarity_matrix: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Find the top match for each row in the cosine similarity matrix.
+
+    :param cosine_similarity_matrix:
+    :return:
+    """
+    # Find the index of the maximum value in each row
+    top_match_indices = np.argmax(cosine_similarity_matrix, axis=1)
+
+    # Find the maximum similarity value in each row
+    top_match_values = np.amax(cosine_similarity_matrix, axis=1)
+
+    return top_match_indices, top_match_values
+
+
+def top_n_matches(
+    cosine_similarity_matrix: np.ndarray, n: int = 10
+) -> Tuple[np.ndarray, np.ndarray]:
+    # Find the indices that would sort each row in descending order
+    sorted_indices = np.argsort(-cosine_similarity_matrix, axis=1)
+
+    # Take the first n indices from the sorted indices to get the top n matches
+    top_n_indices = sorted_indices[:, :n]
+
+    # Take the first n values from the sorted values to get the top n match values
+    top_n_values = -np.sort(-cosine_similarity_matrix, axis=1)[:, :n]
+
+    return top_n_indices, top_n_values
+
+
+def mmr_diversified_search(
+    query_vector: np.ndarray, document_vectors: List[np.ndarray], relevance_factor=0.5, top_n=None
+) -> List[int]:
+    """
+    Perform diversified search using Maximal Marginal Relevance (MMR).
+
+    :param query_vector: The vector representing the query.
+    :param document_vectors: The vectors representing the documents.
+    :param relevance_factor: The balance parameter between relevance and diversity.
+    :param top_n: The number of results to return. If None, return all.
+    :return: A list of indices representing the diversified order of documents.
+    """
+    if top_n is None:
+        # If no specific number of results is specified, return all
+        top_n = len(document_vectors)
+
+    if top_n == 0:
+        return []
+
+    # Calculate cosine similarities between query and all documents
+    norms_query = np.linalg.norm(query_vector)
+    norms_docs = np.linalg.norm(document_vectors, axis=1)
+    similarities = np.dot(document_vectors, query_vector) / (norms_docs * norms_query)
+
+    # Initialize set of selected indices and results list
+    selected_indices = set()
+    result_indices = []
+
+    # Diversified search loop
+    for _ in range(top_n):
+        max_mmr = float("-inf")
+        best_index = None
+
+        # Loop over all documents
+        for idx, _doc_vector in enumerate(document_vectors):
+            if idx not in selected_indices:
+                relevance = relevance_factor * similarities[idx]
+                diversity = 0
+
+                # Penalize based on similarity to already selected documents
+                if selected_indices:
+                    max_sim_to_selected = max(
+                        [
+                            np.dot(document_vectors[idx], document_vectors[s])
+                            / (
+                                np.linalg.norm(document_vectors[idx])
+                                * np.linalg.norm(document_vectors[s])
+                            )
+                            for s in selected_indices
+                        ]
+                    )
+                    diversity = (1 - relevance_factor) * max_sim_to_selected
+
+                mmr_score = relevance - diversity
+
+                # Update best MMR score and index
+                if mmr_score > max_mmr:
+                    max_mmr = mmr_score
+                    best_index = idx
+
+        # Add the best document to the result and mark it as selected
+        if best_index is None:
+            logger.warning(f"No best index found over {len(document_vectors)} documents.")
+            continue
+        result_indices.append(best_index)
+        selected_indices.add(best_index)
+
+    return result_indices
+
+
+
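mmr_diversified_search greedily picks, at each step, the document maximizing relevance_factor * sim(query, doc) minus (1 - relevance_factor) * (max similarity to the documents already selected), so relevance_factor=1.0 reduces to plain similarity ranking. A small self-contained demonstration with toy vectors:

    import numpy as np
    from linkml_store.utils.vector_utils import mmr_diversified_search

    query = np.array([1.0, 0.0])
    docs = [
        np.array([0.9, 0.1]),    # highly relevant
        np.array([0.89, 0.11]),  # near-duplicate of the first
        np.array([0.5, 0.5]),    # less relevant but diverse
    ]

    # pure relevance: the near-duplicate keeps second place
    print(mmr_diversified_search(query, docs, relevance_factor=1.0))  # [0, 1, 2]
    # diversity-weighted: the diverse vector displaces the near-duplicate
    print(mmr_diversified_search(query, docs, relevance_factor=0.3))  # [0, 2, 1]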