linkml-store 0.2.1.tar.gz → 0.2.2.tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Note: this release of linkml-store has been flagged as potentially problematic.
- {linkml_store-0.2.1 → linkml_store-0.2.2}/PKG-INFO +1 -1
- {linkml_store-0.2.1 → linkml_store-0.2.2}/pyproject.toml +1 -1
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/collection.py +2 -1
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/cli.py +37 -9
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/index/indexer.py +21 -17
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/implementations/rag_inference_engine.py +38 -8
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/inference_config.py +4 -2
- linkml_store-0.2.2/src/linkml_store/utils/vector_utils.py +165 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/LICENSE +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/README.md +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/client.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/config.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/queries.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/duckdb/duckdb_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/mongodb/mongodb_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/neo4j/neo4j_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/solr/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/api/types.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/constants.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/graphs/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/graphs/graph_map.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/index/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/index/implementations/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/index/implementations/llm_indexer.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/evaluation.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/implementations/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/implementations/rule_based_inference_engine.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/implementations/sklearn_inference_engine.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/inference_engine.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/inference/inference_engine_registry.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/change_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/file_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/format_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/io.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/llm_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/mongodb_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/neo4j_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/object_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/pandas_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/patch_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/query_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/schema_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/sklearn_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/sql_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/utils/stats_utils.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/html/__init__.py +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/html/base.html.j2 +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
- {linkml_store-0.2.1 → linkml_store-0.2.2}/src/linkml_store/webapi/main.py +0 -0
src/linkml_store/api/collection.py (+2 -1)

@@ -470,6 +470,7 @@ class Collection(Generic[DatabaseType]):
         where: Optional[Any] = None,
         index_name: Optional[str] = None,
         limit: Optional[int] = None,
+        mmr_relevance_factor: Optional[float] = None,
         **kwargs,
     ) -> QueryResult:
         """
@@ -534,7 +535,7 @@ class Collection(Generic[DatabaseType]):
         index_col = ix.index_field
         # TODO: optimize this for large indexes
         vector_pairs = [(row, np.array(row[index_col], dtype=float)) for row in qr.rows]
-        results = ix.search(query, vector_pairs, limit=limit)
+        results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
         for r in results:
             del r[1][index_col]
         new_qr = QueryResult(num_rows=len(results))
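Together these hunks thread the new MMR knob from Collection.search down to the attached indexer. A minimal usage sketch (the collection setup follows the linkml-store tutorial; the alias, example data, and the ranked_rows accessor are assumptions for illustration, not part of this diff):

    from linkml_store import Client

    client = Client()
    db = client.attach_database("duckdb", alias="demo")
    collection = db.create_collection("Country")
    collection.insert([{"name": "France"}, {"name": "Germany"}, {"name": "Japan"}])
    collection.attach_indexer("simple")

    # mmr_relevance_factor=1.0 approximates plain relevance ranking;
    # lower values trade relevance for diversity among the returned rows
    qr = collection.search("European country", limit=2, mmr_relevance_factor=0.8)
    for score, row in qr.ranked_rows:
        print(score, row)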
src/linkml_store/cli.py (+37 -9)

@@ -1,8 +1,9 @@
 import logging
 import sys
 import warnings
+from collections import defaultdict
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Tuple, Any

 import click
 import yaml
@@ -415,14 +416,6 @@ def list_collections(ctx, **kwargs):
 def fq(ctx, where, limit, columns, output_type, wide, output):
     """
     Query facets from the specified collection.
-
-    :param ctx:
-    :param where:
-    :param limit:
-    :param columns:
-    :param output_type:
-    :param output:
-    :return:
     """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None
@@ -488,6 +481,41 @@ def describe(ctx, where, output_type, output, limit):
     write_output(df.describe(include="all").transpose(), output_type, target=output)


+@cli.command()
+@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
+@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
+@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--index", "-I", help="Attributes to index on in pivot")
+@click.option("--columns", "-A", help="Attributes to use as columns in pivot")
+@click.option("--values", "-V", help="Attributes to use as values in pivot")
+@click.pass_context
+def pivot(ctx, where, limit, index, columns, values, output_type, output):
+    collection = ctx.obj["settings"].collection
+    where_clause = yaml.safe_load(where) if where else None
+    column_atts = columns.split(",") if columns else None
+    value_atts = values.split(",") if values else None
+    index_atts = index.split(",") if index else None
+    results = collection.find(where_clause, limit=limit)
+    pivoted = defaultdict(dict)
+    for row in results.rows:
+        index_key = tuple([row.get(att) for att in index_atts])
+        column_key = tuple([row.get(att) for att in column_atts])
+        value_key = tuple([row.get(att) for att in value_atts])
+        pivoted[index_key][column_key] = value_key
+    pivoted_objs = []
+    def detuple(t: Tuple) -> Any:
+        if len(t) == 1:
+            return t[0]
+        return str(t)
+    for index_key, data in pivoted.items():
+        obj = {att: key for att, key in zip(index_atts, index_key)}
+        for column_key, value_key in data.items():
+            obj[detuple(column_key)] = detuple(value_key)
+        pivoted_objs.append(obj)
+    write_output(pivoted_objs, output_type, target=output)
+
+
 @cli.command()
 @click.option("--output-type", "-O", type=format_choice, default=Format.YAML.value, help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
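The new pivot command (invoked as something like `linkml-store -d demo.db -c mydata pivot -I year -A country -V sales`; the flags come from the options above, while the database and attribute names are made up) groups rows into a two-level dict keyed by (index, column) tuples. A self-contained sketch of the same transformation on hypothetical rows:

    from collections import defaultdict
    from typing import Any, Tuple

    rows = [
        {"year": 2023, "country": "FR", "sales": 10},
        {"year": 2023, "country": "DE", "sales": 12},
        {"year": 2024, "country": "FR", "sales": 11},
    ]
    index_atts, column_atts, value_atts = ["year"], ["country"], ["sales"]

    def detuple(t: Tuple) -> Any:
        # single-element tuples collapse to their value; otherwise stringify
        return t[0] if len(t) == 1 else str(t)

    pivoted = defaultdict(dict)
    for row in rows:
        index_key = tuple(row.get(a) for a in index_atts)
        column_key = tuple(row.get(a) for a in column_atts)
        pivoted[index_key][column_key] = tuple(row.get(a) for a in value_atts)

    pivoted_objs = []
    for index_key, data in pivoted.items():
        obj = dict(zip(index_atts, index_key))
        for column_key, value_key in data.items():
            obj[detuple(column_key)] = detuple(value_key)
        pivoted_objs.append(obj)

    print(pivoted_objs)
    # [{'year': 2023, 'FR': 10, 'DE': 12}, {'year': 2024, 'FR': 11}]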
src/linkml_store/index/indexer.py (+21 -17)

@@ -3,6 +3,7 @@ from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple

 import numpy as np
+from linkml_store.utils.vector_utils import pairwise_cosine_similarity, mmr_diversified_search
 from pydantic import BaseModel

 INDEX_ITEM = np.ndarray
@@ -19,20 +20,6 @@ class TemplateSyntaxEnum(str, Enum):
     fstring = "fstring"


-def cosine_similarity(vector1, vector2) -> float:
-    """
-    Calculate the cosine similarity between two vectors
-
-    :param vector1:
-    :param vector2:
-    :return:
-    """
-    dot_product = np.dot(vector1, vector2)
-    norm1 = np.linalg.norm(vector1)
-    norm2 = np.linalg.norm(vector2)
-    return dot_product / (norm1 * norm2)
-
-
 class Indexer(BaseModel):
     """
     An indexer operates on a collection in order to search for objects.
@@ -79,7 +66,7 @@ class Indexer(BaseModel):
     to get a sense of how they work.

     >>> vectors = indexer.objects_to_vectors([{"name": "Aardvark"}, {"name": "Aardwolf"}, {"name": "Zesty"}])
-    >>> assert
+    >>> assert pairwise_cosine_similarity(vectors[0], vectors[1]) > pairwise_cosine_similarity(vectors[0], vectors[2])

     Note you should consult the documentation for the specific indexer you are using for more details on
     how text is converted to vectors.
@@ -167,7 +154,8 @@ class Indexer(BaseModel):
         return str(obj)

     def search(
-        self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None
+        self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None,
+        mmr_relevance_factor: Optional[float] = None
     ) -> List[Tuple[float, Any]]:
         """
         Use the indexer to search against a database of vectors.
@@ -183,13 +171,29 @@ class Indexer(BaseModel):
         # Convert the query string to a vector
         query_vector = self.text_to_vector(query, cache=False)

+        if mmr_relevance_factor is not None:
+            vlist = [v for _, v in vectors]
+            idlist = [id for id, _ in vectors]
+            sorted_indices = mmr_diversified_search(
+                query_vector, vlist,
+                relevance_factor=mmr_relevance_factor, top_n=limit)
+            results = []
+            # TODO: this is inefficient when limit is high
+            for i in range(limit):
+                if i >= len(sorted_indices):
+                    break
+                pos = sorted_indices[i]
+                score = pairwise_cosine_similarity(query_vector, vlist[pos])
+                results.append((score, idlist[pos]))
+            return results
+
         distances = []

         # Iterate over each indexed item
         for item_id, item_vector in vectors:
             # Calculate the Euclidean distance between the query vector and the item vector
             # distance = 1-np.linalg.norm(query_vector - item_vector)
-            distance =
+            distance = pairwise_cosine_similarity(query_vector, item_vector)
             distances.append((distance, item_id))

         # Sort the distances in ascending order
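Indexer.search now has two paths: the original cosine-similarity ranking, and an MMR re-ranking taken whenever mmr_relevance_factor is set (scores in the MMR path are still cosine similarities; only the order changes). A sketch mirroring the doctest above (SimpleIndexer and its name argument are assumptions drawn from elsewhere in the package, not shown in this diff):

    from linkml_store.index.implementations.simple_indexer import SimpleIndexer

    ix = SimpleIndexer(name="simple")
    names = ["Aardvark", "Aardwolf", "Zesty"]
    vectors = ix.objects_to_vectors([{"name": n} for n in names])
    pairs = list(zip(names, vectors))

    # default path: rank purely by cosine similarity to the query
    print(ix.search("Aardvark", pairs, limit=2))
    # MMR path: order comes from mmr_diversified_search, so the
    # near-duplicate "Aardwolf" can be displaced by a more diverse match
    print(ix.search("Aardvark", pairs, limit=2, mmr_relevance_factor=0.5))

Note that the MMR branch iterates over range(limit), so it expects an explicit limit.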
src/linkml_store/inference/implementations/rag_inference_engine.py (+38 -8)

@@ -15,6 +15,10 @@ from linkml_store.utils.object_utils import select_nested

 logger = logging.getLogger(__name__)

+MAX_ITERATIONS = 5
+DEFAULT_NUM_EXAMPLES = 20
+DEFAULT_MMR_RELEVANCE_FACTOR = 0.8
+
 SYSTEM_PROMPT = """
 You are a {llm_config.role}, your task is to inference the YAML
 object output given the YAML object input. I will provide you
@@ -32,6 +36,10 @@ class TrainedModel(BaseModel, extra="forbid"):
     config: Optional[InferenceConfig] = None


+class RAGInference(Inference):
+    iterations: int = 0
+
+
 @dataclass
 class RAGInferenceEngine(InferenceEngine):
     """
@@ -103,7 +111,7 @@ class RAGInferenceEngine(InferenceEngine):
     def object_to_text(self, object: OBJECT) -> str:
         return yaml.dump(object)

-    def derive(self, object: OBJECT) -> Optional[
+    def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[RAGInference]:
         import llm
         from tiktoken import encoding_for_model

@@ -113,15 +121,17 @@ class RAGInferenceEngine(InferenceEngine):
         model_name = self.config.llm_config.model_name
         feature_attributes = self.config.feature_attributes
         target_attributes = self.config.target_attributes
-        num_examples = self.config.llm_config.number_of_few_shot_examples or
+        num_examples = self.config.llm_config.number_of_few_shot_examples or DEFAULT_NUM_EXAMPLES
         query_text = self.object_to_text(object)
+        mmr_relevance_factor = DEFAULT_MMR_RELEVANCE_FACTOR
         if not self.rag_collection:
             # TODO: zero-shot mode
             examples = []
         else:
             if not self.rag_collection.indexers:
                 raise ValueError("RAG collection must have an indexer attached")
-            rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm"
+            rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm",
+                                            mmr_relevance_factor=mmr_relevance_factor)
             examples = rs.rows
             if not examples:
                 raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")
@@ -143,23 +153,43 @@ class RAGInferenceEngine(InferenceEngine):
         )
         prompt_clauses.append(prompt_clause)

-        prompt_end = "---\nQuery:\n" f"## INPUT:\n{query_text}\n" "## OUTPUT:\n"
         system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
+        system_prompt += "\n".join(additional_prompt_texts or [])
+        prompt_end = "---\nQuery:\n" f"## INPUT:\n{query_text}\n" "## OUTPUT:\n"

-        def make_text(texts):
-            return "\n".join(
+        def make_text(texts: List[str]):
+            return "\n".join(texts) + prompt_end

         try:
             encoding = encoding_for_model(model_name)
         except KeyError:
             encoding = encoding_for_model("gpt-4")
         token_limit = get_token_limit(model_name)
-        prompt = render_formatted_text(make_text, prompt_clauses,
+        prompt = render_formatted_text(make_text, values=prompt_clauses,
+                                       encoding=encoding, token_limit=token_limit,
+                                       additional_text=system_prompt)
         logger.info(f"Prompt: {prompt}")
         response = model.prompt(prompt, system_prompt)
         yaml_str = response.text()
         logger.info(f"Response: {yaml_str}")
-
+        predicted_object = self._parse_yaml_payload(yaml_str, strict=True)
+        if self.config.validate_results:
+            base_collection = self.training_data.base_collection
+            errs = list(base_collection.iter_validate_collection([predicted_object]))
+            if errs:
+                print(f"{iteration} // FAILED TO VALIDATE: {yaml_str}")
+                print(f"PARSED: {predicted_object}")
+                print(f"ERRORS: {errs}")
+                if iteration > MAX_ITERATIONS:
+                    raise ValueError(f"Validation errors: {errs}")
+                extra_texts = [
+                    "Make sure results conform to the schema. Previously you provided:\n",
+                    yaml_str,
+                    "\nThis was invalid.\n",
+                    "Validation errors:\n",
+                ] + [self.object_to_text(e) for e in errs]
+                return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
+        return RAGInference(predicted_object=predicted_object, iterations=iteration+1, query=object)

     def _parse_yaml_payload(self, yaml_str: str, strict=False) -> Optional[OBJECT]:
         if "```" in yaml_str:
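With validate_results enabled, derive() now parses the model's YAML, validates the result against the training collection's schema, and on failure re-prompts with the previous answer and the validation errors appended to the system prompt, giving up after MAX_ITERATIONS attempts; the returned RAGInference records how many iterations were used. A hedged configuration sketch (attribute names and model name are made up; it assumes LLMConfig lives alongside InferenceConfig, as the next file suggests):

    from linkml_store.inference.inference_config import InferenceConfig, LLMConfig

    config = InferenceConfig(
        feature_attributes=["country"],
        target_attributes=["capital"],
        llm_config=LLMConfig(model_name="gpt-4o-mini"),
        validate_results=True,  # new in 0.2.2: retry on schema violations
    )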
src/linkml_store/inference/inference_config.py (+4 -2)

@@ -1,5 +1,5 @@
 import logging
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Any

 from pydantic import BaseModel, ConfigDict, Field

@@ -36,6 +36,7 @@ class InferenceConfig(BaseModel, extra="forbid"):
     train_test_split: Optional[Tuple[float, float]] = None
     llm_config: Optional[LLMConfig] = None
     random_seed: Optional[int] = None
+    validate_results: Optional[bool] = None

     @classmethod
     def from_file(cls, file_path: str, format: Optional[Format] = None) -> "InferenceConfig":
@@ -58,6 +59,7 @@ class Inference(BaseModel, extra="forbid"):
     """
     Result of an inference derivation.
     """
-
+    query: Optional[OBJECT] = Field(default=None, description="The query object.")
     predicted_object: OBJECT = Field(..., description="The predicted object.")
     confidence: Optional[float] = Field(default=None, description="The confidence of the prediction.", le=1.0, ge=0.0)
+    explanation: Optional[Any] = Field(default=None, description="Explanation of the prediction.")
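The Inference result model now carries the originating query and an optional explanation alongside the prediction. An illustrative instantiation (all values made up):

    from linkml_store.inference.inference_config import Inference

    result = Inference(
        query={"country": "France"},
        predicted_object={"capital": "Paris"},
        confidence=0.9,
        explanation="derived from retrieved few-shot examples",
    )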
linkml_store-0.2.2/src/linkml_store/utils/vector_utils.py (new file, +165 -0)

@@ -0,0 +1,165 @@
+import logging
+from typing import List, Tuple
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+LOL = List[List[float]]
+
+def pairwise_cosine_similarity(vector1: np.array, vector2: np.array) -> float:
+    """
+    Calculate the cosine similarity between two vectors.
+
+    >>> v100 = np.array([1, 0, 0])
+    >>> v010 = np.array([0, 1, 0])
+    >>> v001 = np.array([0, 0, 1])
+    >>> v011 = np.array([0, 1, 1])
+    >>> pairwise_cosine_similarity(v100, v010)
+    0.0
+    >>> pairwise_cosine_similarity(v100, v001)
+    0.0
+    >>> pairwise_cosine_similarity(v010, v001)
+    0.0
+    >>> pairwise_cosine_similarity(v100, v100)
+    1.0
+    >>> f"{pairwise_cosine_similarity(v010, v011):0.3f}"
+    '0.707'
+
+    :param vector1:
+    :param vector2:
+    :return:
+    """
+    dot_product = np.dot(vector1, vector2)
+    norm1 = np.linalg.norm(vector1)
+    norm2 = np.linalg.norm(vector2)
+    return dot_product / (norm1 * norm2)
+
+
+def compute_cosine_similarity_matrix(list1: LOL, list2: LOL) -> np.ndarray:
+    """
+    Compute cosine similarity between two lists of vectors.
+
+    Result is a two column vector sim[ROW][COL] where ROW is from list1 and COL is from list2.
+
+    :param list1:
+    :param list2:
+    :return:
+    """
+    # Convert lists to numpy arrays
+    matrix1 = np.array(list1)
+    matrix2 = np.array(list2)
+
+    # Normalize the vectors in both matrices
+    matrix1_norm = matrix1 / np.linalg.norm(matrix1, axis=1)[:, np.newaxis]
+    matrix2_norm = matrix2 / np.linalg.norm(matrix2, axis=1)[:, np.newaxis]
+
+    # Compute dot products (resulting in cosine similarity values)
+    cosine_similarity_matrix = np.dot(matrix1_norm, matrix2_norm.T)
+
+    return cosine_similarity_matrix
+
+
+def top_matches(cosine_similarity_matrix: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Find the top match for each row in the cosine similarity matrix.
+
+    :param cosine_similarity_matrix:
+    :return:
+    """
+    # Find the index of the maximum value in each row
+    top_match_indices = np.argmax(cosine_similarity_matrix, axis=1)
+
+    # Find the maximum similarity value in each row
+    top_match_values = np.amax(cosine_similarity_matrix, axis=1)
+
+    return top_match_indices, top_match_values
+
+
+def top_n_matches(
+    cosine_similarity_matrix: np.ndarray, n: int = 10
+) -> Tuple[np.ndarray, np.ndarray]:
+    # Find the indices that would sort each row in descending order
+    sorted_indices = np.argsort(-cosine_similarity_matrix, axis=1)
+
+    # Take the first n indices from the sorted indices to get the top n matches
+    top_n_indices = sorted_indices[:, :n]
+
+    # Take the first n values from the sorted values to get the top n match values
+    top_n_values = -np.sort(-cosine_similarity_matrix, axis=1)[:, :n]
+
+    return top_n_indices, top_n_values
+
+
+def mmr_diversified_search(
+    query_vector: np.ndarray, document_vectors: List[np.ndarray], relevance_factor=0.5, top_n=None
+) -> List[int]:
+    """
+    Perform diversified search using Maximal Marginal Relevance (MMR).
+
+    :param query_vector: The vector representing the query.
+    :param document_vectors: The vectors representing the documents.
+    :param relevance_factor: The balance parameter between relevance and diversity.
+    :param top_n: The number of results to return. If None, return all.
+    :return: A list of indices representing the diversified order of documents.
+    """
+    if top_n is None:
+        # If no specific number of results is specified, return all
+        top_n = len(document_vectors)
+
+    if top_n == 0:
+        return []
+
+    # Calculate cosine similarities between query and all documents
+    norms_query = np.linalg.norm(query_vector)
+    norms_docs = np.linalg.norm(document_vectors, axis=1)
+    similarities = np.dot(document_vectors, query_vector) / (norms_docs * norms_query)
+
+    # Initialize set of selected indices and results list
+    selected_indices = set()
+    result_indices = []
+
+    # Diversified search loop
+    for _ in range(top_n):
+        max_mmr = float("-inf")
+        best_index = None
+
+        # Loop over all documents
+        for idx, _doc_vector in enumerate(document_vectors):
+            if idx not in selected_indices:
+                relevance = relevance_factor * similarities[idx]
+                diversity = 0
+
+                # Penalize based on similarity to already selected documents
+                if selected_indices:
+                    max_sim_to_selected = max(
+                        [
+                            np.dot(document_vectors[idx], document_vectors[s])
+                            / (
+                                np.linalg.norm(document_vectors[idx])
+                                * np.linalg.norm(document_vectors[s])
+                            )
+                            for s in selected_indices
+                        ]
+                    )
+                    diversity = (1 - relevance_factor) * max_sim_to_selected
+
+                mmr_score = relevance - diversity
+
+                # Update best MMR score and index
+                if mmr_score > max_mmr:
+                    max_mmr = mmr_score
+                    best_index = idx
+
+        # Add the best document to the result and mark it as selected
+        if best_index is None:
+            logger.warning(f"No best index found over {len(document_vectors)} documents.")
+            continue
+        result_indices.append(best_index)
+        selected_indices.add(best_index)
+
+    return result_indices
+
+
+
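mmr_diversified_search greedily picks, at each step, the document maximizing relevance_factor * sim(query, doc) minus (1 - relevance_factor) * (max similarity to the documents already selected), so relevance_factor=1.0 reduces to plain similarity ranking. A small self-contained demonstration with toy vectors:

    import numpy as np
    from linkml_store.utils.vector_utils import mmr_diversified_search

    query = np.array([1.0, 0.0])
    docs = [
        np.array([0.9, 0.1]),    # highly relevant
        np.array([0.89, 0.11]),  # near-duplicate of the first
        np.array([0.5, 0.5]),    # less relevant but diverse
    ]

    # pure relevance: the near-duplicate keeps second place
    print(mmr_diversified_search(query, docs, relevance_factor=1.0))  # [0, 1, 2]
    # diversity-weighted: the diverse vector displaces the near-duplicate
    print(mmr_diversified_search(query, docs, relevance_factor=0.3))  # [0, 2, 1]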