linkml-store 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of linkml-store might be problematic.
- linkml_store/api/client.py +19 -13
- linkml_store/api/collection.py +10 -1
- linkml_store/api/config.py +5 -1
- linkml_store/api/stores/filesystem/filesystem_database.py +1 -1
- linkml_store/cli.py +64 -20
- linkml_store/index/implementations/llm_indexer.py +7 -4
- linkml_store/index/indexer.py +21 -17
- linkml_store/inference/implementations/rag_inference_engine.py +38 -8
- linkml_store/inference/inference_config.py +4 -2
- linkml_store/inference/inference_engine.py +4 -2
- linkml_store/utils/format_utils.py +6 -1
- linkml_store/utils/llm_utils.py +8 -3
- linkml_store/utils/vector_utils.py +165 -0
- {linkml_store-0.2.1.dist-info → linkml_store-0.2.4.dist-info}/METADATA +11 -6
- {linkml_store-0.2.1.dist-info → linkml_store-0.2.4.dist-info}/RECORD +18 -17
- {linkml_store-0.2.1.dist-info → linkml_store-0.2.4.dist-info}/WHEEL +1 -1
- {linkml_store-0.2.1.dist-info → linkml_store-0.2.4.dist-info}/LICENSE +0 -0
- {linkml_store-0.2.1.dist-info → linkml_store-0.2.4.dist-info}/entry_points.txt +0 -0
linkml_store/api/client.py
CHANGED

@@ -1,3 +1,4 @@
+import importlib
 import logging
 from pathlib import Path
 from typing import Dict, Optional, Union

@@ -7,23 +8,18 @@ from linkml_runtime import SchemaView

 from linkml_store.api import Database
 from linkml_store.api.config import ClientConfig
-from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase
-from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
-from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
-from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
-from linkml_store.api.stores.neo4j.neo4j_database import Neo4jDatabase
-from linkml_store.api.stores.solr.solr_database import SolrDatabase

 logger = logging.getLogger(__name__)


+
 HANDLE_MAP = {
-    "duckdb": DuckDBDatabase,
-    "solr": SolrDatabase,
-    "mongodb": MongoDBDatabase,
-    "chromadb": ChromaDBDatabase,
-    "neo4j": Neo4jDatabase,
-    "file": FileSystemDatabase,
+    "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
+    "solr": "linkml_store.api.stores.solr.solr_database.SolrDatabase",
+    "mongodb": "linkml_store.api.stores.mongodb.mongodb_database.MongoDBDatabase",
+    "chromadb": "linkml_store.api.stores.chromadb.chromadb_database.ChromaDBDatabase",
+    "neo4j": "linkml_store.api.stores.neo4j.neo4j_database.Neo4jDatabase",
+    "file": "linkml_store.api.stores.filesystem.filesystem_database.FileSystemDatabase",
 }

@@ -155,6 +151,9 @@ class Client:
         if auto_attach:
             db = self.attach_database(handle, alias=name, **kwargs)
             db.from_config(db_config)
+            if db_config.source:
+                db = self.get_database(name)
+                db.store(db_config.source.data)

     def _set_database_config(self, db: Database):
         """

@@ -207,7 +206,14 @@ class Client:
         scheme, _ = handle.split(":", 1)
         if scheme not in HANDLE_MAP:
             raise ValueError(f"Unknown scheme: {scheme}")
-
+        module_path, class_name = HANDLE_MAP[scheme].rsplit('.', 1)
+        try:
+            module = importlib.import_module(module_path)
+            cls = getattr(module, class_name)
+        except ImportError as e:
+            raise ImportError(f"Failed to import {scheme} database. Make sure the correct extras are installed: {e}")
+
+        #cls = HANDLE_MAP[scheme]
         db = cls(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs)
         if schema_view:
             db.set_schema_view(schema_view)
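The substantive change here is that `HANDLE_MAP` now holds dotted import paths instead of eagerly imported classes, resolved on demand with `importlib`, so importing the client no longer fails when an optional backend (e.g. chromadb or neo4j) is not installed. A minimal standalone sketch of the same lazy-resolution pattern (illustration only, not code from the package):

import importlib

def resolve_class(dotted_path: str):
    # "pkg.mod.Class" -> import pkg.mod on demand, then fetch Class from it
    module_path, class_name = dotted_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

DuckDBDatabase = resolve_class("linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase")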
linkml_store/api/collection.py
CHANGED

@@ -470,6 +470,8 @@ class Collection(Generic[DatabaseType]):
         where: Optional[Any] = None,
         index_name: Optional[str] = None,
         limit: Optional[int] = None,
+        select_cols: Optional[List[str]] = None,
+        mmr_relevance_factor: Optional[float] = None,
         **kwargs,
     ) -> QueryResult:
         """

@@ -502,6 +504,7 @@ class Collection(Generic[DatabaseType]):
         :param where:
         :param index_name:
         :param limit:
+        :param select_cols:
         :param kwargs:
         :return:
         """

@@ -534,9 +537,14 @@ class Collection(Generic[DatabaseType]):
         index_col = ix.index_field
         # TODO: optimize this for large indexes
         vector_pairs = [(row, np.array(row[index_col], dtype=float)) for row in qr.rows]
-        results = ix.search(query, vector_pairs, limit=limit)
+        results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
         for r in results:
             del r[1][index_col]
+        if select_cols:
+            new_results = []
+            for r in results:
+                new_results.append((r[0], {k: v for k, v in r[1].items() if k in select_cols}))
+            results = new_results
         new_qr = QueryResult(num_rows=len(results))
         new_qr.ranked_rows = results
         new_qr.rows = [r[1] for r in results]

@@ -671,6 +679,7 @@ class Collection(Generic[DatabaseType]):
         """
         yield from self.find({}, limit=-1).rows

+    @property
     def rows(self) -> List[OBJECT]:
         """
         Return a list of objects in the collection.
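Taken together, the new parameters let callers project search hits onto a subset of columns and request diversified ranking. A hedged usage sketch (assumes a collection with an attached indexer; parameter names are taken from the diff):

result = collection.search(
    "aardvark",
    limit=5,
    select_cols=["name", "description"],   # keep only these keys in each hit
    mmr_relevance_factor=0.8,              # enable MMR: 1.0 is pure relevance, lower is more diverse
)
for score, obj in result.ranked_rows:
    print(score, obj)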
linkml_store/api/config.py
CHANGED

@@ -91,7 +91,7 @@ class CollectionConfig(ConfiguredBaseModel):
     )
     source: Optional[CollectionSource] = Field(
         default=None,
-        description="
+        description="Source for the collection",
     )
     derived_from: Optional[List[DerivationConfiguration]] = Field(
         default=None,

@@ -154,6 +154,10 @@ class DatabaseConfig(ConfiguredBaseModel):
         default=False,
         description="Whether to ensure referential integrity",
     )
+    source: Optional[CollectionSource] = Field(
+        default=None,
+        description="Source for the database",
+    )


 class ClientConfig(ConfiguredBaseModel):

linkml_store/api/stores/filesystem/filesystem_database.py
CHANGED

@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Optional

 import yaml
-from
+from linkml_runtime.utils.schema_builder import SchemaBuilder
 from linkml_runtime import SchemaView

 from linkml_store.api import Database
linkml_store/cli.py
CHANGED

@@ -1,8 +1,9 @@
 import logging
 import sys
 import warnings
+from collections import defaultdict
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Tuple, Any

 import click
 import yaml

@@ -134,12 +135,17 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
         logger.setLevel(logging.ERROR)
     ctx.ensure_object(dict)
     if input:
-
-
-
+        database = "duckdb"  # default: store in duckdb
+        if input.startswith("http"):
+            parts = input.split("/")
+            collection = parts[-1]
+            collection = collection.split(".")[0]
+        else:
+            stem = underscore(Path(input).stem)
+            collection = stem
+        logger.info(f"Using input file: {input}, "
+                    f"default storage is {database} and collection is {collection}")
         config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}})
-        # collection = Path(input).stem
-        # database = f"file:{Path(input).parent}"
     if config is None and DEFAULT_LOCAL_CONF_PATH.exists():
         config = DEFAULT_LOCAL_CONF_PATH
     if config is None and DEFAULT_GLOBAL_CONF_PATH.exists():

@@ -177,10 +183,11 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,

 @cli.command()
 @click.argument("files", type=click.Path(exists=True), nargs=-1)
+@click.option("--replace/--no-replace", default=False, show_default=True, help="Replace existing objects")
 @click.option("--format", "-f", type=format_choice, help="Input format")
 @click.option("--object", "-i", multiple=True, help="Input object as YAML")
 @click.pass_context
-def insert(ctx, files, object, format):
+def insert(ctx, files, replace, object, format):
     """Insert objects from files (JSON, YAML, TSV) into the specified collection.

     Using a configuration:

@@ -194,7 +201,6 @@ def insert(ctx, files, object, format):
     collection = settings.collection
     if not collection:
         raise ValueError("Collection must be specified.")
-    objects = []
     if not files and not object:
         files = ["-"]
     for file_path in files:

@@ -203,13 +209,19 @@ def insert(ctx, files, object, format):
         else:
             objects = load_objects(file_path)
         logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
-
+        if replace:
+            collection.replace(objects)
+        else:
+            collection.insert(objects)
         click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
     if object:
         for object_str in object:
             logger.info(f"Parsing: {object_str}")
             objects = yaml.safe_load(object_str)
-
+            if replace:
+                collection.replace(objects)
+            else:
+                collection.insert(objects)
             click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.alias}'.")
     collection.commit()

@@ -415,14 +427,6 @@ def list_collections(ctx, **kwargs):
 def fq(ctx, where, limit, columns, output_type, wide, output):
     """
     Query facets from the specified collection.
-
-    :param ctx:
-    :param where:
-    :param limit:
-    :param columns:
-    :param output_type:
-    :param output:
-    :return:
     """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None

@@ -488,6 +492,41 @@ def describe(ctx, where, output_type, output, limit):
     write_output(df.describe(include="all").transpose(), output_type, target=output)


+@cli.command()
+@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
+@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
+@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--index", "-I", help="Attributes to index on in pivot")
+@click.option("--columns", "-A", help="Attributes to use as columns in pivot")
+@click.option("--values", "-V", help="Attributes to use as values in pivot")
+@click.pass_context
+def pivot(ctx, where, limit, index, columns, values, output_type, output):
+    collection = ctx.obj["settings"].collection
+    where_clause = yaml.safe_load(where) if where else None
+    column_atts = columns.split(",") if columns else None
+    value_atts = values.split(",") if values else None
+    index_atts = index.split(",") if index else None
+    results = collection.find(where_clause, limit=limit)
+    pivoted = defaultdict(dict)
+    for row in results.rows:
+        index_key = tuple([row.get(att) for att in index_atts])
+        column_key = tuple([row.get(att) for att in column_atts])
+        value_key = tuple([row.get(att) for att in value_atts])
+        pivoted[index_key][column_key] = value_key
+    pivoted_objs = []
+    def detuple(t: Tuple) -> Any:
+        if len(t) == 1:
+            return t[0]
+        return str(t)
+    for index_key, data in pivoted.items():
+        obj = {att: key for att, key in zip(index_atts, index_key)}
+        for column_key, value_key in data.items():
+            obj[detuple(column_key)] = detuple(value_key)
+        pivoted_objs.append(obj)
+    write_output(pivoted_objs, output_type, target=output)


 @cli.command()
 @click.option("--output-type", "-O", type=format_choice, default=Format.YAML.value, help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")

@@ -506,10 +545,12 @@ def describe(ctx, where, output_type, output, limit):
 @click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
 @click.option("--evaluation-match-function", help="Name of function to use for matching objects in eval")
 @click.option("--query", "-q", type=click.STRING, help="query term")
+@click.option("--where", "-w", type=click.STRING, help="query term")
 @click.pass_context
 def infer(
     ctx,
     inference_config_file,
+    where,
     query,
     evaluation_count,
     evaluation_match_function,

@@ -551,6 +592,7 @@ def infer(
     linkml-store -i tests/input/iris.csv inference -t sklearn \
       -q '{"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2}'
     """
+    where_clause = yaml.safe_load(where) if where else None
     if query:
         query_obj = yaml.safe_load(query)
     else:

@@ -653,6 +695,7 @@ def schema(ctx, output_type, output):
 @cli.command()
 @click.argument("search_term")
 @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the search")
+@click.option("--select", "-s", type=click.STRING, help="SELECT clause for the query, as YAML")
 @click.option("--limit", "-l", type=click.INT, help="Maximum number of search results")
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")

@@ -661,13 +704,14 @@ def schema(ctx, output_type, output):
 )
 @index_type_option
 @click.pass_context
-def search(ctx, search_term, where, limit, index_type, output_type, output, auto_index):
+def search(ctx, search_term, where, select, limit, index_type, output_type, output, auto_index):
     """Search objects in the specified collection."""
     collection = ctx.obj["settings"].collection
     ix = get_indexer(index_type)
     logger.info(f"Attaching index to collection {collection.alias}: {ix.model_dump()}")
     collection.attach_indexer(ix, auto_index=auto_index)
-
+    select_cols = yaml.safe_load(select) if select else None
+    result = collection.search(search_term, where=where, select_cols=select_cols, limit=limit)
     output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
     if output:
         with open(output, "w") as f:
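The core of the new pivot command is the defaultdict-of-dicts accumulation plus detuple() collapsing single-element tuples back to scalars. A self-contained sketch of the same logic on hypothetical rows (mimicking what collection.find(...).rows returns):

from collections import defaultdict

rows = [
    {"species": "setosa", "trait": "petal_length", "value": 1.4},
    {"species": "setosa", "trait": "petal_width", "value": 0.2},
    {"species": "virginica", "trait": "petal_length", "value": 5.5},
]
pivoted = defaultdict(dict)
for row in rows:
    # index attribute -> column attribute -> value attribute
    pivoted[(row["species"],)][(row["trait"],)] = (row["value"],)
objs = [
    {"species": idx[0], **{col[0]: val[0] for col, val in cols.items()}}
    for idx, cols in pivoted.items()
]
# objs == [{"species": "setosa", "petal_length": 1.4, "petal_width": 0.2},
#          {"species": "virginica", "petal_length": 5.5}]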
linkml_store/index/implementations/llm_indexer.py
CHANGED

@@ -3,7 +3,6 @@ from pathlib import Path
 from typing import TYPE_CHECKING, List, Optional

 import numpy as np
-from tiktoken import encoding_for_model

 from linkml_store.api.config import CollectionConfig
 from linkml_store.index.indexer import INDEX_ITEM, Indexer

@@ -55,7 +54,7 @@ class LLMIndexer(Indexer):

     def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> List[INDEX_ITEM]:
         """
-        Use LLM to embed
+        Use LLM to embed.

         >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
         >>> vectors = indexer.texts_to_vectors(["hello", "goodbye"])

@@ -63,20 +62,24 @@ class LLMIndexer(Indexer):
         :param texts:
         :return:
         """
+        from tiktoken import encoding_for_model
         logging.info(f"Converting {len(texts)} texts to vectors")
         model = self.embedding_model
-
+        # TODO: make this more accurate
+        token_limit = get_token_limit(model.model_id) - 200
         encoding = encoding_for_model("gpt-4o")

         def truncate_text(text: str) -> str:
             # split into tokens every 1000 chars:
             parts = [text[i : i + 1000] for i in range(0, len(text), 1000)]
-
+            truncated = render_formatted_text(
                 lambda x: "".join(x),
                 parts,
                 encoding,
                 token_limit,
             )
+            logger.debug(f"Truncated text from {len(text)} to {len(truncated)}")
+            return truncated

         texts = [truncate_text(text) for text in texts]
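The tiktoken import moves inside texts_to_vectors, so the dependency is only required when embeddings are actually computed, and each text is now truncated against the embedding model's token limit (minus a 200-token margin). For orientation, a hedged sketch of token-aware truncation with tiktoken; this is a simpler one-shot variant, not the chunk-dropping that render_formatted_text performs:

import tiktoken

def truncate_to_budget(text: str, budget: int, model: str = "gpt-4o") -> str:
    # Encode once, keep at most `budget` tokens, decode back to text.
    enc = tiktoken.encoding_for_model(model)
    tokens = enc.encode(text)
    return enc.decode(tokens[:budget])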
linkml_store/index/indexer.py
CHANGED

@@ -3,6 +3,7 @@ from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple

 import numpy as np
+from linkml_store.utils.vector_utils import pairwise_cosine_similarity, mmr_diversified_search
 from pydantic import BaseModel

 INDEX_ITEM = np.ndarray

@@ -19,20 +20,6 @@ class TemplateSyntaxEnum(str, Enum):
     fstring = "fstring"


-def cosine_similarity(vector1, vector2) -> float:
-    """
-    Calculate the cosine similarity between two vectors
-
-    :param vector1:
-    :param vector2:
-    :return:
-    """
-    dot_product = np.dot(vector1, vector2)
-    norm1 = np.linalg.norm(vector1)
-    norm2 = np.linalg.norm(vector2)
-    return dot_product / (norm1 * norm2)
-
-
 class Indexer(BaseModel):
     """
     An indexer operates on a collection in order to search for objects.

@@ -79,7 +66,7 @@ class Indexer(BaseModel):
     to get a sense of how they work.

     >>> vectors = indexer.objects_to_vectors([{"name": "Aardvark"}, {"name": "Aardwolf"}, {"name": "Zesty"}])
-    >>> assert
+    >>> assert pairwise_cosine_similarity(vectors[0], vectors[1]) > pairwise_cosine_similarity(vectors[0], vectors[2])

     Note you should consult the documentation for the specific indexer you are using for more details on
     how text is converted to vectors.

@@ -167,7 +154,8 @@ class Indexer(BaseModel):
         return str(obj)

     def search(
-        self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None
+        self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None,
+        mmr_relevance_factor: Optional[float] = None
     ) -> List[Tuple[float, Any]]:
         """
         Use the indexer to search against a database of vectors.

@@ -183,13 +171,29 @@ class Indexer(BaseModel):
         # Convert the query string to a vector
         query_vector = self.text_to_vector(query, cache=False)

+        if mmr_relevance_factor is not None:
+            vlist = [v for _, v in vectors]
+            idlist = [id for id, _ in vectors]
+            sorted_indices = mmr_diversified_search(
+                query_vector, vlist,
+                relevance_factor=mmr_relevance_factor, top_n=limit)
+            results = []
+            # TODO: this is inefficient when limit is high
+            for i in range(limit):
+                if i >= len(sorted_indices):
+                    break
+                pos = sorted_indices[i]
+                score = pairwise_cosine_similarity(query_vector, vlist[pos])
+                results.append((score, idlist[pos]))
+            return results
+
         distances = []

         # Iterate over each indexed item
         for item_id, item_vector in vectors:
             # Calculate the Euclidean distance between the query vector and the item vector
             # distance = 1-np.linalg.norm(query_vector - item_vector)
-            distance =
+            distance = pairwise_cosine_similarity(query_vector, item_vector)
             distances.append((distance, item_id))

         # Sort the distances in ascending order
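When mmr_relevance_factor is given, search() delegates ranking to mmr_diversified_search and then re-scores the returned indices with cosine similarity. A small worked example of the underlying function on toy vectors (the ordering follows from the MMR trade-off, not from raw similarity):

import numpy as np
from linkml_store.utils.vector_utils import mmr_diversified_search, pairwise_cosine_similarity

query = np.array([1.0, 0.0])
docs = [np.array([0.9, 0.1]), np.array([0.91, 0.09]), np.array([0.1, 0.9])]
# With a low relevance factor, the near-duplicate of the first pick is
# penalized heavily, so the dissimilar third document is ranked second.
order = mmr_diversified_search(query, docs, relevance_factor=0.2, top_n=2)
print(order)  # expected: [1, 2]
print([round(float(pairwise_cosine_similarity(query, docs[i])), 3) for i in order])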
linkml_store/inference/implementations/rag_inference_engine.py
CHANGED

@@ -15,6 +15,10 @@ from linkml_store.utils.object_utils import select_nested

 logger = logging.getLogger(__name__)

+MAX_ITERATIONS = 5
+DEFAULT_NUM_EXAMPLES = 20
+DEFAULT_MMR_RELEVANCE_FACTOR = 0.8
+
 SYSTEM_PROMPT = """
 You are a {llm_config.role}, your task is to inference the YAML
 object output given the YAML object input. I will provide you

@@ -32,6 +36,10 @@ class TrainedModel(BaseModel, extra="forbid"):
     config: Optional[InferenceConfig] = None


+class RAGInference(Inference):
+    iterations: int = 0
+
+
 @dataclass
 class RAGInferenceEngine(InferenceEngine):
     """

@@ -103,7 +111,7 @@ class RAGInferenceEngine(InferenceEngine):
     def object_to_text(self, object: OBJECT) -> str:
         return yaml.dump(object)

-    def derive(self, object: OBJECT) -> Optional[
+    def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[RAGInference]:
         import llm
         from tiktoken import encoding_for_model

@@ -113,15 +121,17 @@ class RAGInferenceEngine(InferenceEngine):
         model_name = self.config.llm_config.model_name
         feature_attributes = self.config.feature_attributes
         target_attributes = self.config.target_attributes
-        num_examples = self.config.llm_config.number_of_few_shot_examples or
+        num_examples = self.config.llm_config.number_of_few_shot_examples or DEFAULT_NUM_EXAMPLES
         query_text = self.object_to_text(object)
+        mmr_relevance_factor = DEFAULT_MMR_RELEVANCE_FACTOR
         if not self.rag_collection:
             # TODO: zero-shot mode
             examples = []
         else:
             if not self.rag_collection.indexers:
                 raise ValueError("RAG collection must have an indexer attached")
-            rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm"
+            rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm",
+                                            mmr_relevance_factor=mmr_relevance_factor)
             examples = rs.rows
             if not examples:
                 raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")

@@ -143,23 +153,43 @@ class RAGInferenceEngine(InferenceEngine):
         )
         prompt_clauses.append(prompt_clause)

-        prompt_end = "---\nQuery:\n" f"## INPUT:\n{query_text}\n" "## OUTPUT:\n"
         system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
+        system_prompt += "\n".join(additional_prompt_texts or [])
+        prompt_end = "---\nQuery:\n" f"## INPUT:\n{query_text}\n" "## OUTPUT:\n"

-        def make_text(texts):
-            return "\n".join(
+        def make_text(texts: List[str]):
+            return "\n".join(texts) + prompt_end

         try:
             encoding = encoding_for_model(model_name)
         except KeyError:
             encoding = encoding_for_model("gpt-4")
         token_limit = get_token_limit(model_name)
-        prompt = render_formatted_text(make_text, prompt_clauses,
+        prompt = render_formatted_text(make_text, values=prompt_clauses,
+                                       encoding=encoding, token_limit=token_limit,
+                                       additional_text=system_prompt)
         logger.info(f"Prompt: {prompt}")
         response = model.prompt(prompt, system_prompt)
         yaml_str = response.text()
         logger.info(f"Response: {yaml_str}")
-
+        predicted_object = self._parse_yaml_payload(yaml_str, strict=True)
+        if self.config.validate_results:
+            base_collection = self.training_data.base_collection
+            errs = list(base_collection.iter_validate_collection([predicted_object]))
+            if errs:
+                print(f"{iteration} // FAILED TO VALIDATE: {yaml_str}")
+                print(f"PARSED: {predicted_object}")
+                print(f"ERRORS: {errs}")
+                if iteration > MAX_ITERATIONS:
+                    raise ValueError(f"Validation errors: {errs}")
+                extra_texts = [
+                    "Make sure results conform to the schema. Previously you provided:\n",
+                    yaml_str,
+                    "\nThis was invalid.\n",
+                    "Validation errors:\n",
+                ] + [self.object_to_text(e) for e in errs]
+                return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
+        return RAGInference(predicted_object=predicted_object, iterations=iteration+1, query=object)

     def _parse_yaml_payload(self, yaml_str: str, strict=False) -> Optional[OBJECT]:
         if "```" in yaml_str:
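The enlarged derive() now implements a bounded self-repair loop: parse the model's YAML, validate it against the collection's schema, and on failure re-prompt with the invalid output and its validation errors appended, up to MAX_ITERATIONS attempts. Reduced to a sketch with hypothetical generate/validate callables (not the package's API):

def derive_with_repair(generate, validate, obj, max_iterations=5):
    feedback = []
    for iteration in range(max_iterations + 1):
        candidate = generate(obj, feedback)
        errors = validate(candidate)
        if not errors:
            return candidate, iteration + 1
        # Feed the failed output and its errors back into the next prompt.
        feedback = ["Previously you provided:", str(candidate),
                    "This was invalid. Validation errors:"] + [str(e) for e in errors]
    raise ValueError(f"Still invalid after {max_iterations} retries: {errors}")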
linkml_store/inference/inference_config.py
CHANGED

@@ -1,5 +1,5 @@
 import logging
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Any

 from pydantic import BaseModel, ConfigDict, Field

@@ -36,6 +36,7 @@ class InferenceConfig(BaseModel, extra="forbid"):
     train_test_split: Optional[Tuple[float, float]] = None
     llm_config: Optional[LLMConfig] = None
     random_seed: Optional[int] = None
+    validate_results: Optional[bool] = None

     @classmethod
     def from_file(cls, file_path: str, format: Optional[Format] = None) -> "InferenceConfig":

@@ -58,6 +59,7 @@ class Inference(BaseModel, extra="forbid"):
     """
     Result of an inference derivation.
     """
-
+    query: Optional[OBJECT] = Field(default=None, description="The query object.")
     predicted_object: OBJECT = Field(..., description="The predicted object.")
     confidence: Optional[float] = Field(default=None, description="The confidence of the prediction.", le=1.0, ge=0.0)
+    explanation: Optional[Any] = Field(default=None, description="Explanation of the prediction.")
linkml_store/inference/inference_engine.py
CHANGED

@@ -4,7 +4,7 @@ from abc import ABC
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
-from typing import Optional, TextIO, Tuple, Union
+from typing import Optional, TextIO, Tuple, Union, Any

 import pandas as pd
 from pydantic import BaseModel, ConfigDict

@@ -67,13 +67,14 @@ class CollectionSlice(BaseModel):
     # slice: Tuple[Optional[int], Optional[int]] = Field(default=(None, None))
     indices: Optional[Tuple[int, ...]] = None
     _collection: Optional[Collection] = None
+    where: Any = None

     @property
     def collection(self) -> Collection:
         if not self._collection and not self.indices:
             return self.base_collection
         if not self._collection:
-            rows = self.base_collection.
+            rows = self.base_collection.rows
             subset = [rows[i] for i in self.indices]
             db = self.base_collection.parent
             subset_name = self.slice_alias

@@ -94,6 +95,7 @@ class CollectionSlice(BaseModel):
         """
         Return the slice of the collection as a dataframe.

+        :param flattened: flattned nested objects to give keys like foo.bar
         :return:
         """
         rs = self.collection.find({}, limit=-1)
linkml_store/utils/format_utils.py
CHANGED

@@ -12,9 +12,9 @@ from typing import IO, Any, Dict, List, Optional, TextIO, Type, Union

 import pandas as pd
 import pystow
+import xmltodict
 import yaml
 from pydantic import BaseModel
-from tabulate import tabulate

 logger = logging.getLogger(__name__)

@@ -30,6 +30,7 @@ class Format(Enum):
     YAMLL = "yamll"
     TSV = "tsv"
     CSV = "csv"
+    XML = "xml"
     PYTHON = "python"
     PARQUET = "parquet"
     FORMATTED = "formatted"

@@ -50,6 +51,7 @@ class Format(Enum):
     ".yamll": cls.YAMLL,
     ".tsv": cls.TSV,
     ".csv": cls.CSV,
+    ".xml": cls.XML,
     ".py": cls.PYTHON,
     ".parquet": cls.PARQUET,
     ".pq": cls.PARQUET,

@@ -124,6 +126,8 @@ def process_file(
         delimiter = "\t" if format == Format.TSV else ","
         reader = csv.DictReader(f, delimiter=delimiter)
         objs = list(reader)
+    elif format == Format.XML:
+        objs = xmltodict.parse(f.read())
     elif format == Format.PARQUET:
         import pyarrow.parquet as pq

@@ -284,6 +288,7 @@ def render_output(
     elif format == Format.PYTHON:
         return str(data)
     elif format == Format.TABLE:
+        from tabulate import tabulate
         return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
     elif format == Format.YAML:
         if isinstance(data, list):
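With the new XML branch, a .xml input is parsed into nested dicts via xmltodict. For reference, xmltodict's default mapping: element text becomes values, attributes get an "@" prefix:

import xmltodict

doc = xmltodict.parse("<persons><person id='1'><name>Ann</name></person></persons>")
# roughly: {'persons': {'person': {'@id': '1', 'name': 'Ann'}}}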
linkml_store/utils/llm_utils.py
CHANGED

@@ -1,6 +1,10 @@
-
+import logging
+from typing import Callable, List, Optional, TYPE_CHECKING

-
+if TYPE_CHECKING:
+    import tiktoken
+
+logger = logging.getLogger(__name__)

 MODEL_TOKEN_MAPPING = {
     "gpt-4o-mini": 128_000,

@@ -40,7 +44,7 @@ MODEL_TOKEN_MAPPING = {
 def render_formatted_text(
     render_func: Callable,
     values: List[str],
-    encoding: Encoding,
+    encoding: "tiktoken.Encoding",
     token_limit: int,
     additional_text: Optional[str] = None,
 ) -> str:

@@ -67,6 +71,7 @@ def render_formatted_text(
     if additional_text:
         token_limit -= len(encoding.encode(additional_text))
     text_length = len(encoding.encode(text))
+    logger.debug(f"Encoding length: {text_length} (original: {len(text)})")
     if text_length <= token_limit:
         return text
     if not values:
linkml_store/utils/vector_utils.py
ADDED

@@ -0,0 +1,165 @@
+import logging
+from typing import List, Tuple
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+LOL = List[List[float]]
+
+def pairwise_cosine_similarity(vector1: np.array, vector2: np.array) -> float:
+    """
+    Calculate the cosine similarity between two vectors.
+
+    >>> v100 = np.array([1, 0, 0])
+    >>> v010 = np.array([0, 1, 0])
+    >>> v001 = np.array([0, 0, 1])
+    >>> v011 = np.array([0, 1, 1])
+    >>> pairwise_cosine_similarity(v100, v010)
+    0.0
+    >>> pairwise_cosine_similarity(v100, v001)
+    0.0
+    >>> pairwise_cosine_similarity(v010, v001)
+    0.0
+    >>> pairwise_cosine_similarity(v100, v100)
+    1.0
+    >>> f"{pairwise_cosine_similarity(v010, v011):0.3f}"
+    '0.707'
+
+    :param vector1:
+    :param vector2:
+    :return:
+    """
+    dot_product = np.dot(vector1, vector2)
+    norm1 = np.linalg.norm(vector1)
+    norm2 = np.linalg.norm(vector2)
+    return dot_product / (norm1 * norm2)
+
+
+def compute_cosine_similarity_matrix(list1: LOL, list2: LOL) -> np.ndarray:
+    """
+    Compute cosine similarity between two lists of vectors.
+
+    Result is a two column vector sim[ROW][COL] where ROW is from list1 and COL is from list2.
+
+    :param list1:
+    :param list2:
+    :return:
+    """
+    # Convert lists to numpy arrays
+    matrix1 = np.array(list1)
+    matrix2 = np.array(list2)
+
+    # Normalize the vectors in both matrices
+    matrix1_norm = matrix1 / np.linalg.norm(matrix1, axis=1)[:, np.newaxis]
+    matrix2_norm = matrix2 / np.linalg.norm(matrix2, axis=1)[:, np.newaxis]
+
+    # Compute dot products (resulting in cosine similarity values)
+    cosine_similarity_matrix = np.dot(matrix1_norm, matrix2_norm.T)
+
+    return cosine_similarity_matrix
+
+
+def top_matches(cosine_similarity_matrix: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Find the top match for each row in the cosine similarity matrix.
+
+    :param cosine_similarity_matrix:
+    :return:
+    """
+    # Find the index of the maximum value in each row
+    top_match_indices = np.argmax(cosine_similarity_matrix, axis=1)
+
+    # Find the maximum similarity value in each row
+    top_match_values = np.amax(cosine_similarity_matrix, axis=1)
+
+    return top_match_indices, top_match_values
+
+
+def top_n_matches(
+    cosine_similarity_matrix: np.ndarray, n: int = 10
+) -> Tuple[np.ndarray, np.ndarray]:
+    # Find the indices that would sort each row in descending order
+    sorted_indices = np.argsort(-cosine_similarity_matrix, axis=1)
+
+    # Take the first n indices from the sorted indices to get the top n matches
+    top_n_indices = sorted_indices[:, :n]
+
+    # Take the first n values from the sorted values to get the top n match values
+    top_n_values = -np.sort(-cosine_similarity_matrix, axis=1)[:, :n]
+
+    return top_n_indices, top_n_values
+
+
+def mmr_diversified_search(
+    query_vector: np.ndarray, document_vectors: List[np.ndarray], relevance_factor=0.5, top_n=None
+) -> List[int]:
+    """
+    Perform diversified search using Maximal Marginal Relevance (MMR).
+
+    :param query_vector: The vector representing the query.
+    :param document_vectors: The vectors representing the documents.
+    :param relevance_factor: The balance parameter between relevance and diversity.
+    :param top_n: The number of results to return. If None, return all.
+    :return: A list of indices representing the diversified order of documents.
+    """
+    if top_n is None:
+        # If no specific number of results is specified, return all
+        top_n = len(document_vectors)
+
+    if top_n == 0:
+        return []
+
+    # Calculate cosine similarities between query and all documents
+    norms_query = np.linalg.norm(query_vector)
+    norms_docs = np.linalg.norm(document_vectors, axis=1)
+    similarities = np.dot(document_vectors, query_vector) / (norms_docs * norms_query)
+
+    # Initialize set of selected indices and results list
+    selected_indices = set()
+    result_indices = []
+
+    # Diversified search loop
+    for _ in range(top_n):
+        max_mmr = float("-inf")
+        best_index = None
+
+        # Loop over all documents
+        for idx, _doc_vector in enumerate(document_vectors):
+            if idx not in selected_indices:
+                relevance = relevance_factor * similarities[idx]
+                diversity = 0
+
+                # Penalize based on similarity to already selected documents
+                if selected_indices:
+                    max_sim_to_selected = max(
+                        [
+                            np.dot(document_vectors[idx], document_vectors[s])
+                            / (
+                                np.linalg.norm(document_vectors[idx])
+                                * np.linalg.norm(document_vectors[s])
+                            )
+                            for s in selected_indices
+                        ]
+                    )
+                    diversity = (1 - relevance_factor) * max_sim_to_selected
+
+                mmr_score = relevance - diversity
+
+                # Update best MMR score and index
+                if mmr_score > max_mmr:
+                    max_mmr = mmr_score
+                    best_index = idx
+
+        # Add the best document to the result and mark it as selected
+        if best_index is None:
+            logger.warning(f"No best index found over {len(document_vectors)} documents.")
+            continue
+        result_indices.append(best_index)
+        selected_indices.add(best_index)
+
+    return result_indices
+
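For reference, the quantity mmr_diversified_search maximizes at each step is the standard Maximal Marginal Relevance objective. With balance parameter λ (relevance_factor), query q, candidate document d, and already-selected set S:

    MMR(d) = λ · cos(q, d) − (1 − λ) · max_{s ∈ S} cos(d, s)

λ = 1 reduces to plain cosine ranking; smaller λ increasingly penalizes candidates that resemble documents already chosen. Note the implementation recomputes pairwise similarities inside the selection loop, so selecting k of n documents costs O(k·n) similarity evaluations in the worst case.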
{linkml_store-0.2.1.dist-info → linkml_store-0.2.4.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.3
 Name: linkml-store
-Version: 0.2.
+Version: 0.2.4
 Summary: linkml-store
 License: MIT
 Author: Author 1

@@ -12,9 +12,11 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Provides-Extra: all
 Provides-Extra: analytics
 Provides-Extra: app
-Provides-Extra:
+Provides-Extra: bigquery
 Provides-Extra: fastapi
 Provides-Extra: frictionless
 Provides-Extra: h5py

@@ -29,25 +31,26 @@ Provides-Extra: scipy
 Provides-Extra: tests
 Provides-Extra: validation
 Requires-Dist: black (>=24.0.0) ; extra == "tests"
-Requires-Dist: chromadb ; extra == "chromadb"
 Requires-Dist: click
 Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: duckdb-engine (>=0.11.2)
 Requires-Dist: fastapi ; extra == "fastapi"
 Requires-Dist: frictionless ; extra == "frictionless"
 Requires-Dist: gcsfs ; extra == "ibis"
+Requires-Dist: google-cloud-bigquery ; extra == "bigquery"
 Requires-Dist: h5py ; extra == "h5py"
 Requires-Dist: ibis-framework[duckdb,examples] (>=9.3.0) ; extra == "ibis"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
+Requires-Dist: jsonpatch (>=1.33,<2.0)
 Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
 Requires-Dist: linkml-runtime (>=1.8.0)
 Requires-Dist: linkml_map ; extra == "map"
 Requires-Dist: linkml_renderer ; extra == "renderer"
-Requires-Dist: llm ; extra == "llm"
+Requires-Dist: llm ; extra == "llm" or extra == "all"
 Requires-Dist: matplotlib ; extra == "analytics"
 Requires-Dist: multipledispatch ; extra == "ibis"
-Requires-Dist: neo4j ; extra == "neo4j"
+Requires-Dist: neo4j ; extra == "neo4j" or extra == "all"
 Requires-Dist: networkx ; extra == "neo4j"
 Requires-Dist: pandas (>=2.2.1) ; extra == "analytics"
 Requires-Dist: plotly ; extra == "analytics"

@@ -62,8 +65,10 @@ Requires-Dist: scipy ; extra == "scipy"
 Requires-Dist: seaborn ; extra == "analytics"
 Requires-Dist: sqlalchemy
 Requires-Dist: streamlit (>=1.32.2,<2.0.0) ; extra == "app"
+Requires-Dist: tabulate
 Requires-Dist: tiktoken ; extra == "llm"
 Requires-Dist: uvicorn ; extra == "fastapi"
+Requires-Dist: xmltodict (>=0.13.0,<0.14.0)
 Description-Content-Type: text/markdown

 # linkml-store
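In summary, the metadata adds Python 3.13 support, a new bigquery extra, an umbrella "all" extra covering the optional llm and neo4j dependencies, and new core dependencies (tabulate, xmltodict, jsonpatch). Assuming standard pip extras syntax, the umbrella extra would be installed as:

pip install "linkml-store[all]"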
{linkml_store-0.2.1.dist-info → linkml_store-0.2.4.dist-info}/RECORD
CHANGED

@@ -1,8 +1,8 @@
 linkml_store/__init__.py,sha256=jlU6WOUAn8cKIhzbTULmBTWpW9gZdEt7q_RI6KZN1bY,118
 linkml_store/api/__init__.py,sha256=3CelcFEFz0y3MkQAzhQ9JxHIt1zFk6nYZxSmYTo8YZE,226
-linkml_store/api/client.py,sha256=
-linkml_store/api/collection.py,sha256=
-linkml_store/api/config.py,sha256=
+linkml_store/api/client.py,sha256=wFVgl1NUovaKLqNVUEt9dsnoIzjzqFvktJVncAupdE4,12362
+linkml_store/api/collection.py,sha256=CGvWxH7HRhjDt9Cp3SGdMqyhYx7Q4fRKUtAJV74_l0g,39711
+linkml_store/api/config.py,sha256=pOz210JIwkEEXtfjcsZBp1UEedkBu8RkH62Qa1b4exI,5777
 linkml_store/api/database.py,sha256=nvae8jnOZsQIFCsl_lRBnKcvrpJg4A10ujIKGeMyUS8,29350
 linkml_store/api/queries.py,sha256=tx9fgGY5fC_2ZbIvg4BqTK_MXJwA_DI4mxr8HdQ6Vos,2075
 linkml_store/api/stores/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

@@ -15,7 +15,7 @@ linkml_store/api/stores/duckdb/duckdb_database.py,sha256=GH9bcOfHpNp6r-Eu1C3W0xu
 linkml_store/api/stores/duckdb/mappings.py,sha256=tDce3W1Apwammhf4LS6cRJ0m4NiJ0eB7vOI_4U5ETY8,148
 linkml_store/api/stores/filesystem/__init__.py,sha256=KjvCjdttwqMHNeGyL-gr59zRz0--HFEWWUNNCJ5hITs,347
 linkml_store/api/stores/filesystem/filesystem_collection.py,sha256=9gqY2KRZsn_RWk4eKkxFd3_wcxs5YaXvcBI7GGJBMGE,6751
-linkml_store/api/stores/filesystem/filesystem_database.py,sha256=
+linkml_store/api/stores/filesystem/filesystem_database.py,sha256=e9hSGoaOxr_sG_RhjgzV_yvdQ_xbHHXHJDtufWzAX4E,2883
 linkml_store/api/stores/hdf5/__init__.py,sha256=l4cIh3v7P0nPbwGIsfuCMD_serQ8q8c7iuUA9W2Jb4o,97
 linkml_store/api/stores/hdf5/hdf5_collection.py,sha256=mnpLMYehn3PuaIjp2dXrIWu8jh-bdQ84X2Ku83jMdEY,3805
 linkml_store/api/stores/hdf5/hdf5_database.py,sha256=EZbjrpaqiNDEFvoD5dZNcGBXA8z6HRNL81emueTZWNw,2714

@@ -30,30 +30,30 @@ linkml_store/api/stores/solr/solr_collection.py,sha256=ZlxC3JbVaHfSA4HuTeJTsp6qe
 linkml_store/api/stores/solr/solr_database.py,sha256=TFjqbY7jAkdrhAchbNg0E-mChSP7ogNwFExslbvX7Yo,2877
 linkml_store/api/stores/solr/solr_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 linkml_store/api/types.py,sha256=3aIQtDFMvsSmjuN5qrR2vNK5sHa6yzD_rEOPA6tHwvg,176
-linkml_store/cli.py,sha256=
+linkml_store/cli.py,sha256=bWbWQita8KCBjzovBRzQqHtjbRrf7Ttxq0Fe8zrDuds,30235
 linkml_store/constants.py,sha256=x4ZmDsfE9rZcL5WpA93uTKrRWzCD6GodYXviVzIvR38,112
 linkml_store/graphs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 linkml_store/graphs/graph_map.py,sha256=bYRxv8n1YPnFqE9d6JKNmRawb8EAhsPlHhBue0gvtZE,712
 linkml_store/index/__init__.py,sha256=6SQzDe-WZSSqbGNsbCDfyPTyz0s9ISDKw1dm9xgQuT4,1396
 linkml_store/index/implementations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-linkml_store/index/implementations/llm_indexer.py,sha256=
+linkml_store/index/implementations/llm_indexer.py,sha256=ja7UXhQj7F0g6HiRIJ8EBPuM86nOgr49jkh7eh_nCHs,5644
 linkml_store/index/implementations/simple_indexer.py,sha256=KnkFJtXTHnwjhD_D6ZK2rFhBID1dgCedcOVPEWAY2NU,1282
-linkml_store/index/indexer.py,sha256=
+linkml_store/index/indexer.py,sha256=e5dsjh2wjOTDRsfClKJAFTbcK1UC7BOGkUCOfDg9omI,7635
 linkml_store/inference/__init__.py,sha256=b8NAFNZjOYU_8gOvxdyCyoiHOOl5Ai2ckKs1tv7ZkkY,342
 linkml_store/inference/evaluation.py,sha256=YDFYaEu2QLSfFq4oyARrnKfTiPLtNF8irhhspgVDfdY,6013
 linkml_store/inference/implementations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-linkml_store/inference/implementations/rag_inference_engine.py,sha256=
+linkml_store/inference/implementations/rag_inference_engine.py,sha256=mN7YQI-BeZglsAnZnNIuAj-Nxg1su5efNaohooEmNmM,10622
 linkml_store/inference/implementations/rule_based_inference_engine.py,sha256=0IEY_fsHJPJy6QKbYQU_qE87RRnPOXQxPuJKXCQG8jU,6250
 linkml_store/inference/implementations/sklearn_inference_engine.py,sha256=Sdi7CoRK3qoLJu3prgLy1Ck_zQ1gHWRKFybHe7XQ4_g,13192
-linkml_store/inference/inference_config.py,sha256=
-linkml_store/inference/inference_engine.py,sha256=
+linkml_store/inference/inference_config.py,sha256=EFGdigxWsfTPREbgqyJVRShN0JktCEmFLLoECrLfXSg,2282
+linkml_store/inference/inference_engine.py,sha256=IxQIOgmXCDI8ilCGtoaVA_1wFROUg4uH1_yGbX78N2U,7139
 linkml_store/inference/inference_engine_registry.py,sha256=6o66gvBYBwdeAKm62zqqvfaBlcopVP_cla3L6uXGsHA,3015
 linkml_store/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 linkml_store/utils/change_utils.py,sha256=O2rvSvgTKB60reLLz9mX5OWykAA_m93bwnUh5ZWa0EY,471
 linkml_store/utils/file_utils.py,sha256=rQ7-XpmI6_Kx_dhEnI98muFRr0MmgI_kZ_9cgJBf_0I,1411
-linkml_store/utils/format_utils.py,sha256=
+linkml_store/utils/format_utils.py,sha256=sjpdJJ8Ww2ilm03mQt_v4QkZvQMymqUeTiPS3U1ViKM,11067
 linkml_store/utils/io.py,sha256=JHUrWDtlZC2jtN_PQZ4ypdGIyYlftZEN3JaCvEPs44w,884
-linkml_store/utils/llm_utils.py,sha256=
+linkml_store/utils/llm_utils.py,sha256=0lvR_lBSDSuP-0Eum16QBUsSv8sWfDjZPz_MnDSPvn0,3048
 linkml_store/utils/mongodb_utils.py,sha256=Rl1YmMKs1IXwSsJIViSDChbi0Oer5cBnMmjka2TeQS8,4665
 linkml_store/utils/neo4j_utils.py,sha256=y3KPmDZ8mQmePgg0lUeKkeKqzEr2rV226xxEtHc5pRg,1266
 linkml_store/utils/object_utils.py,sha256=Vib-5Ip2DlRVKLZpU-008ZZI813-vfKVSCY0TksRenM,6293

@@ -64,6 +64,7 @@ linkml_store/utils/schema_utils.py,sha256=iJiZxo5NGr7v87h4DV6V9DrDOZHSswMRuf0N4V
 linkml_store/utils/sklearn_utils.py,sha256=itPpcrsbbyOazdjmivaaZ1lyZeytm0a0hJ2AS8ziUgg,7590
 linkml_store/utils/sql_utils.py,sha256=T41w_vsc3SauTJQkDMwid_nOtKW1YOKyUuaxEf470hk,5938
 linkml_store/utils/stats_utils.py,sha256=4KqBb1bqDgAmq-1fJLLu5B2paPgoZZc3A-gnyVam4bI,1799
+linkml_store/utils/vector_utils.py,sha256=Q1RlpDzavJAM9-H2m2XNU5BNUcfZkpIWeEZii2hK0PQ,5449
 linkml_store/webapi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 linkml_store/webapi/html/__init__.py,sha256=hwp5eeBJKH65Bvv1x9Z4vsT1tLSYtb9Dq4I9r1kL1q0,69
 linkml_store/webapi/html/base.html.j2,sha256=hoiV2uaSxxrQp7VuAZBOHueH7czyJMYcPBRN6dZFYhk,693

@@ -72,8 +73,8 @@ linkml_store/webapi/html/database_details.html.j2,sha256=qtXdavbZb0mohiObI9dvJtk
 linkml_store/webapi/html/databases.html.j2,sha256=a9BCWQYfPeFhdUd31CWhB0yWhTIFXQayO08JgjyqKoc,294
 linkml_store/webapi/html/generic.html.j2,sha256=KtLaO2HUEF2Opq-OwHKgRKetNWe8IWc6JuIkxRPsywk,1018
 linkml_store/webapi/main.py,sha256=B0Da575kKR7X88N9ykm99Dem8FyBAW9f-w3A_JwUzfw,29165
-linkml_store-0.2.
-linkml_store-0.2.
-linkml_store-0.2.
-linkml_store-0.2.
-linkml_store-0.2.
+linkml_store-0.2.4.dist-info/LICENSE,sha256=77mDOslUnalYnuq9xQYZKtIoNEzcH9mIjvWHOKjamnE,1086
+linkml_store-0.2.4.dist-info/METADATA,sha256=PJX-_TSPk6WDXDCmvuFDUb5649ECQc2N6zP4pWqhBvU,7204
+linkml_store-0.2.4.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+linkml_store-0.2.4.dist-info/entry_points.txt,sha256=gWxVsHqx-t-UKWFHFzawQTvs4is4vC1rCF5AeKyqWWk,101
+linkml_store-0.2.4.dist-info/RECORD,,

{linkml_store-0.2.1.dist-info → linkml_store-0.2.4.dist-info}/LICENSE: File without changes

{linkml_store-0.2.1.dist-info → linkml_store-0.2.4.dist-info}/entry_points.txt: File without changes