linkml-store 0.1.14__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of linkml-store might be problematic. Click here for more details.
- linkml_store/api/collection.py +48 -5
- linkml_store/api/database.py +7 -1
- linkml_store/api/queries.py +3 -1
- linkml_store/api/stores/duckdb/duckdb_collection.py +8 -2
- linkml_store/cli.py +44 -18
- linkml_store/index/implementations/llm_indexer.py +20 -2
- linkml_store/index/indexer.py +51 -1
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/rag_inference_engine.py +120 -33
- linkml_store/inference/implementations/rule_based_inference_engine.py +15 -4
- linkml_store/inference/implementations/sklearn_inference_engine.py +20 -2
- linkml_store/inference/inference_config.py +1 -0
- linkml_store/inference/inference_engine.py +53 -19
- linkml_store/utils/format_utils.py +6 -0
- linkml_store/utils/llm_utils.py +2 -0
- linkml_store/utils/object_utils.py +100 -1
- {linkml_store-0.1.14.dist-info → linkml_store-0.2.1.dist-info}/METADATA +9 -1
- {linkml_store-0.1.14.dist-info → linkml_store-0.2.1.dist-info}/RECORD +21 -20
- {linkml_store-0.1.14.dist-info → linkml_store-0.2.1.dist-info}/LICENSE +0 -0
- {linkml_store-0.1.14.dist-info → linkml_store-0.2.1.dist-info}/WHEEL +0 -0
- {linkml_store-0.1.14.dist-info → linkml_store-0.2.1.dist-info}/entry_points.txt +0 -0
linkml_store/api/collection.py
CHANGED
|
@@ -226,6 +226,18 @@ class Collection(Generic[DatabaseType]):
|
|
|
226
226
|
self._initialized = True
|
|
227
227
|
patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
|
|
228
228
|
self._broadcast(patches, **kwargs)
|
|
229
|
+
self._post_modification_hook(**kwargs)
|
|
230
|
+
|
|
231
|
+
def _post_delete_hook(self, **kwargs):
|
|
232
|
+
self._post_modification_hook(**kwargs)
|
|
233
|
+
|
|
234
|
+
def _post_modification_hook(self, **kwargs):
|
|
235
|
+
for indexer in self.indexers.values():
|
|
236
|
+
ix_collection_name = self.get_index_collection_name(indexer)
|
|
237
|
+
ix_collection = self.parent.get_collection(ix_collection_name)
|
|
238
|
+
# Currently updating the source triggers complete reindexing
|
|
239
|
+
# TODO: make this more efficient by only deleting modified
|
|
240
|
+
ix_collection.delete_where({})
|
|
229
241
|
|
|
230
242
|
def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
|
|
231
243
|
"""
|
|
@@ -476,7 +488,7 @@ class Collection(Generic[DatabaseType]):
|
|
|
476
488
|
Now let's index, using the simple trigram-based index
|
|
477
489
|
|
|
478
490
|
>>> index = get_indexer("simple")
|
|
479
|
-
>>> collection.attach_indexer(index)
|
|
491
|
+
>>> _ = collection.attach_indexer(index)
|
|
480
492
|
|
|
481
493
|
Now let's find all objects:
|
|
482
494
|
|
|
@@ -514,7 +526,10 @@ class Collection(Generic[DatabaseType]):
|
|
|
514
526
|
if ix_coll.size() == 0:
|
|
515
527
|
logger.info(f"Index {index_name} is empty; indexing all objects")
|
|
516
528
|
all_objs = self.find(limit=-1).rows
|
|
517
|
-
|
|
529
|
+
if all_objs:
|
|
530
|
+
# print(f"Index {index_name} is empty; indexing all objects {len(all_objs)}")
|
|
531
|
+
self.index_objects(all_objs, index_name, replace=True, **kwargs)
|
|
532
|
+
assert ix_coll.size() > 0
|
|
518
533
|
qr = ix_coll.find(where=where, limit=-1, **kwargs)
|
|
519
534
|
index_col = ix.index_field
|
|
520
535
|
# TODO: optimize this for large indexes
|
|
@@ -648,7 +663,31 @@ class Collection(Generic[DatabaseType]):
|
|
|
648
663
|
"""
|
|
649
664
|
return self.find({}, limit=1).num_rows
|
|
650
665
|
|
|
651
|
-
def
|
|
666
|
+
def rows_iter(self) -> Iterable[OBJECT]:
|
|
667
|
+
"""
|
|
668
|
+
Return an iterator over the objects in the collection.
|
|
669
|
+
|
|
670
|
+
:return:
|
|
671
|
+
"""
|
|
672
|
+
yield from self.find({}, limit=-1).rows
|
|
673
|
+
|
|
674
|
+
def rows(self) -> List[OBJECT]:
|
|
675
|
+
"""
|
|
676
|
+
Return a list of objects in the collection.
|
|
677
|
+
|
|
678
|
+
:return:
|
|
679
|
+
"""
|
|
680
|
+
return list(self.rows_iter())
|
|
681
|
+
|
|
682
|
+
def ranked_rows(self) -> List[Tuple[float, OBJECT]]:
|
|
683
|
+
"""
|
|
684
|
+
Return a list of objects in the collection, with scores.
|
|
685
|
+
"""
|
|
686
|
+
return [(n, obj) for n, obj in enumerate(self.rows_iter())]
|
|
687
|
+
|
|
688
|
+
def attach_indexer(
|
|
689
|
+
self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs
|
|
690
|
+
) -> Indexer:
|
|
652
691
|
"""
|
|
653
692
|
Attach an index to the collection.
|
|
654
693
|
|
|
@@ -669,8 +708,8 @@ class Collection(Generic[DatabaseType]):
|
|
|
669
708
|
>>> full_index.name = "full"
|
|
670
709
|
>>> name_index = get_indexer("simple", text_template="{name}")
|
|
671
710
|
>>> name_index.name = "name"
|
|
672
|
-
>>> collection.attach_indexer(full_index)
|
|
673
|
-
>>> collection.attach_indexer(name_index)
|
|
711
|
+
>>> _ = collection.attach_indexer(full_index)
|
|
712
|
+
>>> _ = collection.attach_indexer(name_index)
|
|
674
713
|
|
|
675
714
|
Now let's find objects using the full index, using the string "France".
|
|
676
715
|
We expect the country France to be the top hit, but the score will
|
|
@@ -713,6 +752,10 @@ class Collection(Generic[DatabaseType]):
|
|
|
713
752
|
all_objs = self.find(limit=-1).rows
|
|
714
753
|
logger.info(f"Auto-indexing {len(all_objs)} objects")
|
|
715
754
|
self.index_objects(all_objs, index_name, replace=True, **kwargs)
|
|
755
|
+
return index
|
|
756
|
+
|
|
757
|
+
def get_index_collection_name(self, indexer: Indexer) -> str:
|
|
758
|
+
return self._index_collection_name(indexer.name)
|
|
716
759
|
|
|
717
760
|
def _index_collection_name(self, index_name: str) -> str:
|
|
718
761
|
"""
|
linkml_store/api/database.py
CHANGED
|
@@ -268,7 +268,7 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
268
268
|
metadata: Optional[CollectionConfig] = None,
|
|
269
269
|
recreate_if_exists=False,
|
|
270
270
|
**kwargs,
|
|
271
|
-
) ->
|
|
271
|
+
) -> Collection:
|
|
272
272
|
"""
|
|
273
273
|
Create a new collection in the current database.
|
|
274
274
|
|
|
@@ -760,6 +760,12 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
760
760
|
"""
|
|
761
761
|
Export a database to a file or location.
|
|
762
762
|
|
|
763
|
+
>>> from linkml_store.api.client import Client
|
|
764
|
+
>>> client = Client()
|
|
765
|
+
>>> db = client.attach_database("duckdb", alias="test")
|
|
766
|
+
>>> db.import_database("tests/input/iris.csv", Format.CSV, collection_name="iris")
|
|
767
|
+
>>> db.export_database("/tmp/iris.yaml", Format.YAML)
|
|
768
|
+
|
|
763
769
|
:param location: location of the file
|
|
764
770
|
:param target_format: target format
|
|
765
771
|
:param kwargs: additional arguments
|
linkml_store/api/queries.py
CHANGED
|
@@ -36,6 +36,9 @@ class DuckDBCollection(Collection):
|
|
|
36
36
|
logger.info(f"Inserting into: {self.alias} // T={table.name}")
|
|
37
37
|
engine = self.parent.engine
|
|
38
38
|
col_names = [c.name for c in table.columns]
|
|
39
|
+
bad_objs = [obj for obj in objs if not isinstance(obj, dict)]
|
|
40
|
+
if bad_objs:
|
|
41
|
+
logger.error(f"Bad objects: {bad_objs}")
|
|
39
42
|
objs = [{k: obj.get(k, None) for k in col_names} for obj in objs]
|
|
40
43
|
with engine.connect() as conn:
|
|
41
44
|
with conn.begin():
|
|
@@ -47,8 +50,9 @@ class DuckDBCollection(Collection):
|
|
|
47
50
|
if not isinstance(objs, list):
|
|
48
51
|
objs = [objs]
|
|
49
52
|
cd = self.class_definition()
|
|
50
|
-
if not cd:
|
|
53
|
+
if not cd or not cd.attributes:
|
|
51
54
|
cd = self.induce_class_definition_from_objects(objs)
|
|
55
|
+
assert cd.attributes
|
|
52
56
|
table = self._sqla_table(cd)
|
|
53
57
|
engine = self.parent.engine
|
|
54
58
|
with engine.connect() as conn:
|
|
@@ -58,7 +62,8 @@ class DuckDBCollection(Collection):
|
|
|
58
62
|
stmt = stmt.compile(engine)
|
|
59
63
|
conn.execute(stmt)
|
|
60
64
|
conn.commit()
|
|
61
|
-
|
|
65
|
+
self._post_delete_hook()
|
|
66
|
+
return None
|
|
62
67
|
|
|
63
68
|
def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
|
|
64
69
|
logger.info(f"Deleting from {self.target_class_name} where: {where}")
|
|
@@ -84,6 +89,7 @@ class DuckDBCollection(Collection):
|
|
|
84
89
|
if deleted_rows_count == 0 and not missing_ok:
|
|
85
90
|
raise ValueError(f"No rows found for {where}")
|
|
86
91
|
conn.commit()
|
|
92
|
+
self._post_delete_hook()
|
|
87
93
|
return deleted_rows_count if deleted_rows_count > -1 else None
|
|
88
94
|
|
|
89
95
|
def query_facets(
|
linkml_store/cli.py
CHANGED
|
@@ -7,6 +7,7 @@ from typing import Optional
|
|
|
7
7
|
import click
|
|
8
8
|
import yaml
|
|
9
9
|
from linkml_runtime.dumpers import json_dumper
|
|
10
|
+
from linkml_runtime.utils.formatutils import underscore
|
|
10
11
|
from pydantic import BaseModel
|
|
11
12
|
|
|
12
13
|
from linkml_store import Client
|
|
@@ -17,6 +18,7 @@ from linkml_store.index import get_indexer
|
|
|
17
18
|
from linkml_store.index.implementations.simple_indexer import SimpleIndexer
|
|
18
19
|
from linkml_store.index.indexer import Indexer
|
|
19
20
|
from linkml_store.inference import get_inference_engine
|
|
21
|
+
from linkml_store.inference.evaluation import evaluate_predictor, score_text_overlap
|
|
20
22
|
from linkml_store.inference.inference_config import InferenceConfig
|
|
21
23
|
from linkml_store.inference.inference_engine import ModelSerialization
|
|
22
24
|
from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
|
|
@@ -74,6 +76,8 @@ class ContextSettings(BaseModel):
|
|
|
74
76
|
if name is None:
|
|
75
77
|
# if len(self.database.list_collections()) > 1:
|
|
76
78
|
# raise ValueError("Collection must be specified if there are multiple collections.")
|
|
79
|
+
if not self.database:
|
|
80
|
+
return None
|
|
77
81
|
if not self.database.list_collections():
|
|
78
82
|
return None
|
|
79
83
|
name = list(self.database.list_collections())[0]
|
|
@@ -130,7 +134,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
|
|
|
130
134
|
logger.setLevel(logging.ERROR)
|
|
131
135
|
ctx.ensure_object(dict)
|
|
132
136
|
if input:
|
|
133
|
-
stem = Path(input).stem
|
|
137
|
+
stem = underscore(Path(input).stem)
|
|
134
138
|
database = "duckdb"
|
|
135
139
|
collection = stem
|
|
136
140
|
config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}})
|
|
@@ -216,7 +220,10 @@ def insert(ctx, files, object, format):
|
|
|
216
220
|
@click.option("--object", "-i", multiple=True, help="Input object as YAML")
|
|
217
221
|
@click.pass_context
|
|
218
222
|
def store(ctx, files, object, format):
|
|
219
|
-
"""Store objects from files (JSON, YAML, TSV) into the
|
|
223
|
+
"""Store objects from files (JSON, YAML, TSV) into the database.
|
|
224
|
+
|
|
225
|
+
Note: this is similar to insert, but a collection does not need to be specified
|
|
226
|
+
"""
|
|
220
227
|
settings = ctx.obj["settings"]
|
|
221
228
|
db = settings.database
|
|
222
229
|
if not files and not object:
|
|
@@ -496,12 +503,16 @@ def describe(ctx, where, output_type, output, limit):
|
|
|
496
503
|
@click.option(
|
|
497
504
|
"--predictor-type", "-t", default="sklearn", show_default=True, type=click.STRING, help="Type of predictor"
|
|
498
505
|
)
|
|
506
|
+
@click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
|
|
507
|
+
@click.option("--evaluation-match-function", help="Name of function to use for matching objects in eval")
|
|
499
508
|
@click.option("--query", "-q", type=click.STRING, help="query term")
|
|
500
509
|
@click.pass_context
|
|
501
510
|
def infer(
|
|
502
511
|
ctx,
|
|
503
512
|
inference_config_file,
|
|
504
513
|
query,
|
|
514
|
+
evaluation_count,
|
|
515
|
+
evaluation_match_function,
|
|
505
516
|
training_test_data_split,
|
|
506
517
|
predictor_type,
|
|
507
518
|
target_attribute,
|
|
@@ -545,25 +556,28 @@ def infer(
|
|
|
545
556
|
else:
|
|
546
557
|
query_obj = None
|
|
547
558
|
collection = ctx.obj["settings"].collection
|
|
548
|
-
|
|
559
|
+
if collection:
|
|
560
|
+
atts = collection.class_definition().attributes.keys()
|
|
561
|
+
else:
|
|
562
|
+
atts = []
|
|
563
|
+
if feature_attributes:
|
|
564
|
+
features = feature_attributes.split(",")
|
|
565
|
+
features = [f.strip() for f in features]
|
|
566
|
+
else:
|
|
567
|
+
if query_obj:
|
|
568
|
+
features = query_obj.keys()
|
|
569
|
+
else:
|
|
570
|
+
features = None
|
|
571
|
+
if target_attribute:
|
|
572
|
+
target_attributes = list(target_attribute)
|
|
573
|
+
else:
|
|
574
|
+
target_attributes = [att for att in atts if att not in features]
|
|
549
575
|
if model_format:
|
|
550
576
|
model_format = ModelSerialization(model_format)
|
|
551
577
|
if load_model:
|
|
552
578
|
predictor = get_inference_engine(predictor_type)
|
|
553
579
|
predictor = type(predictor).load_model(load_model)
|
|
554
580
|
else:
|
|
555
|
-
if feature_attributes:
|
|
556
|
-
features = feature_attributes.split(",")
|
|
557
|
-
features = [f.strip() for f in features]
|
|
558
|
-
else:
|
|
559
|
-
if query_obj:
|
|
560
|
-
features = query_obj.keys()
|
|
561
|
-
else:
|
|
562
|
-
features = None
|
|
563
|
-
if target_attribute:
|
|
564
|
-
target_attributes = list(target_attribute)
|
|
565
|
-
else:
|
|
566
|
-
target_attributes = [att for att in atts if att not in features]
|
|
567
581
|
if inference_config_file:
|
|
568
582
|
config = InferenceConfig.from_file(inference_config_file)
|
|
569
583
|
else:
|
|
@@ -571,14 +585,26 @@ def infer(
|
|
|
571
585
|
if training_test_data_split:
|
|
572
586
|
config.train_test_split = training_test_data_split
|
|
573
587
|
predictor = get_inference_engine(predictor_type, config=config)
|
|
574
|
-
|
|
588
|
+
if collection:
|
|
589
|
+
predictor.load_and_split_data(collection)
|
|
575
590
|
predictor.initialize_model()
|
|
576
591
|
if export_model:
|
|
577
592
|
logger.info(f"Exporting model to {export_model} in {model_format}")
|
|
578
593
|
predictor.export_model(export_model, model_format)
|
|
579
594
|
if not query_obj:
|
|
580
|
-
if not export_model:
|
|
581
|
-
raise ValueError("Query must be specified if not exporting model")
|
|
595
|
+
if not export_model and not evaluation_count:
|
|
596
|
+
raise ValueError("Query or evaluate must be specified if not exporting model")
|
|
597
|
+
if evaluation_count:
|
|
598
|
+
if evaluation_match_function == "score_text_overlap":
|
|
599
|
+
match_function_fn = score_text_overlap
|
|
600
|
+
elif evaluation_match_function is not None:
|
|
601
|
+
raise ValueError(f"Unknown match function: {evaluation_match_function}")
|
|
602
|
+
else:
|
|
603
|
+
match_function_fn = None
|
|
604
|
+
outcome = evaluate_predictor(
|
|
605
|
+
predictor, target_attributes, evaluation_count=evaluation_count, match_function=match_function_fn
|
|
606
|
+
)
|
|
607
|
+
print(f"Outcome: {outcome} // accuracy: {outcome.accuracy}")
|
|
582
608
|
if query_obj:
|
|
583
609
|
result = predictor.derive(query_obj)
|
|
584
610
|
dumped_obj = result.model_dump(exclude_none=True)
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import TYPE_CHECKING, List
|
|
3
|
+
from typing import TYPE_CHECKING, List, Optional
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
|
+
from tiktoken import encoding_for_model
|
|
6
7
|
|
|
7
8
|
from linkml_store.api.config import CollectionConfig
|
|
8
9
|
from linkml_store.index.indexer import INDEX_ITEM, Indexer
|
|
10
|
+
from linkml_store.utils.llm_utils import get_token_limit, render_formatted_text
|
|
9
11
|
|
|
10
12
|
if TYPE_CHECKING:
|
|
11
13
|
import llm
|
|
@@ -29,6 +31,7 @@ class LLMIndexer(Indexer):
|
|
|
29
31
|
cached_embeddings_database: str = None
|
|
30
32
|
cached_embeddings_collection: str = None
|
|
31
33
|
cache_queries: bool = False
|
|
34
|
+
truncation_method: Optional[str] = None
|
|
32
35
|
|
|
33
36
|
@property
|
|
34
37
|
def embedding_model(self):
|
|
@@ -62,6 +65,21 @@ class LLMIndexer(Indexer):
|
|
|
62
65
|
"""
|
|
63
66
|
logging.info(f"Converting {len(texts)} texts to vectors")
|
|
64
67
|
model = self.embedding_model
|
|
68
|
+
token_limit = get_token_limit(model.model_id)
|
|
69
|
+
encoding = encoding_for_model("gpt-4o")
|
|
70
|
+
|
|
71
|
+
def truncate_text(text: str) -> str:
|
|
72
|
+
# split into chunks of 1000 characters:
|
|
73
|
+
parts = [text[i : i + 1000] for i in range(0, len(text), 1000)]
|
|
74
|
+
return render_formatted_text(
|
|
75
|
+
lambda x: "".join(x),
|
|
76
|
+
parts,
|
|
77
|
+
encoding,
|
|
78
|
+
token_limit,
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
texts = [truncate_text(text) for text in texts]
|
|
82
|
+
|
|
65
83
|
if self.cached_embeddings_database and (cache is None or cache or self.cache_queries):
|
|
66
84
|
model_id = model.model_id
|
|
67
85
|
if not model_id:
|
|
@@ -88,7 +106,7 @@ class LLMIndexer(Indexer):
|
|
|
88
106
|
embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
|
|
89
107
|
else:
|
|
90
108
|
embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
|
|
91
|
-
|
|
109
|
+
|
|
92
110
|
embeddings = list([None] * len(texts))
|
|
93
111
|
uncached_texts = []
|
|
94
112
|
n = 0
|
linkml_store/index/indexer.py
CHANGED
|
@@ -36,6 +36,54 @@ def cosine_similarity(vector1, vector2) -> float:
|
|
|
36
36
|
class Indexer(BaseModel):
|
|
37
37
|
"""
|
|
38
38
|
An indexer operates on a collection in order to search for objects.
|
|
39
|
+
|
|
40
|
+
You should use a subclass of this; this can be looked up dynamically:
|
|
41
|
+
|
|
42
|
+
>>> from linkml_store.index import get_indexer
|
|
43
|
+
>>> indexer = get_indexer("simple")
|
|
44
|
+
|
|
45
|
+
You can customize how objects are indexed by passing in a text template.
|
|
46
|
+
For example, if your collection has objects with "name" and "profession" attributes,
|
|
47
|
+
you can index them as "{name} {profession}".
|
|
48
|
+
|
|
49
|
+
>>> indexer = get_indexer("simple", text_template="{name} :: {profession}")
|
|
50
|
+
|
|
51
|
+
By default, Python f-string syntax is assumed.
|
|
52
|
+
|
|
53
|
+
We can test this works using the :ref:`object_to_text` method (normally
|
|
54
|
+
you would never need to call this directly, but it's useful for testing):
|
|
55
|
+
|
|
56
|
+
>>> obj = {"name": "John", "profession": "doctor"}
|
|
57
|
+
>>> indexer.object_to_text(obj)
|
|
58
|
+
'John :: doctor'
|
|
59
|
+
|
|
60
|
+
You can also use Jinja2 templates; this gives more flexibility and logic,
|
|
61
|
+
e.g. conditional formatting:
|
|
62
|
+
|
|
63
|
+
>>> tmpl = "{{name}}{% if profession %} :: {{profession}}{% endif %}"
|
|
64
|
+
>>> indexer = get_indexer("simple", text_template=tmpl, text_template_syntax=TemplateSyntaxEnum.jinja2)
|
|
65
|
+
>>> indexer.object_to_text(obj)
|
|
66
|
+
'John :: doctor'
|
|
67
|
+
>>> indexer.object_to_text({"name": "John"})
|
|
68
|
+
'John'
|
|
69
|
+
|
|
70
|
+
You can also specify which attributes to index:
|
|
71
|
+
|
|
72
|
+
>>> indexer = get_indexer("simple", index_attributes=["name"])
|
|
73
|
+
>>> indexer.object_to_text(obj)
|
|
74
|
+
'John'
|
|
75
|
+
|
|
76
|
+
The purpose of an indexer is to translate a collection of objects into a collection of objects
|
|
77
|
+
such as vectors for purposes such as search. Unless you are implementing your own indexer, you
|
|
78
|
+
generally don't need to use the methods that return vectors, but we can examine their behavior
|
|
79
|
+
to get a sense of how they work.
|
|
80
|
+
|
|
81
|
+
>>> vectors = indexer.objects_to_vectors([{"name": "Aardvark"}, {"name": "Aardwolf"}, {"name": "Zesty"}])
|
|
82
|
+
>>> assert cosine_similarity(vectors[0], vectors[1]) > cosine_similarity(vectors[0], vectors[2])
|
|
83
|
+
|
|
84
|
+
Note you should consult the documentation for the specific indexer you are using for more details on
|
|
85
|
+
how text is converted to vectors.
|
|
86
|
+
|
|
39
87
|
"""
|
|
40
88
|
|
|
41
89
|
name: Optional[str] = None
|
|
@@ -122,7 +170,9 @@ class Indexer(BaseModel):
|
|
|
122
170
|
self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None
|
|
123
171
|
) -> List[Tuple[float, Any]]:
|
|
124
172
|
"""
|
|
125
|
-
|
|
173
|
+
Use the indexer to search against a database of vectors.
|
|
174
|
+
|
|
175
|
+
Note: this is a low-level method; typically you would use the :ref:`search` method on a :ref:`Collection`.
|
|
126
176
|
|
|
127
177
|
:param query: The query string to search for
|
|
128
178
|
:param vectors: A list of indexed items, where each item is a tuple of (id, vector)
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from collections.abc import Callable
|
|
3
|
+
from typing import Any, List, Optional
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
from linkml_store.inference import InferenceEngine
|
|
10
|
+
from linkml_store.utils.object_utils import select_nested
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def score_match(target: Optional[Any], candidate: Optional[Any], match_function: Optional[Callable] = None) -> float:
|
|
16
|
+
"""
|
|
17
|
+
Compute a score for a match between two objects
|
|
18
|
+
|
|
19
|
+
>>> score_match("a", "a")
|
|
20
|
+
1.0
|
|
21
|
+
>>> score_match("a", "b")
|
|
22
|
+
0.0
|
|
23
|
+
>>> score_match("abcd", "abcde")
|
|
24
|
+
0.0
|
|
25
|
+
>>> score_match("a", None)
|
|
26
|
+
0.0
|
|
27
|
+
>>> score_match(None, "a")
|
|
28
|
+
0.0
|
|
29
|
+
>>> score_match(None, None)
|
|
30
|
+
1.0
|
|
31
|
+
>>> score_match(["a", "b"], ["a", "b"])
|
|
32
|
+
1.0
|
|
33
|
+
>>> score_match(["a", "b"], ["b", "a"])
|
|
34
|
+
1.0
|
|
35
|
+
>>> round(score_match(["a"], ["b", "a"]), 2)
|
|
36
|
+
0.67
|
|
37
|
+
>>> score_match({"a": 1}, {"a": 1})
|
|
38
|
+
1.0
|
|
39
|
+
>>> score_match({"a": 1}, {"a": 2})
|
|
40
|
+
0.0
|
|
41
|
+
>>> score_match({"a": 1, "b": None}, {"a": 1})
|
|
42
|
+
1.0
|
|
43
|
+
>>> score_match([{"a": 1, "b": 2}, {"a": 3, "b": 4}], [{"a": 1, "b": 2}, {"a": 3, "b": 4}])
|
|
44
|
+
1.0
|
|
45
|
+
>>> score_match([{"a": 1, "b": 4}, {"a": 3, "b": 2}], [{"a": 1, "b": 2}, {"a": 3, "b": 4}])
|
|
46
|
+
0.5
|
|
47
|
+
>>> def char_match(x, y):
|
|
48
|
+
... return len(set(x).intersection(set(y))) / len(set(x).union(set(y)))
|
|
49
|
+
>>> score_match("abcd", "abc", char_match)
|
|
50
|
+
0.75
|
|
51
|
+
>>> score_match(["abcd", "efgh"], ["ac", "gh"], char_match)
|
|
52
|
+
0.5
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
:param target:
|
|
56
|
+
:param candidate:
|
|
57
|
+
:param match_function: optional scorer applied to non-equal leaf values; defaults to strict equality (unequal values score 0.0)
|
|
58
|
+
:return:
|
|
59
|
+
"""
|
|
60
|
+
if target == candidate:
|
|
61
|
+
return 1.0
|
|
62
|
+
if target is None or candidate is None:
|
|
63
|
+
return 0.0
|
|
64
|
+
if isinstance(target, (set, list)) and isinstance(candidate, (set, list)):
|
|
65
|
+
# create an all by all matrix using numpy
|
|
66
|
+
# for each pair of elements, compute the score
|
|
67
|
+
# return the average score
|
|
68
|
+
score_matrix = np.array([[score_match(t, c, match_function) for c in candidate] for t in target])
|
|
69
|
+
best_matches0 = np.max(score_matrix, axis=0)
|
|
70
|
+
best_matches1 = np.max(score_matrix, axis=1)
|
|
71
|
+
return (np.sum(best_matches0) + np.sum(best_matches1)) / (len(target) + len(candidate))
|
|
72
|
+
if isinstance(target, dict) and isinstance(candidate, dict):
|
|
73
|
+
keys = set(target.keys()).union(candidate.keys())
|
|
74
|
+
scores = [score_match(target.get(k), candidate.get(k), match_function) for k in keys]
|
|
75
|
+
return np.mean(scores)
|
|
76
|
+
if match_function:
|
|
77
|
+
return match_function(target, candidate)
|
|
78
|
+
return 0.0
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class Outcome(BaseModel):
|
|
82
|
+
true_positive_count: float
|
|
83
|
+
total_count: int
|
|
84
|
+
|
|
85
|
+
@property
|
|
86
|
+
def accuracy(self) -> float:
|
|
87
|
+
return self.true_positive_count / self.total_count
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def evaluate_predictor(
|
|
91
|
+
predictor: InferenceEngine,
|
|
92
|
+
target_attributes: List[str],
|
|
93
|
+
feature_attributes: Optional[List[str]] = None,
|
|
94
|
+
test_data: pd.DataFrame = None,
|
|
95
|
+
evaluation_count: Optional[int] = 10,
|
|
96
|
+
match_function: Optional[Callable] = None,
|
|
97
|
+
) -> Outcome:
|
|
98
|
+
"""
|
|
99
|
+
Evaluate a predictor by comparing its predictions to the expected values in the testing data.
|
|
100
|
+
|
|
101
|
+
:param predictor:
|
|
102
|
+
:param target_attributes:
|
|
103
|
+
:param feature_attributes:
|
|
104
|
+
:param evaluation_count: max iterations
|
|
105
|
+
:param match_function: function to use for matching
|
|
106
|
+
:return:
|
|
107
|
+
"""
|
|
108
|
+
n = 0
|
|
109
|
+
tp = 0
|
|
110
|
+
if test_data is None:
|
|
111
|
+
test_data = predictor.testing_data.as_dataframe()
|
|
112
|
+
for row in test_data.to_dict(orient="records"):
|
|
113
|
+
expected_obj = select_nested(row, target_attributes)
|
|
114
|
+
if feature_attributes:
|
|
115
|
+
test_obj = {k: v for k, v in row.items() if k not in target_attributes}
|
|
116
|
+
else:
|
|
117
|
+
test_obj = row
|
|
118
|
+
result = predictor.derive(test_obj)
|
|
119
|
+
tp += score_match(result.predicted_object, expected_obj, match_function)
|
|
120
|
+
logger.info(f"TP={tp} MF={match_function} Predicted: {result.predicted_object} Expected: {expected_obj}")
|
|
121
|
+
n += 1
|
|
122
|
+
if evaluation_count is not None and n >= evaluation_count:
|
|
123
|
+
break
|
|
124
|
+
return Outcome(true_positive_count=tp, total_count=n)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def score_text_overlap(str1: Any, str2: Any) -> float:
|
|
128
|
+
"""
|
|
129
|
+
Compute the overlap score between two strings.
|
|
130
|
+
|
|
131
|
+
>>> score_text_overlap("abc", "bcde")
|
|
132
|
+
0.5
|
|
133
|
+
|
|
134
|
+
:param str1:
|
|
135
|
+
:param str2:
|
|
136
|
+
:return:
|
|
137
|
+
"""
|
|
138
|
+
if str1 == str2:
|
|
139
|
+
return 1.0
|
|
140
|
+
if not str1 or not str2:
|
|
141
|
+
return 0.0
|
|
142
|
+
overlap, length = find_longest_overlap(str1, str2)
|
|
143
|
+
return len(overlap) / max(len(str1), len(str2))
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def find_longest_overlap(str1: str, str2: str):
|
|
147
|
+
"""
|
|
148
|
+
Find the longest overlapping substring between two strings.
|
|
149
|
+
|
|
150
|
+
Args:
|
|
151
|
+
str1 (str): The first string
|
|
152
|
+
str2 (str): The second string
|
|
153
|
+
|
|
154
|
+
Returns:
|
|
155
|
+
tuple: A tuple containing the longest overlapping substring and its length
|
|
156
|
+
|
|
157
|
+
Examples:
|
|
158
|
+
>>> find_longest_overlap("hello world", "world of programming")
|
|
159
|
+
('world', 5)
|
|
160
|
+
>>> find_longest_overlap("abcdefg", "defghi")
|
|
161
|
+
('defg', 4)
|
|
162
|
+
>>> find_longest_overlap("python", "java")
|
|
163
|
+
('', 0)
|
|
164
|
+
>>> find_longest_overlap("", "test")
|
|
165
|
+
('', 0)
|
|
166
|
+
>>> find_longest_overlap("aabbcc", "ddeeff")
|
|
167
|
+
('', 0)
|
|
168
|
+
>>> find_longest_overlap("programming", "PROGRAMMING")
|
|
169
|
+
('', 0)
|
|
170
|
+
"""
|
|
171
|
+
if not str1 or not str2:
|
|
172
|
+
return "", 0
|
|
173
|
+
|
|
174
|
+
# Create a table to store lengths of matching substrings
|
|
175
|
+
m, n = len(str1), len(str2)
|
|
176
|
+
dp = [[0] * (n + 1) for _ in range(m + 1)]
|
|
177
|
+
|
|
178
|
+
# Variables to store the maximum length and ending position
|
|
179
|
+
max_length = 0
|
|
180
|
+
end_pos = 0
|
|
181
|
+
|
|
182
|
+
# Fill the dp table
|
|
183
|
+
for i in range(1, m + 1):
|
|
184
|
+
for j in range(1, n + 1):
|
|
185
|
+
if str1[i - 1] == str2[j - 1]:
|
|
186
|
+
dp[i][j] = dp[i - 1][j - 1] + 1
|
|
187
|
+
if dp[i][j] > max_length:
|
|
188
|
+
max_length = dp[i][j]
|
|
189
|
+
end_pos = i
|
|
190
|
+
|
|
191
|
+
# Extract the longest common substring
|
|
192
|
+
start_pos = end_pos - max_length
|
|
193
|
+
longest_substring = str1[start_pos:end_pos]
|
|
194
|
+
|
|
195
|
+
return longest_substring, max_length
|