linkml-store 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/graphs/__init__.py: File without changes
linkml_store/graphs/graph_map.py
@@ -0,0 +1,24 @@
+from abc import ABC
+from typing import Optional
+
+from pydantic import BaseModel
+
+DEFAULT_IDENTIFIER_ATTRIBUTE = "id"
+DEFAULT_CATEGORY_LABELS_ATTRIBUTE = "category"
+DEFAULT_SUBJECT_ATTRIBUTE = "subject"
+DEFAULT_PREDICATE_ATTRIBUTE = "predicate"
+DEFAULT_OBJECT_ATTRIBUTE = "object"
+
+
+class GraphProjection(BaseModel, ABC):
+    identifier_attribute: str = DEFAULT_IDENTIFIER_ATTRIBUTE
+
+
+class NodeProjection(GraphProjection):
+    category_labels_attribute: Optional[str] = DEFAULT_CATEGORY_LABELS_ATTRIBUTE
+
+
+class EdgeProjection(GraphProjection):
+    subject_attribute: str = DEFAULT_SUBJECT_ATTRIBUTE
+    predicate_attribute: str = DEFAULT_PREDICATE_ATTRIBUTE
+    object_attribute: str = DEFAULT_OBJECT_ATTRIBUTE
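The projections above are plain pydantic models: they declare which attribute of each stored object supplies the node identifier, node categories, or edge subject/predicate/object. A minimal sketch of instantiating them follows; the custom attribute names are illustrative, not taken from the package:

from linkml_store.graphs.graph_map import EdgeProjection, NodeProjection

# defaults: nodes keyed by "id", categories taken from "category"
node_proj = NodeProjection()

# hypothetical mapping for a collection whose edge columns use custom names
edge_proj = EdgeProjection(
    identifier_attribute="edge_id",
    subject_attribute="source",
    predicate_attribute="relation",
    object_attribute="target",
)
assert node_proj.category_labels_attribute == "category"
assert edge_proj.subject_attribute == "source"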
linkml_store/index/__init__.py
@@ -0,0 +1,53 @@
+"""
+Indexers package.
+
+Indexers allow indexes to be added to existing :class:`Collection` objects.
+
+Currently two are supported:
+
+* simple: :class:`SimpleIndexer`
+* llm: :class:`LLMIndexer`
+"""
+
+from typing import Type
+
+from linkml_store.index.implementations.llm_indexer import LLMIndexer
+from linkml_store.index.implementations.simple_indexer import SimpleIndexer
+from linkml_store.index.indexer import Indexer
+
+INDEXER_CLASSES = {
+    "simple": SimpleIndexer,
+    "llm": LLMIndexer,
+}
+
+
+def get_indexer_class(name: str) -> Type[Indexer]:
+    """
+    Get an indexer class by name.
+
+    :param name: the name of the indexer (simple, llm, ...)
+    :return: the indexer class
+    """
+    if name not in INDEXER_CLASSES:
+        raise ValueError(f"Unknown indexer class: {name}")
+    return INDEXER_CLASSES[name]
+
+
+def get_indexer(index_type: str, **kwargs) -> Indexer:
+    """
+    Get an indexer instance by name.
+
+    >>> simple_indexer = get_indexer("simple")
+    >>> llm_indexer = get_indexer("llm")
+
+    :param index_type: the name of the indexer (simple, llm, ...)
+    :param kwargs: additional arguments to pass to the indexer
+    :return: the indexer
+    """
+    kwargs = {k: v for k, v in kwargs.items() if v is not None}
+    cls = get_indexer_class(index_type)
+    kwargs["index_type"] = index_type
+    indexer = cls(**kwargs)
+    if not indexer.name:
+        indexer.name = index_type
+    return indexer
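A usage sketch for the lookup functions above: keyword arguments with None values are dropped, the remainder are forwarded to the indexer class constructor, and the indexer's name falls back to its index_type.

from linkml_store.index import get_indexer

indexer = get_indexer("simple", text_template="{name} :: {profession}")
assert indexer.name == "simple"

# unknown names raise ValueError via get_indexer_class
try:
    get_indexer("does-not-exist")
except ValueError as e:
    print(e)  # Unknown indexer class: does-not-exist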
linkml_store/index/implementations/__init__.py: File without changes
linkml_store/index/implementations/llm_indexer.py
@@ -0,0 +1,174 @@
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+
+from linkml_store.api.config import CollectionConfig
+from linkml_store.index.indexer import INDEX_ITEM, Indexer
+from linkml_store.utils.llm_utils import get_token_limit, render_formatted_text
+
+if TYPE_CHECKING:
+    import llm
+
+CHUNK_SIZE = 1000
+
+logger = logging.getLogger(__name__)
+
+
+class LLMIndexer(Indexer):
+    """
+    An indexer that wraps the llm library.
+
+    This indexer converts text to vectors using the llm library.
+
+    >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
+    >>> vector = indexer.text_to_vector("hello")
+
+    TODO: Implement true batching for embedding API calls
+    TODO: Add batch_size parameter to control batch processing
+    TODO: Support batch embedding APIs (e.g., OpenAI batch endpoint)
+    TODO: Add progress reporting for large batch operations
+    TODO: Implement smart batching with accumulation and flushing
+    """
+
+    embedding_model_name: str = "text-embedding-ada-002"
+    _embedding_model: "llm.EmbeddingModel" = None
+    cached_embeddings_database: Optional[str] = None
+    cached_embeddings_collection: Optional[str] = None
+    cache_queries: bool = False
+    truncation_method: Optional[str] = None
+    # TODO: Add batch_size: int = 100 parameter for batch processing
+    # TODO: Add supported_models class variable with model metadata (dims, costs, limits)
+    # TODO: Add model_validation to check if model exists before use
+
+    @property
+    def embedding_model(self):
+        import llm
+
+        if self._embedding_model is None:
+            self._embedding_model = llm.get_embedding_model(self.embedding_model_name)
+        return self._embedding_model
+
+    def text_to_vector(self, text: str, cache: bool = None, **kwargs) -> INDEX_ITEM:
+        """
+        Convert a single text to an embedding vector.
+
+        >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
+        >>> vector = indexer.text_to_vector("hello")
+
+        :param text: the text to embed
+        :param cache: whether to use the embeddings cache
+        :return: the embedding vector
+        """
+        return self.texts_to_vectors([text], cache=cache, **kwargs)[0]
+
+    def texts_to_vectors(
+        self, texts: List[str], cache: bool = None, token_limit_penalty=0, batch_size: Optional[int] = None, **kwargs
+    ) -> List[INDEX_ITEM]:
+        """
+        Use an LLM embedding model to embed a list of texts.
+
+        >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
+        >>> vectors = indexer.texts_to_vectors(["hello", "goodbye"])
+
+        :param texts: the texts to embed
+        :param cache: whether to use the embeddings cache
+        :param token_limit_penalty: subtracted from the model token limit when truncating
+        :return: one embedding vector per input text
+        """
+        from tiktoken import encoding_for_model
+
+        logger.info(f"Converting {len(texts)} texts to vectors")
+        model = self.embedding_model
+        # TODO: make this more accurate
+        token_limit = get_token_limit(model.model_id) - token_limit_penalty
+        logger.info(f"Token limit for {model.model_id}: {token_limit}")
+        encoding = encoding_for_model(self.embedding_model_name)
+
+        def truncate_text(text: str) -> str:
+            # split the text into chunks of CHUNK_SIZE characters:
+            parts = [text[i : i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
+            truncated = render_formatted_text(
+                lambda x: "".join(x),
+                parts,
+                encoding,
+                token_limit,
+            )
+            logger.debug(f"Truncated text from {len(text)} to {len(truncated)}")
+            return truncated
+
+        texts = [truncate_text(text) for text in texts]
+        # Calculate the average number of tokens per text for accurate batch sizing
+        text_token_counts = [len(encoding.encode(t)) for t in texts]
+        avg_text_tokens = sum(text_token_counts) / len(text_token_counts)
+        logger.info(f"Average text token count: {avg_text_tokens}")
+        if batch_size is None:
+            # TODO: empirically determine best batch size
+            batch_size = max(int(token_limit / avg_text_tokens), 5)
+            logger.info(f"Setting batch size to {batch_size}")
+
+        if self.cached_embeddings_database and (cache is None or cache or self.cache_queries):
+            model_id = model.model_id
+            if not model_id:
+                raise ValueError("Model ID is required to cache embeddings")
+            db_path = Path(self.cached_embeddings_database)
+            coll_name = self.cached_embeddings_collection
+            if not coll_name:
+                coll_name = "all_embeddings"
+            from linkml_store import Client
+
+            embeddings_client = Client()
+            config = CollectionConfig(
+                alias=coll_name,
+                type="Embeddings",
+                attributes={
+                    "text": {"range": "string"},
+                    "model_id": {"range": "string"},
+                    "embedding": {"range": "float", "array": {}},
+                },
+            )
+            embeddings_db = embeddings_client.get_database(f"duckdb:///{db_path}")
+            # create_collection loads the collection if it already exists,
+            # so new and existing caches are handled by the same call
+            embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
+
+            embeddings = [None] * len(texts)
+            uncached_texts = []
+            n = 0
+            # TODO: Implement batch lookup for cache checking (single query for all texts)
+            # TODO: Use IN clause or batch query to check multiple texts at once
+            logger.info(f"Checking cache for {len(texts)} texts")
+            for i in range(len(texts)):
+                # TODO: optimize this - currently makes N database queries for N texts
+                text = texts[i]
+                logger.debug(f"Looking for cached embedding for {text}")
+                r = embeddings_collection.find({"text": text, "model_id": model_id})
+                if r.num_rows:
+                    embeddings[i] = r.rows[0]["embedding"]
+                    n += 1
+                    logger.debug("Found")
+                else:
+                    uncached_texts.append((text, i))
+                    logger.debug("NOT found")
+            logger.info(f"Found {n} cached embeddings")
+            if uncached_texts:
+                logger.info(f"Embedding {len(uncached_texts)} uncached texts")
+                uncached_texts, uncached_indices = zip(*uncached_texts)
+                uncached_embeddings = list(model.embed_multi(uncached_texts, batch_size=batch_size))
+                # TODO: Combine into a single insert with multiple rows for better performance
+                # TODO: Use insert_many or bulk insert instead of individual inserts
+                for i, index in enumerate(uncached_indices):
+                    logger.debug(f"Indexing text at {i}")
+                    embeddings[index] = uncached_embeddings[i]
+                    embeddings_collection.insert(
+                        {"text": uncached_texts[i], "embedding": embeddings[index], "model_id": model_id}
+                    )
+                embeddings_collection.commit()
+        else:
+            logger.info(f"Embedding {len(texts)} texts")
+            # TODO: Add progress callback for large batches without cache
+            embeddings = list(model.embed_multi(texts, batch_size=batch_size))
+        return [np.array(v, dtype=float) for v in embeddings]
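A sketch of the caching behavior implemented above, assuming the llm library is configured with credentials for the chosen embedding model; the cache path is illustrative:

from linkml_store.index.implementations.llm_indexer import LLMIndexer

indexer = LLMIndexer(
    embedding_model_name="text-embedding-ada-002",
    cached_embeddings_database="/tmp/embeddings_cache.db",
)

# the first call embeds via the API and writes (text, model_id, embedding)
# rows to a DuckDB-backed collection; the second is served from that cache
v1 = indexer.text_to_vector("hello world")
v2 = indexer.text_to_vector("hello world")
assert (v1 == v2).all()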
linkml_store/index/implementations/simple_indexer.py
@@ -0,0 +1,43 @@
+import hashlib
+import logging
+
+import numpy as np
+
+from linkml_store.index.indexer import INDEX_ITEM, Indexer
+
+logger = logging.getLogger(__name__)
+
+
+class SimpleIndexer(Indexer):
+    """
+    A simple index implementation that uses a hash function to generate an index from text.
+
+    This uses a naive method to generate an index from text. It is not suitable for production use.
+    """
+
+    def text_to_vector(self, text: str, cache: bool = None, **kwargs) -> INDEX_ITEM:
+        """
+        Convert text to a vector of hashed character-trigram counts. This is a naive method, purely for testing.
+
+        :param text: the text to index
+        :return: a bag-of-trigrams count vector
+        """
+        vector_length = self.vector_default_length
+        text = text.lower()
+        # overlapping character trigrams
+        words = [text[i : i + 3] for i in range(len(text) - 2)]
+
+        vector = np.zeros(vector_length, dtype=float)
+
+        # Iterate over each trigram in the text
+        for word in words:
+            # Generate a hash value for the trigram
+            hash_value = int(hashlib.sha1(word.encode("utf-8")).hexdigest(), 16)
+
+            # Compute the index in the vector using modulo
+            index = hash_value % vector_length
+
+            # Increment the count at the computed index
+            vector[index] += 1.0
+        logger.debug(f"Indexed text: {text} as {vector}")
+        return vector
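Because SimpleIndexer hashes overlapping character trigrams into a fixed-length count vector, strings that share trigrams end up with correlated vectors. A quick demonstration using pairwise_cosine_similarity from the same package:

from linkml_store.index.implementations.simple_indexer import SimpleIndexer
from linkml_store.utils.vector_utils import pairwise_cosine_similarity

indexer = SimpleIndexer()
v_cat = indexer.text_to_vector("category")
v_cats = indexer.text_to_vector("categories")
v_zebra = indexer.text_to_vector("zebra")

# "category" and "categories" share most trigrams ("cat", "ate", "teg", ...),
# so they score far higher than an unrelated word
assert pairwise_cosine_similarity(v_cat, v_cats) > pairwise_cosine_similarity(v_cat, v_zebra)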
linkml_store/index/indexer.py
@@ -0,0 +1,211 @@
+import logging
+from enum import Enum
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import numpy as np
+from pydantic import BaseModel
+
+from linkml_store.utils.vector_utils import mmr_diversified_search, pairwise_cosine_similarity
+
+INDEX_ITEM = np.ndarray
+
+logger = logging.getLogger(__name__)
+
+
+class TemplateSyntaxEnum(str, Enum):
+    """
+    Template syntax types.
+    """
+
+    jinja2 = "jinja2"
+    fstring = "fstring"
+
+
+class Indexer(BaseModel):
+    """
+    An indexer operates on a collection in order to search for objects.
+
+    You should use a subclass of this; the appropriate class can be looked up dynamically:
+
+    >>> from linkml_store.index import get_indexer
+    >>> indexer = get_indexer("simple")
+
+    You can customize how objects are indexed by passing in a text template.
+    For example, if your collection has objects with "name" and "profession" attributes,
+    you can index them as "{name} {profession}".
+
+    >>> indexer = get_indexer("simple", text_template="{name} :: {profession}")
+
+    By default, Python f-strings are assumed.
+
+    We can test this works using the :ref:`object_to_text` method (normally
+    you would never need to call this directly, but it's useful for testing):
+
+    >>> obj = {"name": "John", "profession": "doctor"}
+    >>> indexer.object_to_text(obj)
+    'John :: doctor'
+
+    You can also use Jinja2 templates; this gives more flexibility and logic,
+    e.g. conditional formatting:
+
+    >>> tmpl = "{{name}}{% if profession %} :: {{profession}}{% endif %}"
+    >>> indexer = get_indexer("simple", text_template=tmpl, text_template_syntax=TemplateSyntaxEnum.jinja2)
+    >>> indexer.object_to_text(obj)
+    'John :: doctor'
+    >>> indexer.object_to_text({"name": "John"})
+    'John'
+
+    You can also specify which attributes to index:
+
+    >>> indexer = get_indexer("simple", index_attributes=["name"])
+    >>> indexer.object_to_text(obj)
+    'John'
+
+    The purpose of an indexer is to translate a collection of objects into a collection of objects
+    such as vectors for purposes such as search. Unless you are implementing your own indexer, you
+    generally don't need to use the methods that return vectors, but we can examine their behavior
+    to get a sense of how they work.
+
+    >>> vectors = indexer.objects_to_vectors([{"name": "Aardvark"}, {"name": "Aardwolf"}, {"name": "Zesty"}])
+    >>> assert pairwise_cosine_similarity(vectors[0], vectors[1]) > pairwise_cosine_similarity(vectors[0], vectors[2])
+
+    Note you should consult the documentation for the specific indexer you are using for more details on
+    how text is converted to vectors.
+    """
+
+    name: Optional[str] = None
+    index_type: Optional[str] = None
+    index_function: Optional[Callable] = None
+    distance_function: Optional[Callable] = None
+    index_attributes: Optional[List[str]] = None
+    text_template: Optional[str] = None
+    text_template_syntax: Optional[TemplateSyntaxEnum] = None
+    filter_nulls: Optional[bool] = True
+    vector_default_length: Optional[int] = 1000
+    index_field: Optional[str] = "__index__"
+    index_value_field: Optional[str] = "__index_value__"
+
+    def object_to_vector(self, obj: Dict[str, Any]) -> INDEX_ITEM:
+        """
+        Convert an object to an indexable vector.
+
+        :param obj: the object to index
+        :return: the vector
+        """
+        return self.text_to_vector(self.object_to_text(obj))
+
+    def objects_to_vectors(self, objs: List[Dict[str, Any]]) -> List[INDEX_ITEM]:
+        """
+        Convert a list of objects to indexable vectors.
+
+        :param objs: the objects to index
+        :return: list of vectors
+        """
+        return self.texts_to_vectors([self.object_to_text(obj) for obj in objs])
+
+    def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> List[INDEX_ITEM]:
+        """
+        Convert a list of texts to indexable vectors.
+
+        :param texts: the texts to index
+        :return: list of vectors
+        """
+        return [self.text_to_vector(text, cache=cache, **kwargs) for text in texts]
+
+    def text_to_vector(self, text: str, cache: bool = None, **kwargs) -> INDEX_ITEM:
+        """
+        Convert a text to an indexable vector.
+
+        :param text: the text to index
+        :param cache: whether to cache the result
+        :return: the vector
+        """
+        raise NotImplementedError
+
+    def object_to_text(self, obj: Dict[str, Any]) -> str:
+        """
+        Convert an object to a text representation.
+
+        :param obj: the object to render
+        :return: the text representation
+        """
+        if self.index_attributes:
+            if len(self.index_attributes) == 1 and not self.text_template:
+                return str(obj[self.index_attributes[0]])
+            obj = {k: v for k, v in obj.items() if k in self.index_attributes}
+        if self.filter_nulls:
+            obj = {k: v for k, v in obj.items() if v is not None}
+        if self.text_template:
+            syntax = self.text_template_syntax
+            if not syntax:
+                if "{%" in self.text_template or "{{" in self.text_template:
+                    logger.info("Detected Jinja2 syntax in text template")
+                    syntax = TemplateSyntaxEnum.jinja2
+            if not syntax:
+                syntax = TemplateSyntaxEnum.fstring
+            if syntax == TemplateSyntaxEnum.jinja2:
+                from jinja2 import Template
+
+                template = Template(self.text_template)
+                return template.render(**obj)
+            elif syntax == TemplateSyntaxEnum.fstring:
+                return self.text_template.format(**obj)
+            else:
+                raise NotImplementedError(f"Cannot handle template syntax: {syntax}")
+        return str(obj)
+
+    def search(
+        self,
+        query: str,
+        vectors: List[Tuple[str, INDEX_ITEM]],
+        limit: Optional[int] = None,
+        mmr_relevance_factor: Optional[float] = None,
+    ) -> List[Tuple[float, Any]]:
+        """
+        Use the indexer to search against a database of vectors.
+
+        Note: this is a low-level method; typically you would use the :ref:`search` method on a :ref:`Collection`.
+
+        :param query: The query string to search for
+        :param vectors: A list of indexed items, where each item is a tuple of (id, vector)
+        :param limit: The maximum number of results to return (optional)
+        :return: A list of (score, id) tuples that match the query, best first
+        """
+
+        # Convert the query string to a vector
+        query_vector = self.text_to_vector(query, cache=False)
+
+        if mmr_relevance_factor is not None:
+            vlist = [v for _, v in vectors]
+            idlist = [id for id, _ in vectors]
+            sorted_indices = mmr_diversified_search(
+                query_vector, vlist, relevance_factor=mmr_relevance_factor, top_n=limit
+            )
+            results = []
+            # TODO: this is inefficient when limit is high
+            # (slicing with limit=None returns all results)
+            for pos in sorted_indices[:limit]:
+                score = pairwise_cosine_similarity(query_vector, vlist[pos])
+                results.append((score, idlist[pos]))
+            return results
+
+        similarities = []
+
+        # Iterate over each indexed item
+        for item_id, item_vector in vectors:
+            # Score each item by cosine similarity to the query
+            # (a Euclidean alternative: 1 - np.linalg.norm(query_vector - item_vector))
+            similarity = pairwise_cosine_similarity(query_vector, item_vector)
+            similarities.append((similarity, item_id))
+
+        # Sort by similarity, highest first
+        similarities.sort(key=lambda x: -x[0])
+
+        # Limit the number of results if specified
+        if limit is not None:
+            similarities = similarities[:limit]
+
+        return similarities
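A sketch of the low-level search method above, using the simple indexer against an in-memory list of (id, vector) pairs; with mmr_relevance_factor set, results are re-ranked for diversity via mmr_diversified_search:

from linkml_store.index import get_indexer

indexer = get_indexer("simple")
docs = {
    "d1": "maximal marginal relevance",
    "d2": "cosine similarity search",
    "d3": "duck pond",
}
vectors = [(doc_id, indexer.text_to_vector(text)) for doc_id, text in docs.items()]

# plain cosine ranking, best first
for score, doc_id in indexer.search("marginal relevance", vectors, limit=2):
    print(f"{doc_id}: {score:.3f}")

# diversified ranking; relevance_factor trades off relevance vs. diversity
diversified = indexer.search("similarity", vectors, limit=2, mmr_relevance_factor=0.8)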
linkml_store/inference/__init__.py
@@ -0,0 +1,13 @@
+"""
+Inference engine package.
+"""
+
+from linkml_store.inference.inference_config import InferenceConfig
+from linkml_store.inference.inference_engine import InferenceEngine
+from linkml_store.inference.inference_engine_registry import get_inference_engine
+
+__all__ = [
+    "InferenceEngine",
+    "InferenceConfig",
+    "get_inference_engine",
+]
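By analogy with get_indexer earlier in this diff, the registry exposes inference engines by name; the manifest lists llm, rag, rule_based, and sklearn implementations. The sketch below is inferred from the module layout only, and both the config field name and the keyword argument are assumptions; consult inference_engine_registry.py and inference_config.py for the actual signatures:

from linkml_store.inference import InferenceConfig, get_inference_engine

# hypothetical usage: an engine-type name plus a config naming the target slot
config = InferenceConfig(target_attributes=["species"])  # assumed field name
engine = get_inference_engine("sklearn", config=config)  # assumed kwarg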