linkml-store 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. linkml_store/__init__.py +7 -0
  2. linkml_store/api/__init__.py +8 -0
  3. linkml_store/api/client.py +414 -0
  4. linkml_store/api/collection.py +1280 -0
  5. linkml_store/api/config.py +187 -0
  6. linkml_store/api/database.py +862 -0
  7. linkml_store/api/queries.py +69 -0
  8. linkml_store/api/stores/__init__.py +0 -0
  9. linkml_store/api/stores/chromadb/__init__.py +7 -0
  10. linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
  11. linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
  12. linkml_store/api/stores/dremio/__init__.py +10 -0
  13. linkml_store/api/stores/dremio/dremio_collection.py +555 -0
  14. linkml_store/api/stores/dremio/dremio_database.py +1052 -0
  15. linkml_store/api/stores/dremio/mappings.py +105 -0
  16. linkml_store/api/stores/dremio_rest/__init__.py +11 -0
  17. linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
  18. linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
  19. linkml_store/api/stores/duckdb/__init__.py +16 -0
  20. linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
  21. linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
  22. linkml_store/api/stores/duckdb/mappings.py +8 -0
  23. linkml_store/api/stores/filesystem/__init__.py +15 -0
  24. linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
  25. linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
  26. linkml_store/api/stores/hdf5/__init__.py +7 -0
  27. linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
  28. linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
  29. linkml_store/api/stores/ibis/__init__.py +5 -0
  30. linkml_store/api/stores/ibis/ibis_collection.py +488 -0
  31. linkml_store/api/stores/ibis/ibis_database.py +328 -0
  32. linkml_store/api/stores/mongodb/__init__.py +25 -0
  33. linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
  34. linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
  35. linkml_store/api/stores/neo4j/__init__.py +0 -0
  36. linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
  37. linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
  38. linkml_store/api/stores/solr/__init__.py +3 -0
  39. linkml_store/api/stores/solr/solr_collection.py +224 -0
  40. linkml_store/api/stores/solr/solr_database.py +83 -0
  41. linkml_store/api/stores/solr/solr_utils.py +0 -0
  42. linkml_store/api/types.py +4 -0
  43. linkml_store/cli.py +1147 -0
  44. linkml_store/constants.py +7 -0
  45. linkml_store/graphs/__init__.py +0 -0
  46. linkml_store/graphs/graph_map.py +24 -0
  47. linkml_store/index/__init__.py +53 -0
  48. linkml_store/index/implementations/__init__.py +0 -0
  49. linkml_store/index/implementations/llm_indexer.py +174 -0
  50. linkml_store/index/implementations/simple_indexer.py +43 -0
  51. linkml_store/index/indexer.py +211 -0
  52. linkml_store/inference/__init__.py +13 -0
  53. linkml_store/inference/evaluation.py +195 -0
  54. linkml_store/inference/implementations/__init__.py +0 -0
  55. linkml_store/inference/implementations/llm_inference_engine.py +154 -0
  56. linkml_store/inference/implementations/rag_inference_engine.py +276 -0
  57. linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
  58. linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
  59. linkml_store/inference/inference_config.py +66 -0
  60. linkml_store/inference/inference_engine.py +209 -0
  61. linkml_store/inference/inference_engine_registry.py +74 -0
  62. linkml_store/plotting/__init__.py +5 -0
  63. linkml_store/plotting/cli.py +826 -0
  64. linkml_store/plotting/dimensionality_reduction.py +453 -0
  65. linkml_store/plotting/embedding_plot.py +489 -0
  66. linkml_store/plotting/facet_chart.py +73 -0
  67. linkml_store/plotting/heatmap.py +383 -0
  68. linkml_store/utils/__init__.py +0 -0
  69. linkml_store/utils/change_utils.py +17 -0
  70. linkml_store/utils/dat_parser.py +95 -0
  71. linkml_store/utils/embedding_matcher.py +424 -0
  72. linkml_store/utils/embedding_utils.py +299 -0
  73. linkml_store/utils/enrichment_analyzer.py +217 -0
  74. linkml_store/utils/file_utils.py +37 -0
  75. linkml_store/utils/format_utils.py +550 -0
  76. linkml_store/utils/io.py +38 -0
  77. linkml_store/utils/llm_utils.py +122 -0
  78. linkml_store/utils/mongodb_utils.py +145 -0
  79. linkml_store/utils/neo4j_utils.py +42 -0
  80. linkml_store/utils/object_utils.py +190 -0
  81. linkml_store/utils/pandas_utils.py +93 -0
  82. linkml_store/utils/patch_utils.py +126 -0
  83. linkml_store/utils/query_utils.py +89 -0
  84. linkml_store/utils/schema_utils.py +23 -0
  85. linkml_store/utils/sklearn_utils.py +193 -0
  86. linkml_store/utils/sql_utils.py +177 -0
  87. linkml_store/utils/stats_utils.py +53 -0
  88. linkml_store/utils/vector_utils.py +158 -0
  89. linkml_store/webapi/__init__.py +0 -0
  90. linkml_store/webapi/html/__init__.py +3 -0
  91. linkml_store/webapi/html/base.html.j2 +24 -0
  92. linkml_store/webapi/html/collection_details.html.j2 +15 -0
  93. linkml_store/webapi/html/database_details.html.j2 +16 -0
  94. linkml_store/webapi/html/databases.html.j2 +14 -0
  95. linkml_store/webapi/html/generic.html.j2 +43 -0
  96. linkml_store/webapi/main.py +855 -0
  97. linkml_store-0.3.0.dist-info/METADATA +226 -0
  98. linkml_store-0.3.0.dist-info/RECORD +101 -0
  99. linkml_store-0.3.0.dist-info/WHEEL +4 -0
  100. linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
  101. linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
@@ -0,0 +1,7 @@
1
import pystow

# Names exported by a star-import of this module.
__all__ = ["LINKML_STORE_MODULE"]

# Module object returned by pystow for the ("linkml", "store") key path.
LINKML_STORE_MODULE = pystow.module("linkml", "store")
File without changes
@@ -0,0 +1,24 @@
1
+ from abc import ABC
2
+ from typing import Optional
3
+
4
+ from pydantic import BaseModel
5
+
6
# Default attribute names used by the graph projection models in this module.
DEFAULT_IDENTIFIER_ATTRIBUTE = "id"  # default for GraphProjection.identifier_attribute
DEFAULT_CATEGORY_LABELS_ATTRIBUTE = "category"  # default for NodeProjection.category_labels_attribute
DEFAULT_SUBJECT_ATTRIBUTE = "subject"  # default for EdgeProjection.subject_attribute
DEFAULT_PREDICATE_ATTRIBUTE = "predicate"  # default for EdgeProjection.predicate_attribute
DEFAULT_OBJECT_ATTRIBUTE = "object"  # default for EdgeProjection.object_attribute
11
+
12
+
13
class GraphProjection(BaseModel, ABC):
    """
    Base model for graph projections.

    Holds the name of the attribute used as the identifier; subclasses add
    node- or edge-specific attribute names.
    """

    # Name of the attribute that carries the identifier.
    identifier_attribute: str = DEFAULT_IDENTIFIER_ATTRIBUTE
15
+
16
+
17
class NodeProjection(GraphProjection):
    """
    Graph projection for nodes.

    Adds the name of the attribute that carries category labels.
    """

    # Name of the attribute that carries category labels; may be None.
    category_labels_attribute: Optional[str] = DEFAULT_CATEGORY_LABELS_ATTRIBUTE
19
+
20
+
21
class EdgeProjection(GraphProjection):
    """
    Graph projection for edges.

    Adds the names of the attributes that carry the subject, predicate and
    object of an edge.
    """

    # Name of the attribute that carries the edge subject.
    subject_attribute: str = DEFAULT_SUBJECT_ATTRIBUTE
    # Name of the attribute that carries the edge predicate.
    predicate_attribute: str = DEFAULT_PREDICATE_ATTRIBUTE
    # Name of the attribute that carries the edge object.
    object_attribute: str = DEFAULT_OBJECT_ATTRIBUTE
@@ -0,0 +1,53 @@
1
+ """
2
+ Indexers package.
3
+
4
+ Indexers allow indexes to be added to existing :class:`Collection` objects.
5
+
6
+ Currently two are supported:
7
+
8
+ * simple: :class:`SimpleIndexer`
9
+ * llm: :class:`LLMIndexer`
10
+ """
11
+
12
+ from typing import Type
13
+
14
+ from linkml_store.index.implementations.llm_indexer import LLMIndexer
15
+ from linkml_store.index.implementations.simple_indexer import SimpleIndexer
16
+ from linkml_store.index.indexer import Indexer
17
+
18
# Registry mapping each supported indexer name to the class that implements it.
INDEXER_CLASSES = dict(
    simple=SimpleIndexer,
    llm=LLMIndexer,
)
22
+
23
+
24
def get_indexer_class(name: str) -> Type[Indexer]:
    """
    Look up the indexer class registered under a name.

    :param name: the name of the indexer (simple, llm, ...)
    :return: the indexer class
    :raises ValueError: if no indexer class is registered under ``name``
    """
    indexer_cls = INDEXER_CLASSES.get(name)
    if indexer_cls is None:
        raise ValueError(f"Unknown indexer class: {name}")
    return indexer_cls
34
+
35
+
36
def get_indexer(index_type: str, **kwargs) -> Indexer:
    """
    Instantiate the indexer registered under ``index_type``.

    >>> simple_indexer = get_indexer("simple")
    >>> llm_indexer = get_indexer("llm")

    :param index_type: the name of the indexer (simple, llm, ...)
    :param kwargs: additional arguments to pass to the indexer;
        entries whose value is None are dropped before the call
    :return: the indexer
    :raises ValueError: if ``index_type`` is not a registered indexer name
    """
    indexer_cls = get_indexer_class(index_type)
    # Drop unset options so the indexer's own field defaults apply.
    init_args = {key: value for key, value in kwargs.items() if value is not None}
    init_args["index_type"] = index_type
    instance = indexer_cls(**init_args)
    # Fall back to the type name when the caller did not name the indexer.
    if not instance.name:
        instance.name = index_type
    return instance
File without changes
@@ -0,0 +1,174 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import TYPE_CHECKING, List, Optional
4
+
5
+ import numpy as np
6
+
7
+ from linkml_store.api.config import CollectionConfig
8
+ from linkml_store.index.indexer import INDEX_ITEM, Indexer
9
+ from linkml_store.utils.llm_utils import get_token_limit, render_formatted_text
10
+
11
+ if TYPE_CHECKING:
12
+ import llm
13
+
14
+ CHUNK_SIZE = 1000
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class LLMIndexer(Indexer):
    """
    An indexer that wraps the llm library.

    This indexer is used to convert text to vectors using the llm library.

    >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
    >>> vector = indexer.text_to_vector("hello")

    TODO: Implement true batching for embedding API calls
    TODO: Add batch_size parameter to control batch processing
    TODO: Support batch embedding APIs (e.g., OpenAI batch endpoint)
    TODO: Add progress reporting for large batch operations
    TODO: Implement smart batching with accumulation and flushing
    """

    # Name passed to ``llm.get_embedding_model`` and to tiktoken's ``encoding_for_model``.
    embedding_model_name: str = "text-embedding-ada-002"
    # Populated lazily by the ``embedding_model`` property.
    _embedding_model: "llm.EmbeddingModel" = None
    # Path of the DuckDB file used as an embedding cache; caching is skipped when unset.
    cached_embeddings_database: Optional[str] = None
    # Collection name inside the cache database; "all_embeddings" is used when unset.
    cached_embeddings_collection: Optional[str] = None
    # When true, the cache is consulted even if the caller passes ``cache=False``.
    cache_queries: bool = False
    truncation_method: Optional[str] = None
    # TODO: Add batch_size: int = 100 parameter for batch processing
    # TODO: Add supported_models class variable with model metadata (dims, costs, limits)
    # TODO: Add model_validation to check if model exists before use

    @property
    def embedding_model(self):
        """
        Return the llm embedding model, loading it on first access.

        :return: the model returned by ``llm.get_embedding_model`` for ``embedding_model_name``
        """
        if self._embedding_model is None:
            # Imported lazily so the llm package is only needed when a model is requested.
            import llm

            self._embedding_model = llm.get_embedding_model(self.embedding_model_name)
        return self._embedding_model

    def text_to_vector(self, text: str, cache: Optional[bool] = None, **kwargs) -> INDEX_ITEM:
        """
        Convert a text to an indexable object

        >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
        >>> vector = indexer.text_to_vector("hello")

        :param text: the text to embed
        :param cache: forwarded to :meth:`texts_to_vectors`
        :param kwargs: forwarded to :meth:`texts_to_vectors`
        :return: the embedding of ``text``
        """
        return self.texts_to_vectors([text], cache=cache, **kwargs)[0]

    def texts_to_vectors(
        self,
        texts: List[str],
        cache: Optional[bool] = None,
        token_limit_penalty=0,
        batch_size: Optional[int] = None,
        **kwargs,
    ) -> List[INDEX_ITEM]:
        """
        Use LLM to embed.

        >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
        >>> vectors = indexer.texts_to_vectors(["hello", "goodbye"])

        :param texts: the texts to embed; an empty list yields an empty result
        :param cache: when a cache database is configured, the cache is used unless this is
            explicitly False (and even then if ``cache_queries`` is set)
        :param token_limit_penalty: amount subtracted from the model's token limit before truncation
        :param batch_size: batch size passed to the model's ``embed_multi``; when None it is derived
            from the token limit and the average token count of the (truncated) texts
        :return: one float numpy array per input text, in input order
        :raises ValueError: if caching is requested but the model has no model ID
        """
        if len(texts) == 0:
            # Nothing to embed; this also avoids dividing by zero when averaging token counts.
            return []

        from tiktoken import encoding_for_model

        logger.info(f"Converting {len(texts)} texts to vectors")
        model = self.embedding_model
        # TODO: make this more accurate
        token_limit = get_token_limit(model.model_id) - token_limit_penalty
        logger.info(f"Token limit for {model.model_id}: {token_limit}")
        encoding = encoding_for_model(self.embedding_model_name)

        def truncate_text(text: str) -> str:
            # Split the text into CHUNK_SIZE-character pieces and let render_formatted_text
            # reassemble them; presumably it drops pieces until the result fits within
            # token_limit -- confirm against linkml_store.utils.llm_utils.
            parts = [text[i : i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
            truncated = render_formatted_text(
                lambda x: "".join(x),
                parts,
                encoding,
                token_limit,
            )
            logger.debug(f"Truncated text from {len(text)} to {len(truncated)}")
            return truncated

        texts = [truncate_text(text) for text in texts]
        # Calculate average number of tokens per text for accurate batch sizing
        text_token_counts = [len(encoding.encode(t)) for t in texts]
        avg_text_tokens = sum(text_token_counts) / len(text_token_counts)
        logger.info(f"Average text token count: {avg_text_tokens}")
        if batch_size is None:
            # TODO: empirically determine best batch size
            if avg_text_tokens > 0:
                batch_size = max(int(token_limit / avg_text_tokens), 5)
            else:
                # All texts are empty: there is no token budget to divide, so use one batch.
                batch_size = max(len(texts), 5)
            logger.info(f"Setting batch size to {batch_size}")

        if self.cached_embeddings_database and (cache is None or cache or self.cache_queries):
            model_id = model.model_id
            if not model_id:
                raise ValueError("Model ID is required to cache embeddings")
            db_path = Path(self.cached_embeddings_database)
            coll_name = self.cached_embeddings_collection
            if not coll_name:
                coll_name = "all_embeddings"
            from linkml_store import Client

            embeddings_client = Client()
            config = CollectionConfig(
                alias=coll_name,
                type="Embeddings",
                attributes={
                    "text": {"range": "string"},
                    "model_id": {"range": "string"},
                    "embedding": {"range": "float", "array": {}},
                },
            )
            embeddings_db = embeddings_client.get_database(f"duckdb:///{db_path}")
            # NOTE(review): both branches make the same call. The lookup is kept because
            # list_collection_names() may initialise database state -- confirm before merging them.
            if coll_name in embeddings_db.list_collection_names():
                # Load existing collection and use its model
                embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
            else:
                embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)

            # Filled by position so the result order matches the input order.
            embeddings = [None] * len(texts)
            uncached_texts = []
            n = 0
            # TODO: Implement batch lookup for cache checking (single query for all texts)
            # TODO: Use IN clause or batch query to check multiple texts at once
            logger.info(f"Checking cache for {len(texts)} texts")
            for i, text in enumerate(texts):
                # TODO: optimize this - currently makes N database queries for N texts
                logger.debug(f"Looking for cached embedding for {text}")
                r = embeddings_collection.find({"text": text, "model_id": model_id})
                if r.num_rows:
                    embeddings[i] = r.rows[0]["embedding"]
                    n += 1
                    logger.info("Found")
                else:
                    uncached_texts.append((text, i))
                    logger.info("NOT Found")
            logger.info(f"Found {n} cached embeddings")
            if uncached_texts:
                logger.info(f"Embedding {len(uncached_texts)} uncached texts")
                uncached_texts, uncached_indices = zip(*uncached_texts)
                uncached_embeddings = list(model.embed_multi(uncached_texts, batch_size=batch_size))
                # TODO: Combine into a single insert with multiple rows for better performance
                # TODO: Use insert_many or bulk insert instead of individual inserts
                for i, index in enumerate(uncached_indices):
                    logger.debug(f"Indexing text at {i}")
                    embeddings[index] = uncached_embeddings[i]
                    embeddings_collection.insert(
                        {"text": uncached_texts[i], "embedding": embeddings[index], "model_id": model_id}
                    )
                embeddings_collection.commit()
        else:
            logger.info(f"Embedding {len(texts)} texts")
            # TODO: Add progress callback for large batches without cache
            embeddings = list(model.embed_multi(texts, batch_size=batch_size))
        return [np.array(v, dtype=float) for v in embeddings]
@@ -0,0 +1,43 @@
1
+ import hashlib
2
+ import logging
3
+
4
+ import numpy as np
5
+
6
+ from linkml_store.index.indexer import INDEX_ITEM, Indexer
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class SimpleIndexer(Indexer):
    """
    A simple indexer that uses a hash function to generate an index from text.

    This uses a naive method to generate an index from text. It is not suitable for production use.
    """

    def text_to_vector(self, text: str, cache: bool = None, **kwargs) -> INDEX_ITEM:
        """
        Hash the character trigrams of a text into a fixed-length count vector.

        This is a naive method purely for testing.

        :param text: the text to index; it is lowercased before hashing
        :param cache: accepted for interface compatibility; not used here
        :return: float vector of length ``vector_default_length`` holding trigram counts per bucket
        """
        n_buckets = self.vector_default_length
        text = text.lower()
        vector = np.zeros(n_buckets, dtype=float)

        # Slide a 3-character window over the text; texts shorter than 3 characters yield no trigrams.
        for start in range(len(text) - 2):
            trigram = text[start : start + 3]
            digest = hashlib.sha1(trigram.encode("utf-8")).hexdigest()
            # Map the SHA-1 digest onto a bucket and count the trigram there.
            bucket = int(digest, 16) % n_buckets
            vector[bucket] += 1.0
        logger.debug(f"Indexed text: {text} as {vector}")
        return vector
@@ -0,0 +1,211 @@
1
+ import logging
2
+ from enum import Enum
3
+ from typing import Any, Callable, Dict, List, Optional, Tuple
4
+
5
+ import numpy as np
6
+ from pydantic import BaseModel
7
+
8
+ from linkml_store.utils.vector_utils import mmr_diversified_search, pairwise_cosine_similarity
9
+
10
+ INDEX_ITEM = np.ndarray
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class TemplateSyntaxEnum(str, Enum):
    """
    Supported syntaxes for text templates.

    Members are also plain strings, so they compare equal to their values.
    """

    # Jinja2 template syntax, e.g. "{{name}}".
    jinja2 = "jinja2"
    # Python format-string syntax, e.g. "{name}".
    fstring = "fstring"
22
+
23
+
24
class Indexer(BaseModel):
    """
    An indexer operates on a collection in order to search for objects.

    You should use a subclass of this; this can be looked up dynamically:

        >>> from linkml_store.index import get_indexer
        >>> indexer = get_indexer("simple")

    You can customize how objects are indexed by passing in a text template.
    For example, if your collection has objects with "name" and "profession" attributes,
    you can index them as "{name} {profession}".

        >>> indexer = get_indexer("simple", text_template="{name} :: {profession}")

    By default, python fstrings are assumed.

    We can test this works using the :ref:`object_to_text` method (normally
    you would never need to call this directly, but it's useful for testing):

        >>> obj = {"name": "John", "profession": "doctor"}
        >>> indexer.object_to_text(obj)
        'John :: doctor'

    You can also use Jinja2 templates; this gives more flexibility and logic,
    e.g. conditional formatting:

        >>> tmpl = "{{name}}{% if profession %} :: {{profession}}{% endif %}"
        >>> indexer = get_indexer("simple", text_template=tmpl, text_template_syntax=TemplateSyntaxEnum.jinja2)
        >>> indexer.object_to_text(obj)
        'John :: doctor'
        >>> indexer.object_to_text({"name": "John"})
        'John'

    You can also specify which attributes to index:

        >>> indexer = get_indexer("simple", index_attributes=["name"])
        >>> indexer.object_to_text(obj)
        'John'

    The purpose of an indexer is to translate a collection of objects into a collection of objects
    such as vectors for purposes such as search. Unless you are implementing your own indexer, you
    generally don't need to use the methods that return vectors, but we can examine their behavior
    to get a sense of how they work.

        >>> vectors = indexer.objects_to_vectors([{"name": "Aardvark"}, {"name": "Aardwolf"}, {"name": "Zesty"}])
        >>> assert pairwise_cosine_similarity(vectors[0], vectors[1]) > pairwise_cosine_similarity(vectors[0], vectors[2])

    Note you should consult the documentation for the specific indexer you are using for more details on
    how text is converted to vectors.

    """

    name: Optional[str] = None
    index_type: Optional[str] = None
    index_function: Optional[Callable] = None
    distance_function: Optional[Callable] = None
    # When set, only these attributes of an object contribute to its text form.
    index_attributes: Optional[List[str]] = None
    # Template used by object_to_text; its syntax is auto-detected when text_template_syntax is unset.
    text_template: Optional[str] = None
    text_template_syntax: Optional[TemplateSyntaxEnum] = None
    # When true, attributes whose value is None are removed before rendering.
    filter_nulls: Optional[bool] = True
    vector_default_length: Optional[int] = 1000
    index_field: Optional[str] = "__index__"
    index_value_field: Optional[str] = "__index_value__"

    def object_to_vector(self, obj: Dict[str, Any]) -> INDEX_ITEM:
        """
        Convert an object to an indexable object

        :param obj: the object to index
        :return: the vector for the object's text form
        """
        return self.text_to_vector(self.object_to_text(obj))

    def objects_to_vectors(self, objs: List[Dict[str, Any]]) -> List[INDEX_ITEM]:
        """
        Convert a list of objects to indexable objects

        :param objs: the objects to index
        :return: list of vectors
        """
        return self.texts_to_vectors([self.object_to_text(obj) for obj in objs])

    def texts_to_vectors(self, texts: List[str], cache: Optional[bool] = None, **kwargs) -> List[INDEX_ITEM]:
        """
        Convert a list of texts to indexable objects

        :param texts: the texts to index
        :param cache: forwarded to :meth:`text_to_vector`
        :param kwargs: forwarded to :meth:`text_to_vector`
        :return: one vector per text, in input order
        """
        return [self.text_to_vector(text, cache=cache, **kwargs) for text in texts]

    def text_to_vector(self, text: str, cache: Optional[bool] = None, **kwargs) -> INDEX_ITEM:
        """
        Convert a text to an indexable object

        Subclasses must override this method.

        :param text: the text to index
        :param cache: caching hint; its meaning is defined by the subclass
        :return: the vector for the text
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def object_to_text(self, obj: Dict[str, Any]) -> str:
        """
        Convert an object to a text representation

        :param obj: the object to render
        :return: the rendered template if ``text_template`` is set; the single indexed attribute's
            value as a string if exactly one index attribute is configured without a template;
            otherwise ``str()`` of the (filtered) object
        :raises KeyError: if the single configured index attribute is missing from ``obj``, or an
            fstring template refers to a key that is not present after filtering
        :raises NotImplementedError: if the template syntax is not supported
        """
        if self.index_attributes:
            if len(self.index_attributes) == 1 and not self.text_template:
                return str(obj[self.index_attributes[0]])
            obj = {k: v for k, v in obj.items() if k in self.index_attributes}
        if self.filter_nulls:
            obj = {k: v for k, v in obj.items() if v is not None}
        if self.text_template:
            syntax = self.text_template_syntax
            if not syntax:
                # No explicit syntax: treat Jinja2 delimiters as a sign of a Jinja2 template.
                if "{%" in self.text_template or "{{" in self.text_template:
                    logger.info("Detected Jinja2 syntax in text template")
                    syntax = TemplateSyntaxEnum.jinja2
            if not syntax:
                syntax = TemplateSyntaxEnum.fstring
            if syntax == TemplateSyntaxEnum.jinja2:
                from jinja2 import Template

                template = Template(self.text_template)
                return template.render(**obj)
            elif syntax == TemplateSyntaxEnum.fstring:
                return self.text_template.format(**obj)
            else:
                raise NotImplementedError(f"Cannot handle template syntax: {syntax}")
        return str(obj)

    def search(
        self,
        query: str,
        vectors: List[Tuple[str, INDEX_ITEM]],
        limit: Optional[int] = None,
        mmr_relevance_factor: Optional[float] = None,
    ) -> List[Tuple[float, Any]]:
        """
        Use the indexer to search against a database of vectors.

        Note: this is a low-level method, typically you would use the :ref:`search` method on a :ref:`Collection`.

        :param query: The query string to search for
        :param vectors: A list of indexed items, where each item is a tuple of (id, vector)
        :param limit: The maximum number of results to return (optional)
        :param mmr_relevance_factor: When not None, results are selected and ordered by
            ``mmr_diversified_search`` with this relevance factor instead of by plain similarity
        :return: A list of (score, id) tuples, where score is the cosine similarity between the
            query vector and the item vector; without MMR the list is sorted by descending score
        """

        # Convert the query string to a vector
        query_vector = self.text_to_vector(query, cache=False)

        if mmr_relevance_factor is not None:
            item_vectors = [vector for _, vector in vectors]
            item_ids = [item_id for item_id, _ in vectors]
            sorted_indices = mmr_diversified_search(
                query_vector, item_vectors, relevance_factor=mmr_relevance_factor, top_n=limit
            )
            # limit may be None (no cap); range(None) would raise TypeError.
            n_results = len(sorted_indices) if limit is None else min(limit, len(sorted_indices))
            results = []
            for i in range(n_results):
                pos = sorted_indices[i]
                score = pairwise_cosine_similarity(query_vector, item_vectors[pos])
                results.append((score, item_ids[pos]))
            return results

        distances = []

        # Iterate over each indexed item
        for item_id, item_vector in vectors:
            # Despite the variable name, this is a cosine similarity: higher means closer.
            distance = pairwise_cosine_similarity(query_vector, item_vector)
            distances.append((distance, item_id))

        # Sort by descending similarity (most similar first)
        distances.sort(key=lambda x: -x[0])

        # Limit the number of results if specified
        if limit is not None:
            distances = distances[:limit]

        return distances
@@ -0,0 +1,13 @@
1
+ """
2
+ inference engine package.
3
+ """
4
+
5
+ from linkml_store.inference.inference_config import InferenceConfig
6
+ from linkml_store.inference.inference_engine import InferenceEngine
7
+ from linkml_store.inference.inference_engine_registry import get_inference_engine
8
+
9
+ __all__ = [
10
+ "InferenceEngine",
11
+ "InferenceConfig",
12
+ "get_inference_engine",
13
+ ]