mseep_txtai-9.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251)
  1. mseep_txtai-9.1.1.dist-info/METADATA +262 -0
  2. mseep_txtai-9.1.1.dist-info/RECORD +251 -0
  3. mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
  4. mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
  5. mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
  6. txtai/__init__.py +16 -0
  7. txtai/agent/__init__.py +12 -0
  8. txtai/agent/base.py +54 -0
  9. txtai/agent/factory.py +39 -0
  10. txtai/agent/model.py +107 -0
  11. txtai/agent/placeholder.py +16 -0
  12. txtai/agent/tool/__init__.py +7 -0
  13. txtai/agent/tool/embeddings.py +69 -0
  14. txtai/agent/tool/factory.py +130 -0
  15. txtai/agent/tool/function.py +49 -0
  16. txtai/ann/__init__.py +7 -0
  17. txtai/ann/base.py +153 -0
  18. txtai/ann/dense/__init__.py +11 -0
  19. txtai/ann/dense/annoy.py +72 -0
  20. txtai/ann/dense/factory.py +76 -0
  21. txtai/ann/dense/faiss.py +233 -0
  22. txtai/ann/dense/hnsw.py +104 -0
  23. txtai/ann/dense/numpy.py +164 -0
  24. txtai/ann/dense/pgvector.py +323 -0
  25. txtai/ann/dense/sqlite.py +303 -0
  26. txtai/ann/dense/torch.py +38 -0
  27. txtai/ann/sparse/__init__.py +7 -0
  28. txtai/ann/sparse/factory.py +61 -0
  29. txtai/ann/sparse/ivfsparse.py +377 -0
  30. txtai/ann/sparse/pgsparse.py +56 -0
  31. txtai/api/__init__.py +18 -0
  32. txtai/api/application.py +134 -0
  33. txtai/api/authorization.py +53 -0
  34. txtai/api/base.py +159 -0
  35. txtai/api/cluster.py +295 -0
  36. txtai/api/extension.py +19 -0
  37. txtai/api/factory.py +40 -0
  38. txtai/api/responses/__init__.py +7 -0
  39. txtai/api/responses/factory.py +30 -0
  40. txtai/api/responses/json.py +56 -0
  41. txtai/api/responses/messagepack.py +51 -0
  42. txtai/api/route.py +41 -0
  43. txtai/api/routers/__init__.py +25 -0
  44. txtai/api/routers/agent.py +38 -0
  45. txtai/api/routers/caption.py +42 -0
  46. txtai/api/routers/embeddings.py +280 -0
  47. txtai/api/routers/entity.py +42 -0
  48. txtai/api/routers/extractor.py +28 -0
  49. txtai/api/routers/labels.py +47 -0
  50. txtai/api/routers/llm.py +61 -0
  51. txtai/api/routers/objects.py +42 -0
  52. txtai/api/routers/openai.py +191 -0
  53. txtai/api/routers/rag.py +61 -0
  54. txtai/api/routers/reranker.py +46 -0
  55. txtai/api/routers/segmentation.py +42 -0
  56. txtai/api/routers/similarity.py +48 -0
  57. txtai/api/routers/summary.py +46 -0
  58. txtai/api/routers/tabular.py +42 -0
  59. txtai/api/routers/textractor.py +42 -0
  60. txtai/api/routers/texttospeech.py +33 -0
  61. txtai/api/routers/transcription.py +42 -0
  62. txtai/api/routers/translation.py +46 -0
  63. txtai/api/routers/upload.py +36 -0
  64. txtai/api/routers/workflow.py +28 -0
  65. txtai/app/__init__.py +5 -0
  66. txtai/app/base.py +821 -0
  67. txtai/archive/__init__.py +9 -0
  68. txtai/archive/base.py +104 -0
  69. txtai/archive/compress.py +51 -0
  70. txtai/archive/factory.py +25 -0
  71. txtai/archive/tar.py +49 -0
  72. txtai/archive/zip.py +35 -0
  73. txtai/cloud/__init__.py +8 -0
  74. txtai/cloud/base.py +106 -0
  75. txtai/cloud/factory.py +70 -0
  76. txtai/cloud/hub.py +101 -0
  77. txtai/cloud/storage.py +125 -0
  78. txtai/console/__init__.py +5 -0
  79. txtai/console/__main__.py +22 -0
  80. txtai/console/base.py +264 -0
  81. txtai/data/__init__.py +10 -0
  82. txtai/data/base.py +138 -0
  83. txtai/data/labels.py +42 -0
  84. txtai/data/questions.py +135 -0
  85. txtai/data/sequences.py +48 -0
  86. txtai/data/texts.py +68 -0
  87. txtai/data/tokens.py +28 -0
  88. txtai/database/__init__.py +14 -0
  89. txtai/database/base.py +342 -0
  90. txtai/database/client.py +227 -0
  91. txtai/database/duckdb.py +150 -0
  92. txtai/database/embedded.py +76 -0
  93. txtai/database/encoder/__init__.py +8 -0
  94. txtai/database/encoder/base.py +37 -0
  95. txtai/database/encoder/factory.py +56 -0
  96. txtai/database/encoder/image.py +43 -0
  97. txtai/database/encoder/serialize.py +28 -0
  98. txtai/database/factory.py +77 -0
  99. txtai/database/rdbms.py +569 -0
  100. txtai/database/schema/__init__.py +6 -0
  101. txtai/database/schema/orm.py +99 -0
  102. txtai/database/schema/statement.py +98 -0
  103. txtai/database/sql/__init__.py +8 -0
  104. txtai/database/sql/aggregate.py +178 -0
  105. txtai/database/sql/base.py +189 -0
  106. txtai/database/sql/expression.py +404 -0
  107. txtai/database/sql/token.py +342 -0
  108. txtai/database/sqlite.py +57 -0
  109. txtai/embeddings/__init__.py +7 -0
  110. txtai/embeddings/base.py +1107 -0
  111. txtai/embeddings/index/__init__.py +14 -0
  112. txtai/embeddings/index/action.py +15 -0
  113. txtai/embeddings/index/autoid.py +92 -0
  114. txtai/embeddings/index/configuration.py +71 -0
  115. txtai/embeddings/index/documents.py +86 -0
  116. txtai/embeddings/index/functions.py +155 -0
  117. txtai/embeddings/index/indexes.py +199 -0
  118. txtai/embeddings/index/indexids.py +60 -0
  119. txtai/embeddings/index/reducer.py +104 -0
  120. txtai/embeddings/index/stream.py +67 -0
  121. txtai/embeddings/index/transform.py +205 -0
  122. txtai/embeddings/search/__init__.py +11 -0
  123. txtai/embeddings/search/base.py +344 -0
  124. txtai/embeddings/search/errors.py +9 -0
  125. txtai/embeddings/search/explain.py +120 -0
  126. txtai/embeddings/search/ids.py +61 -0
  127. txtai/embeddings/search/query.py +69 -0
  128. txtai/embeddings/search/scan.py +196 -0
  129. txtai/embeddings/search/terms.py +46 -0
  130. txtai/graph/__init__.py +10 -0
  131. txtai/graph/base.py +769 -0
  132. txtai/graph/factory.py +61 -0
  133. txtai/graph/networkx.py +275 -0
  134. txtai/graph/query.py +181 -0
  135. txtai/graph/rdbms.py +113 -0
  136. txtai/graph/topics.py +166 -0
  137. txtai/models/__init__.py +9 -0
  138. txtai/models/models.py +268 -0
  139. txtai/models/onnx.py +133 -0
  140. txtai/models/pooling/__init__.py +9 -0
  141. txtai/models/pooling/base.py +141 -0
  142. txtai/models/pooling/cls.py +28 -0
  143. txtai/models/pooling/factory.py +144 -0
  144. txtai/models/pooling/late.py +173 -0
  145. txtai/models/pooling/mean.py +33 -0
  146. txtai/models/pooling/muvera.py +164 -0
  147. txtai/models/registry.py +37 -0
  148. txtai/models/tokendetection.py +122 -0
  149. txtai/pipeline/__init__.py +17 -0
  150. txtai/pipeline/audio/__init__.py +11 -0
  151. txtai/pipeline/audio/audiomixer.py +58 -0
  152. txtai/pipeline/audio/audiostream.py +94 -0
  153. txtai/pipeline/audio/microphone.py +244 -0
  154. txtai/pipeline/audio/signal.py +186 -0
  155. txtai/pipeline/audio/texttoaudio.py +60 -0
  156. txtai/pipeline/audio/texttospeech.py +553 -0
  157. txtai/pipeline/audio/transcription.py +212 -0
  158. txtai/pipeline/base.py +23 -0
  159. txtai/pipeline/data/__init__.py +10 -0
  160. txtai/pipeline/data/filetohtml.py +206 -0
  161. txtai/pipeline/data/htmltomd.py +414 -0
  162. txtai/pipeline/data/segmentation.py +178 -0
  163. txtai/pipeline/data/tabular.py +155 -0
  164. txtai/pipeline/data/textractor.py +139 -0
  165. txtai/pipeline/data/tokenizer.py +112 -0
  166. txtai/pipeline/factory.py +77 -0
  167. txtai/pipeline/hfmodel.py +111 -0
  168. txtai/pipeline/hfpipeline.py +96 -0
  169. txtai/pipeline/image/__init__.py +7 -0
  170. txtai/pipeline/image/caption.py +55 -0
  171. txtai/pipeline/image/imagehash.py +90 -0
  172. txtai/pipeline/image/objects.py +80 -0
  173. txtai/pipeline/llm/__init__.py +11 -0
  174. txtai/pipeline/llm/factory.py +86 -0
  175. txtai/pipeline/llm/generation.py +173 -0
  176. txtai/pipeline/llm/huggingface.py +218 -0
  177. txtai/pipeline/llm/litellm.py +90 -0
  178. txtai/pipeline/llm/llama.py +152 -0
  179. txtai/pipeline/llm/llm.py +75 -0
  180. txtai/pipeline/llm/rag.py +477 -0
  181. txtai/pipeline/nop.py +14 -0
  182. txtai/pipeline/tensors.py +52 -0
  183. txtai/pipeline/text/__init__.py +13 -0
  184. txtai/pipeline/text/crossencoder.py +70 -0
  185. txtai/pipeline/text/entity.py +140 -0
  186. txtai/pipeline/text/labels.py +137 -0
  187. txtai/pipeline/text/lateencoder.py +103 -0
  188. txtai/pipeline/text/questions.py +48 -0
  189. txtai/pipeline/text/reranker.py +57 -0
  190. txtai/pipeline/text/similarity.py +83 -0
  191. txtai/pipeline/text/summary.py +98 -0
  192. txtai/pipeline/text/translation.py +298 -0
  193. txtai/pipeline/train/__init__.py +7 -0
  194. txtai/pipeline/train/hfonnx.py +196 -0
  195. txtai/pipeline/train/hftrainer.py +398 -0
  196. txtai/pipeline/train/mlonnx.py +63 -0
  197. txtai/scoring/__init__.py +12 -0
  198. txtai/scoring/base.py +188 -0
  199. txtai/scoring/bm25.py +29 -0
  200. txtai/scoring/factory.py +95 -0
  201. txtai/scoring/pgtext.py +181 -0
  202. txtai/scoring/sif.py +32 -0
  203. txtai/scoring/sparse.py +218 -0
  204. txtai/scoring/terms.py +499 -0
  205. txtai/scoring/tfidf.py +358 -0
  206. txtai/serialize/__init__.py +10 -0
  207. txtai/serialize/base.py +85 -0
  208. txtai/serialize/errors.py +9 -0
  209. txtai/serialize/factory.py +29 -0
  210. txtai/serialize/messagepack.py +42 -0
  211. txtai/serialize/pickle.py +98 -0
  212. txtai/serialize/serializer.py +46 -0
  213. txtai/util/__init__.py +7 -0
  214. txtai/util/resolver.py +32 -0
  215. txtai/util/sparsearray.py +62 -0
  216. txtai/util/template.py +16 -0
  217. txtai/vectors/__init__.py +8 -0
  218. txtai/vectors/base.py +476 -0
  219. txtai/vectors/dense/__init__.py +12 -0
  220. txtai/vectors/dense/external.py +55 -0
  221. txtai/vectors/dense/factory.py +121 -0
  222. txtai/vectors/dense/huggingface.py +44 -0
  223. txtai/vectors/dense/litellm.py +86 -0
  224. txtai/vectors/dense/llama.py +84 -0
  225. txtai/vectors/dense/m2v.py +67 -0
  226. txtai/vectors/dense/sbert.py +92 -0
  227. txtai/vectors/dense/words.py +211 -0
  228. txtai/vectors/recovery.py +57 -0
  229. txtai/vectors/sparse/__init__.py +7 -0
  230. txtai/vectors/sparse/base.py +90 -0
  231. txtai/vectors/sparse/factory.py +55 -0
  232. txtai/vectors/sparse/sbert.py +34 -0
  233. txtai/version.py +6 -0
  234. txtai/workflow/__init__.py +8 -0
  235. txtai/workflow/base.py +184 -0
  236. txtai/workflow/execute.py +99 -0
  237. txtai/workflow/factory.py +42 -0
  238. txtai/workflow/task/__init__.py +18 -0
  239. txtai/workflow/task/base.py +490 -0
  240. txtai/workflow/task/console.py +24 -0
  241. txtai/workflow/task/export.py +64 -0
  242. txtai/workflow/task/factory.py +89 -0
  243. txtai/workflow/task/file.py +28 -0
  244. txtai/workflow/task/image.py +36 -0
  245. txtai/workflow/task/retrieve.py +61 -0
  246. txtai/workflow/task/service.py +102 -0
  247. txtai/workflow/task/storage.py +110 -0
  248. txtai/workflow/task/stream.py +33 -0
  249. txtai/workflow/task/template.py +116 -0
  250. txtai/workflow/task/url.py +20 -0
  251. txtai/workflow/task/workflow.py +14 -0
txtai/util/resolver.py ADDED
@@ -0,0 +1,32 @@
+"""
+Resolver module
+"""
+
+
+class Resolver:
+    """
+    Resolves a Python class path
+    """
+
+    def __call__(self, path):
+        """
+        Resolves a class instance for the input path.
+
+        Args:
+            path: path to class
+
+        Returns:
+            class instance
+        """
+
+        # Split into path components
+        parts = path.split(".")
+
+        # Resolve each path component
+        module = ".".join(parts[:-1])
+        m = __import__(module)
+        for comp in parts[1:]:
+            m = getattr(m, comp)
+
+        # Return class instance
+        return m
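
For context, a quick usage sketch of Resolver (illustrative, not part of the published diff): it imports the top-level module, then walks attributes down to the target class.

    from txtai.util import Resolver

    # Resolve a dotted path to the class object itself
    cls = Resolver()("collections.OrderedDict")

    # The result is the class, ready to instantiate
    instance = cls()
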
txtai/util/sparsearray.py ADDED
@@ -0,0 +1,62 @@
+"""
+SparseArray module
+"""
+
+import numpy as np
+
+# Conditional import
+try:
+    from scipy.sparse import csr_matrix
+
+    SCIPY = True
+except ImportError:
+    SCIPY = False
+
+
+class SparseArray:
+    """
+    Methods to load and save sparse arrays to file.
+    """
+
+    def __init__(self):
+        """
+        Creates a SparseArray instance.
+        """
+
+        if not SCIPY:
+            raise ImportError("SciPy is not available - install scipy to enable")
+
+    def load(self, f):
+        """
+        Loads a sparse array from file.
+
+        Args:
+            f: input file handle
+
+        Returns:
+            sparse array
+        """
+
+        # Load raw data
+        data, indices, indptr, shape = (
+            np.load(f, allow_pickle=False),
+            np.load(f, allow_pickle=False),
+            np.load(f, allow_pickle=False),
+            np.load(f, allow_pickle=False),
+        )
+
+        # Load data into sparse array
+        return csr_matrix((data, indices, indptr), shape=shape)
+
+    def save(self, f, array):
+        """
+        Saves a sparse array to file.
+
+        Args:
+            f: output file handle
+            array: sparse array
+        """
+
+        # Save sparse array to file
+        for x in [array.data, array.indices, array.indptr, array.shape]:
+            np.save(f, x, allow_pickle=False)
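
A round-trip sketch for SparseArray (illustrative; assumes scipy is installed and that txtai.util re-exports SparseArray, consistent with the imports seen elsewhere in this diff). The four CSR components are written and read back in the same fixed order.

    import numpy as np
    from scipy.sparse import csr_matrix
    from txtai.util import SparseArray

    # Build a small CSR matrix and save it to a binary file
    matrix = csr_matrix(np.array([[0, 1.0, 0], [2.0, 0, 0]]))
    with open("sparse.npy", "wb") as f:
        SparseArray().save(f, matrix)

    # Load it back: data, indices, indptr, shape are read sequentially
    with open("sparse.npy", "rb") as f:
        loaded = SparseArray().load(f)
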
txtai/util/template.py ADDED
@@ -0,0 +1,16 @@
+"""
+Template module
+"""
+
+from string import Formatter
+
+
+class TemplateFormatter(Formatter):
+    """
+    Custom Formatter that requires each argument to be consumed.
+    """
+
+    def check_unused_args(self, used_args, args, kwargs):
+        difference = set(kwargs).difference(used_args)
+        if difference:
+            raise KeyError(difference)
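
A short sketch of the behavior this subclass adds (illustrative, not part of the diff): formatting fails unless every keyword argument is consumed by the template.

    from txtai.util import TemplateFormatter

    formatter = TemplateFormatter()

    # All arguments consumed - works
    print(formatter.format("{text} in {language}", text="Hello", language="French"))

    # "extra" is never referenced - raises KeyError({"extra"})
    formatter.format("{text}", text="Hello", extra=1)
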
txtai/vectors/__init__.py ADDED
@@ -0,0 +1,8 @@
+"""
+Vectors imports
+"""
+
+from .base import Vectors
+from .dense import *
+from .recovery import Recovery
+from .sparse import *
txtai/vectors/base.py ADDED
@@ -0,0 +1,476 @@
+"""
+Vectors module
+"""
+
+import json
+import os
+import tempfile
+import uuid
+
+import numpy as np
+
+from ..pipeline import Tokenizer
+
+from .recovery import Recovery
+
+
+class Vectors:
+    """
+    Base class for vector models. Vector models transform input content into numeric vectors.
+    """
+
+    def __init__(self, config, scoring, models):
+        """
+        Creates a new vectors instance.
+
+        Args:
+            config: vector configuration
+            scoring: optional scoring instance for term weighting
+            models: models cache
+        """
+
+        # Store parameters
+        self.config = config
+        self.scoring = scoring
+        self.models = models
+
+        if config:
+            # Detect if this is an initialized configuration
+            self.initialized = "dimensions" in config
+
+            # Enables optional string tokenization
+            self.tokenize = config.get("tokenize")
+
+            # Load model
+            self.model = self.load(config.get("path"))
+
+            # Encode batch size - controls underlying model batch size when encoding vectors
+            self.encodebatch = config.get("encodebatch", 32)
+
+            # Embeddings instructions
+            self.instructions = config.get("instructions")
+
+            # Truncate embeddings to this dimensionality
+            self.dimensionality = config.get("dimensionality")
+
+            # Scalar quantization - supports 1-bit through 8-bit quantization
+            quantize = config.get("quantize")
+            self.qbits = max(min(quantize, 8), 1) if isinstance(quantize, int) and not isinstance(quantize, bool) else None
+
+    def loadmodel(self, path):
+        """
+        Loads vector model at path.
+
+        Args:
+            path: path to vector model
+
+        Returns:
+            vector model
+        """
+
+        raise NotImplementedError
+
+    def encode(self, data, category=None):
+        """
+        Encodes a batch of data using vector model.
+
+        Args:
+            data: batch of data
+            category: optional category for instruction-based embeddings
+
+        Returns:
+            transformed data
+        """
+
+        raise NotImplementedError
+
+    def load(self, path):
+        """
+        Loads a model using the current configuration. This method will return previously cached models
+        if available.
+
+        Returns:
+            model
+        """
+
+        # Check if model is cached
+        if self.models and path in self.models:
+            return self.models[path]
+
+        # Create new model
+        model = self.loadmodel(path)
+
+        # Store model in cache
+        if self.models is not None and path:
+            self.models[path] = model
+
+        return model
+
+    def index(self, documents, batchsize=500, checkpoint=None):
+        """
+        Converts a list of documents to a temporary file with embeddings arrays. Returns a tuple of document ids,
+        number of dimensions and temporary file with embeddings.
+
+        Args:
+            documents: list of (id, data, tags)
+            batchsize: index batch size
+            checkpoint: optional checkpoint directory, enables indexing restart
+
+        Returns:
+            (ids, dimensions, batches, stream)
+        """
+
+        ids, dimensions, batches, stream = [], None, 0, None
+
+        # Generate recovery config if checkpoint is set
+        vectorsid = self.vectorsid() if checkpoint else None
+        recovery = Recovery(checkpoint, vectorsid, self.loadembeddings) if checkpoint else None
+
+        # Convert all documents to embedding arrays, stream embeddings to disk to control memory usage
+        with self.spool(checkpoint, vectorsid) as output:
+            stream = output.name
+            batch = []
+            for document in documents:
+                batch.append(document)
+
+                if len(batch) == batchsize:
+                    # Convert batch to embeddings
+                    uids, dimensions = self.batch(batch, output, recovery)
+                    ids.extend(uids)
+                    batches += 1
+
+                    batch = []
+
+            # Final batch
+            if batch:
+                uids, dimensions = self.batch(batch, output, recovery)
+                ids.extend(uids)
+                batches += 1
+
+        return (ids, dimensions, batches, stream)
+
+    def vectors(self, documents, batchsize=500, checkpoint=None, buffer=None, dtype=None):
+        """
+        Bulk encodes documents into vectors using index(). Returns the data as a mmap-ed array.
+
+        Args:
+            documents: list of (id, data, tags)
+            batchsize: index batch size
+            checkpoint: optional checkpoint directory, enables indexing restart
+            buffer: file path used for memmap buffer
+            dtype: dtype for buffer
+
+        Returns:
+            (ids, dimensions, embeddings)
+        """
+
+        # Consume stream and transform documents to vectors
+        ids, dimensions, batches, stream = self.index(documents, batchsize, checkpoint)
+
+        # Check that embeddings are available and load as a memmap
+        embeddings = None
+        if ids:
+            # Write batches
+            embeddings = np.memmap(buffer, dtype=dtype, shape=(len(ids), dimensions), mode="w+")
+            with open(stream, "rb") as queue:
+                x = 0
+                for _ in range(batches):
+                    batch = self.loadembeddings(queue)
+                    embeddings[x : x + batch.shape[0]] = batch
+                    x += batch.shape[0]
+
+        # Remove temporary file (if checkpointing is disabled)
+        if not checkpoint:
+            os.remove(stream)
+
+        return (ids, dimensions, embeddings)
+
+    def close(self):
+        """
+        Closes this vectors instance.
+        """
+
+        self.model = None
+
+    def transform(self, document):
+        """
+        Transforms document into an embeddings vector.
+
+        Args:
+            document: (id, data, tags)
+
+        Returns:
+            embeddings vector
+        """
+
+        # Prepare input document for vectors model and build embeddings
+        return self.batchtransform([document])[0]
+
+    def batchtransform(self, documents, category=None):
+        """
+        Transforms batch of documents into embeddings vectors.
+
+        Args:
+            documents: list of documents used to build embeddings
+            category: category for instruction-based embeddings
+
+        Returns:
+            embeddings vectors
+        """
+
+        # Prepare input documents for vectors model
+        documents = [self.prepare(data, category) for _, data, _ in documents]
+
+        # Skip encoding data if it's already an array
+        if documents and isinstance(documents[0], np.ndarray):
+            return np.array(documents, dtype=np.float32)
+
+        return self.vectorize(documents, category)
+
+    def dot(self, queries, data):
+        """
+        Calculates the dot product similarity between queries and documents. This method
+        assumes each of the inputs are normalized.
+
+        Args:
+            queries: queries
+            data: search data
+
+        Returns:
+            dot product scores
+        """
+
+        return np.dot(queries, data.T).tolist()
+
+    def vectorsid(self):
+        """
+        Generates vectors uid for this vectors instance.
+
+        Returns:
+            vectors uid
+        """
+
+        # Select config options that determine uniqueness
+        select = ["path", "method", "tokenizer", "maxlength", "tokenize", "instructions", "dimensionality", "quantize"]
+        config = {k: v for k, v in self.config.items() if k in select}
+        config.update(self.config.get("vectors", {}))
+
+        # Generate a deterministic UUID
+        return str(uuid.uuid5(uuid.NAMESPACE_DNS, json.dumps(config, sort_keys=True)))
+
+    def spool(self, checkpoint, vectorsid):
+        """
+        Opens a spool file for queuing generated vectors.
+
+        Args:
+            checkpoint: optional checkpoint directory, enables indexing restart
+            vectorsid: vectors uid for current configuration
+
+        Returns:
+            vectors spool file
+        """
+
+        # Spool to vectors checkpoint file
+        if checkpoint:
+            os.makedirs(checkpoint, exist_ok=True)
+            return open(f"{checkpoint}/{vectorsid}", "wb")
+
+        # Spool to temporary file
+        return tempfile.NamedTemporaryFile(mode="wb", suffix=".npy", delete=False)
+
+    def batch(self, documents, output, recovery):
+        """
+        Builds a batch of embeddings.
+
+        Args:
+            documents: list of documents used to build embeddings
+            output: output temp file to store embeddings
+            recovery: optional recovery instance
+
+        Returns:
+            (ids, dimensions) list of ids and number of dimensions in embeddings
+        """
+
+        # Extract ids and prepare input documents for vectors model
+        ids = [uid for uid, _, _ in documents]
+        documents = [self.prepare(data, "data") for _, data, _ in documents]
+        dimensions = None
+
+        # Attempt to read embeddings from a recovery file
+        embeddings = recovery() if recovery else None
+        embeddings = self.vectorize(documents, "data") if embeddings is None else embeddings
+        if embeddings is not None:
+            dimensions = embeddings.shape[1]
+            self.saveembeddings(output, embeddings)
+
+        return (ids, dimensions)
+
+    def prepare(self, data, category=None):
+        """
+        Prepares input data for vector model.
+
+        Args:
+            data: input data
+            category: category for instruction-based embeddings
+
+        Returns:
+            data formatted for vector model
+        """
+
+        # Prepares tokens for the model
+        data = self.tokens(data)
+
+        # Default instruction category
+        category = category if category else "query"
+
+        # Prepend instructions, if applicable
+        if self.instructions and category in self.instructions and isinstance(data, str):
+            # Prepend category instruction
+            data = f"{self.instructions[category]}{data}"
+
+        return data
+
+    def tokens(self, data):
+        """
+        Prepares data as tokens the model can accept.
+
+        Args:
+            data: input data
+
+        Returns:
+            tokens formatted for model
+        """
+
+        # Optional string tokenization
+        if self.tokenize and isinstance(data, str):
+            data = Tokenizer.tokenize(data)
+
+        # Convert token list to string
+        if isinstance(data, list):
+            data = " ".join(data)
+
+        return data
+
+    def vectorize(self, data, category=None):
+        """
+        Runs data vectorization, which consists of the following steps.
+
+        1. Encode data into vectors using underlying model
+        2. Truncate vectors, if necessary
+        3. Normalize vectors
+        4. Quantize vectors, if necessary
+
+        Args:
+            data: input data
+            category: category for instruction-based embeddings
+
+        Returns:
+            embeddings vectors
+        """
+
+        # Default instruction category
+        category = category if category else "query"
+
+        # Transform data into vectors
+        embeddings = self.encode(data, category)
+
+        if embeddings is not None:
+            # Truncate embeddings, if necessary
+            if self.dimensionality and self.dimensionality < embeddings.shape[1]:
+                embeddings = self.truncate(embeddings)
+
+            # Normalize data
+            embeddings = self.normalize(embeddings)
+
+            # Apply quantization, if necessary
+            if self.qbits:
+                embeddings = self.quantize(embeddings)
+
+        return embeddings
+
+    def loadembeddings(self, f):
+        """
+        Loads embeddings from file.
+
+        Args:
+            f: file to load from
+
+        Returns:
+            embeddings
+        """
+
+        return np.load(f, allow_pickle=False)
+
+    def saveembeddings(self, f, embeddings):
+        """
+        Saves embeddings to output.
+
+        Args:
+            f: output file
+            embeddings: embeddings to save
+        """
+
+        np.save(f, embeddings, allow_pickle=False)
+
+    def truncate(self, embeddings):
+        """
+        Truncates embeddings to the configured dimensionality.
+
+        This is only useful for models trained to store more important information in
+        earlier dimensions such as Matryoshka Representation Learning (MRL).
+
+        Args:
+            embeddings: input embeddings
+
+        Returns:
+            truncated embeddings
+        """
+
+        return embeddings[:, : self.dimensionality]
+
+    def normalize(self, embeddings):
+        """
+        Normalizes embeddings using L2 normalization. Operation applied directly on array.
+
+        Args:
+            embeddings: input embeddings
+
+        Returns:
+            embeddings
+        """
+
+        # Calculation is different for matrices vs vectors
+        if len(embeddings.shape) > 1:
+            embeddings /= np.linalg.norm(embeddings, axis=1)[:, np.newaxis]
+        else:
+            embeddings /= np.linalg.norm(embeddings)
+
+        return embeddings
+
+    def quantize(self, embeddings):
+        """
+        Quantizes embeddings using scalar quantization.
+
+        Args:
+            embeddings: input embeddings
+
+        Returns:
+            quantized embeddings
+        """
+
+        # Scale factor is midpoint in range
+        factor = 2 ** (self.qbits - 1)
+
+        # Quantize to uint8
+        scalars = embeddings * factor
+        scalars = scalars.clip(-factor, factor - 1) + factor
+        scalars = scalars.astype(np.uint8)
+
+        # Transform uint8 to bits
+        bits = np.unpackbits(scalars.reshape(-1, 1), axis=1)
+
+        # Remove unused bits (i.e. for 3-bit quantization, the leading 5 bits are removed)
+        bits = bits[:, -self.qbits :]
+
+        # Reshape using original data dimensions and pack bits into uint8 array
+        return np.packbits(bits.reshape(embeddings.shape[0], embeddings.shape[1] * self.qbits), axis=1)
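
To make the bit-packing in quantize() concrete, here is a standalone trace (illustrative values, not from the diff) of 2-bit quantization on one 4-dimensional vector:

    import numpy as np

    qbits = 2                  # 2-bit quantization
    factor = 2 ** (qbits - 1)  # factor = 2

    # One normalized 4-dimensional vector
    embeddings = np.array([[0.5, -0.25, 0.9, -1.0]], dtype=np.float32)

    # Scale, clip to [-2, 1], shift to [0, 3] - each value now fits in 2 bits
    scalars = ((embeddings * factor).clip(-factor, factor - 1) + factor).astype(np.uint8)
    # scalars == [[3, 1, 3, 0]]

    # Expand each uint8 to 8 bits, keep only the low qbits bits
    bits = np.unpackbits(scalars.reshape(-1, 1), axis=1)[:, -qbits:]

    # 4 dims x 2 bits = 8 bits, packed into a single byte per vector
    packed = np.packbits(bits.reshape(1, 4 * qbits), axis=1)
    print(packed)  # [[220]] -> bit string 11 01 11 00
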
txtai/vectors/dense/__init__.py ADDED
@@ -0,0 +1,12 @@
+"""
+Dense vectors imports
+"""
+
+from .external import External
+from .factory import VectorsFactory
+from .huggingface import HFVectors
+from .litellm import LiteLLM
+from .llama import LlamaCpp
+from .m2v import Model2Vec
+from .sbert import STVectors
+from .words import WordVectors
txtai/vectors/dense/external.py ADDED
@@ -0,0 +1,55 @@
+"""
+External module
+"""
+
+import types
+
+import numpy as np
+
+from ...util import Resolver
+
+from ..base import Vectors
+
+
+class External(Vectors):
+    """
+    Builds vectors using an external method. This can be a local function or an external API call.
+    """
+
+    def __init__(self, config, scoring, models):
+        super().__init__(config, scoring, models)
+
+        # Lookup and resolve transform function
+        self.transform = self.resolve(config.get("transform"))
+
+    def loadmodel(self, path):
+        return None
+
+    def encode(self, data, category=None):
+        # Call external transform function, if available and data not already an array
+        # Batching is handled by the external transform function
+        if self.transform and data and not isinstance(data[0], np.ndarray):
+            data = self.transform(data)
+
+        # Cast to float32
+        return data.astype(np.float32) if isinstance(data, np.ndarray) else np.array(data, dtype=np.float32)
+
+    def resolve(self, transform):
+        """
+        Resolves a transform function.
+
+        Args:
+            transform: transform function
+
+        Returns:
+            resolved transform function
+        """
+
+        if transform:
+            # Resolve transform instance, if necessary
+            transform = Resolver()(transform) if transform and isinstance(transform, str) else transform
+
+            # Get function or callable instance
+            transform = transform if isinstance(transform, types.FunctionType) else transform()
+
+        return transform
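
A usage sketch for External (illustrative; instantiates the class directly rather than through VectorsFactory, with a hypothetical transform function standing in for a real embeddings API):

    import numpy as np

    from txtai.vectors.dense import External

    def transform(texts):
        # Hypothetical stand-in for a real embeddings API or local model
        return np.random.rand(len(texts), 4)

    # scoring and models are unused here, so None is passed for both
    vectors = External({"transform": transform}, None, None)

    # encode calls the external function, then casts the result to float32
    print(vectors.encode(["hello", "world"]).shape)  # (2, 4)
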