mseep_txtai-9.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mseep_txtai-9.1.1.dist-info/METADATA +262 -0
- mseep_txtai-9.1.1.dist-info/RECORD +251 -0
- mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
- mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
- mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
- txtai/__init__.py +16 -0
- txtai/agent/__init__.py +12 -0
- txtai/agent/base.py +54 -0
- txtai/agent/factory.py +39 -0
- txtai/agent/model.py +107 -0
- txtai/agent/placeholder.py +16 -0
- txtai/agent/tool/__init__.py +7 -0
- txtai/agent/tool/embeddings.py +69 -0
- txtai/agent/tool/factory.py +130 -0
- txtai/agent/tool/function.py +49 -0
- txtai/ann/__init__.py +7 -0
- txtai/ann/base.py +153 -0
- txtai/ann/dense/__init__.py +11 -0
- txtai/ann/dense/annoy.py +72 -0
- txtai/ann/dense/factory.py +76 -0
- txtai/ann/dense/faiss.py +233 -0
- txtai/ann/dense/hnsw.py +104 -0
- txtai/ann/dense/numpy.py +164 -0
- txtai/ann/dense/pgvector.py +323 -0
- txtai/ann/dense/sqlite.py +303 -0
- txtai/ann/dense/torch.py +38 -0
- txtai/ann/sparse/__init__.py +7 -0
- txtai/ann/sparse/factory.py +61 -0
- txtai/ann/sparse/ivfsparse.py +377 -0
- txtai/ann/sparse/pgsparse.py +56 -0
- txtai/api/__init__.py +18 -0
- txtai/api/application.py +134 -0
- txtai/api/authorization.py +53 -0
- txtai/api/base.py +159 -0
- txtai/api/cluster.py +295 -0
- txtai/api/extension.py +19 -0
- txtai/api/factory.py +40 -0
- txtai/api/responses/__init__.py +7 -0
- txtai/api/responses/factory.py +30 -0
- txtai/api/responses/json.py +56 -0
- txtai/api/responses/messagepack.py +51 -0
- txtai/api/route.py +41 -0
- txtai/api/routers/__init__.py +25 -0
- txtai/api/routers/agent.py +38 -0
- txtai/api/routers/caption.py +42 -0
- txtai/api/routers/embeddings.py +280 -0
- txtai/api/routers/entity.py +42 -0
- txtai/api/routers/extractor.py +28 -0
- txtai/api/routers/labels.py +47 -0
- txtai/api/routers/llm.py +61 -0
- txtai/api/routers/objects.py +42 -0
- txtai/api/routers/openai.py +191 -0
- txtai/api/routers/rag.py +61 -0
- txtai/api/routers/reranker.py +46 -0
- txtai/api/routers/segmentation.py +42 -0
- txtai/api/routers/similarity.py +48 -0
- txtai/api/routers/summary.py +46 -0
- txtai/api/routers/tabular.py +42 -0
- txtai/api/routers/textractor.py +42 -0
- txtai/api/routers/texttospeech.py +33 -0
- txtai/api/routers/transcription.py +42 -0
- txtai/api/routers/translation.py +46 -0
- txtai/api/routers/upload.py +36 -0
- txtai/api/routers/workflow.py +28 -0
- txtai/app/__init__.py +5 -0
- txtai/app/base.py +821 -0
- txtai/archive/__init__.py +9 -0
- txtai/archive/base.py +104 -0
- txtai/archive/compress.py +51 -0
- txtai/archive/factory.py +25 -0
- txtai/archive/tar.py +49 -0
- txtai/archive/zip.py +35 -0
- txtai/cloud/__init__.py +8 -0
- txtai/cloud/base.py +106 -0
- txtai/cloud/factory.py +70 -0
- txtai/cloud/hub.py +101 -0
- txtai/cloud/storage.py +125 -0
- txtai/console/__init__.py +5 -0
- txtai/console/__main__.py +22 -0
- txtai/console/base.py +264 -0
- txtai/data/__init__.py +10 -0
- txtai/data/base.py +138 -0
- txtai/data/labels.py +42 -0
- txtai/data/questions.py +135 -0
- txtai/data/sequences.py +48 -0
- txtai/data/texts.py +68 -0
- txtai/data/tokens.py +28 -0
- txtai/database/__init__.py +14 -0
- txtai/database/base.py +342 -0
- txtai/database/client.py +227 -0
- txtai/database/duckdb.py +150 -0
- txtai/database/embedded.py +76 -0
- txtai/database/encoder/__init__.py +8 -0
- txtai/database/encoder/base.py +37 -0
- txtai/database/encoder/factory.py +56 -0
- txtai/database/encoder/image.py +43 -0
- txtai/database/encoder/serialize.py +28 -0
- txtai/database/factory.py +77 -0
- txtai/database/rdbms.py +569 -0
- txtai/database/schema/__init__.py +6 -0
- txtai/database/schema/orm.py +99 -0
- txtai/database/schema/statement.py +98 -0
- txtai/database/sql/__init__.py +8 -0
- txtai/database/sql/aggregate.py +178 -0
- txtai/database/sql/base.py +189 -0
- txtai/database/sql/expression.py +404 -0
- txtai/database/sql/token.py +342 -0
- txtai/database/sqlite.py +57 -0
- txtai/embeddings/__init__.py +7 -0
- txtai/embeddings/base.py +1107 -0
- txtai/embeddings/index/__init__.py +14 -0
- txtai/embeddings/index/action.py +15 -0
- txtai/embeddings/index/autoid.py +92 -0
- txtai/embeddings/index/configuration.py +71 -0
- txtai/embeddings/index/documents.py +86 -0
- txtai/embeddings/index/functions.py +155 -0
- txtai/embeddings/index/indexes.py +199 -0
- txtai/embeddings/index/indexids.py +60 -0
- txtai/embeddings/index/reducer.py +104 -0
- txtai/embeddings/index/stream.py +67 -0
- txtai/embeddings/index/transform.py +205 -0
- txtai/embeddings/search/__init__.py +11 -0
- txtai/embeddings/search/base.py +344 -0
- txtai/embeddings/search/errors.py +9 -0
- txtai/embeddings/search/explain.py +120 -0
- txtai/embeddings/search/ids.py +61 -0
- txtai/embeddings/search/query.py +69 -0
- txtai/embeddings/search/scan.py +196 -0
- txtai/embeddings/search/terms.py +46 -0
- txtai/graph/__init__.py +10 -0
- txtai/graph/base.py +769 -0
- txtai/graph/factory.py +61 -0
- txtai/graph/networkx.py +275 -0
- txtai/graph/query.py +181 -0
- txtai/graph/rdbms.py +113 -0
- txtai/graph/topics.py +166 -0
- txtai/models/__init__.py +9 -0
- txtai/models/models.py +268 -0
- txtai/models/onnx.py +133 -0
- txtai/models/pooling/__init__.py +9 -0
- txtai/models/pooling/base.py +141 -0
- txtai/models/pooling/cls.py +28 -0
- txtai/models/pooling/factory.py +144 -0
- txtai/models/pooling/late.py +173 -0
- txtai/models/pooling/mean.py +33 -0
- txtai/models/pooling/muvera.py +164 -0
- txtai/models/registry.py +37 -0
- txtai/models/tokendetection.py +122 -0
- txtai/pipeline/__init__.py +17 -0
- txtai/pipeline/audio/__init__.py +11 -0
- txtai/pipeline/audio/audiomixer.py +58 -0
- txtai/pipeline/audio/audiostream.py +94 -0
- txtai/pipeline/audio/microphone.py +244 -0
- txtai/pipeline/audio/signal.py +186 -0
- txtai/pipeline/audio/texttoaudio.py +60 -0
- txtai/pipeline/audio/texttospeech.py +553 -0
- txtai/pipeline/audio/transcription.py +212 -0
- txtai/pipeline/base.py +23 -0
- txtai/pipeline/data/__init__.py +10 -0
- txtai/pipeline/data/filetohtml.py +206 -0
- txtai/pipeline/data/htmltomd.py +414 -0
- txtai/pipeline/data/segmentation.py +178 -0
- txtai/pipeline/data/tabular.py +155 -0
- txtai/pipeline/data/textractor.py +139 -0
- txtai/pipeline/data/tokenizer.py +112 -0
- txtai/pipeline/factory.py +77 -0
- txtai/pipeline/hfmodel.py +111 -0
- txtai/pipeline/hfpipeline.py +96 -0
- txtai/pipeline/image/__init__.py +7 -0
- txtai/pipeline/image/caption.py +55 -0
- txtai/pipeline/image/imagehash.py +90 -0
- txtai/pipeline/image/objects.py +80 -0
- txtai/pipeline/llm/__init__.py +11 -0
- txtai/pipeline/llm/factory.py +86 -0
- txtai/pipeline/llm/generation.py +173 -0
- txtai/pipeline/llm/huggingface.py +218 -0
- txtai/pipeline/llm/litellm.py +90 -0
- txtai/pipeline/llm/llama.py +152 -0
- txtai/pipeline/llm/llm.py +75 -0
- txtai/pipeline/llm/rag.py +477 -0
- txtai/pipeline/nop.py +14 -0
- txtai/pipeline/tensors.py +52 -0
- txtai/pipeline/text/__init__.py +13 -0
- txtai/pipeline/text/crossencoder.py +70 -0
- txtai/pipeline/text/entity.py +140 -0
- txtai/pipeline/text/labels.py +137 -0
- txtai/pipeline/text/lateencoder.py +103 -0
- txtai/pipeline/text/questions.py +48 -0
- txtai/pipeline/text/reranker.py +57 -0
- txtai/pipeline/text/similarity.py +83 -0
- txtai/pipeline/text/summary.py +98 -0
- txtai/pipeline/text/translation.py +298 -0
- txtai/pipeline/train/__init__.py +7 -0
- txtai/pipeline/train/hfonnx.py +196 -0
- txtai/pipeline/train/hftrainer.py +398 -0
- txtai/pipeline/train/mlonnx.py +63 -0
- txtai/scoring/__init__.py +12 -0
- txtai/scoring/base.py +188 -0
- txtai/scoring/bm25.py +29 -0
- txtai/scoring/factory.py +95 -0
- txtai/scoring/pgtext.py +181 -0
- txtai/scoring/sif.py +32 -0
- txtai/scoring/sparse.py +218 -0
- txtai/scoring/terms.py +499 -0
- txtai/scoring/tfidf.py +358 -0
- txtai/serialize/__init__.py +10 -0
- txtai/serialize/base.py +85 -0
- txtai/serialize/errors.py +9 -0
- txtai/serialize/factory.py +29 -0
- txtai/serialize/messagepack.py +42 -0
- txtai/serialize/pickle.py +98 -0
- txtai/serialize/serializer.py +46 -0
- txtai/util/__init__.py +7 -0
- txtai/util/resolver.py +32 -0
- txtai/util/sparsearray.py +62 -0
- txtai/util/template.py +16 -0
- txtai/vectors/__init__.py +8 -0
- txtai/vectors/base.py +476 -0
- txtai/vectors/dense/__init__.py +12 -0
- txtai/vectors/dense/external.py +55 -0
- txtai/vectors/dense/factory.py +121 -0
- txtai/vectors/dense/huggingface.py +44 -0
- txtai/vectors/dense/litellm.py +86 -0
- txtai/vectors/dense/llama.py +84 -0
- txtai/vectors/dense/m2v.py +67 -0
- txtai/vectors/dense/sbert.py +92 -0
- txtai/vectors/dense/words.py +211 -0
- txtai/vectors/recovery.py +57 -0
- txtai/vectors/sparse/__init__.py +7 -0
- txtai/vectors/sparse/base.py +90 -0
- txtai/vectors/sparse/factory.py +55 -0
- txtai/vectors/sparse/sbert.py +34 -0
- txtai/version.py +6 -0
- txtai/workflow/__init__.py +8 -0
- txtai/workflow/base.py +184 -0
- txtai/workflow/execute.py +99 -0
- txtai/workflow/factory.py +42 -0
- txtai/workflow/task/__init__.py +18 -0
- txtai/workflow/task/base.py +490 -0
- txtai/workflow/task/console.py +24 -0
- txtai/workflow/task/export.py +64 -0
- txtai/workflow/task/factory.py +89 -0
- txtai/workflow/task/file.py +28 -0
- txtai/workflow/task/image.py +36 -0
- txtai/workflow/task/retrieve.py +61 -0
- txtai/workflow/task/service.py +102 -0
- txtai/workflow/task/storage.py +110 -0
- txtai/workflow/task/stream.py +33 -0
- txtai/workflow/task/template.py +116 -0
- txtai/workflow/task/url.py +20 -0
- txtai/workflow/task/workflow.py +14 -0
txtai/util/resolver.py
ADDED
@@ -0,0 +1,32 @@
"""
Resolver module
"""


class Resolver:
    """
    Resolves a Python class path
    """

    def __call__(self, path):
        """
        Resolves a class instance at path.

        Args:
            path: path to class

        Returns:
            class instance
        """

        # Split into path components
        parts = path.split(".")

        # Resolve each path component
        module = ".".join(parts[:-1])
        m = __import__(module)
        for comp in parts[1:]:
            m = getattr(m, comp)

        # Return class instance
        return m
txtai/util/sparsearray.py
ADDED
@@ -0,0 +1,62 @@
"""
SparseArray module
"""

import numpy as np

# Conditional import
try:
    from scipy.sparse import csr_matrix

    SCIPY = True
except ImportError:
    SCIPY = False


class SparseArray:
    """
    Methods to load and save sparse arrays to file.
    """

    def __init__(self):
        """
        Creates a SparseArray instance.
        """

        if not SCIPY:
            raise ImportError("SciPy is not available - install scipy to enable")

    def load(self, f):
        """
        Loads a sparse array from file.

        Args:
            f: input file handle

        Returns:
            sparse array
        """

        # Load raw data
        data, indices, indptr, shape = (
            np.load(f, allow_pickle=False),
            np.load(f, allow_pickle=False),
            np.load(f, allow_pickle=False),
            np.load(f, allow_pickle=False),
        )

        # Load data into sparse array
        return csr_matrix((data, indices, indptr), shape=shape)

    def save(self, f, array):
        """
        Saves a sparse array to file.

        Args:
            f: output file handle
            array: sparse array
        """

        # Save sparse array to file
        for x in [array.data, array.indices, array.indptr, array.shape]:
            np.save(f, x, allow_pickle=False)
txtai/util/template.py
ADDED
@@ -0,0 +1,16 @@
"""
Template module
"""

from string import Formatter


class TemplateFormatter(Formatter):
    """
    Custom Formatter that requires each argument to be consumed.
    """

    def check_unused_args(self, used_args, args, kwargs):
        difference = set(kwargs).difference(used_args)
        if difference:
            raise KeyError(difference)
txtai/vectors/base.py
ADDED
@@ -0,0 +1,476 @@
"""
Vectors module
"""

import json
import os
import tempfile
import uuid

import numpy as np

from ..pipeline import Tokenizer

from .recovery import Recovery


class Vectors:
    """
    Base class for vector models. Vector models transform input content into numeric vectors.
    """

    def __init__(self, config, scoring, models):
        """
        Creates a new vectors instance.

        Args:
            config: vector configuration
            scoring: optional scoring instance for term weighting
            models: models cache
        """

        # Store parameters
        self.config = config
        self.scoring = scoring
        self.models = models

        if config:
            # Detect if this is an initialized configuration
            self.initialized = "dimensions" in config

            # Enables optional string tokenization
            self.tokenize = config.get("tokenize")

            # Load model
            self.model = self.load(config.get("path"))

            # Encode batch size - controls underlying model batch size when encoding vectors
            self.encodebatch = config.get("encodebatch", 32)

            # Embeddings instructions
            self.instructions = config.get("instructions")

            # Truncate embeddings to this dimensionality
            self.dimensionality = config.get("dimensionality")

            # Scalar quantization - supports 1-bit through 8-bit quantization
            quantize = config.get("quantize")
            self.qbits = max(min(quantize, 8), 1) if isinstance(quantize, int) and not isinstance(quantize, bool) else None

    def loadmodel(self, path):
        """
        Loads vector model at path.

        Args:
            path: path to vector model

        Returns:
            vector model
        """

        raise NotImplementedError

    def encode(self, data, category=None):
        """
        Encodes a batch of data using vector model.

        Args:
            data: batch of data
            category: optional category for instruction-based embeddings

        Returns:
            transformed data
        """

        raise NotImplementedError

    def load(self, path):
        """
        Loads a model using the current configuration. This method will return previously cached models
        if available.

        Returns:
            model
        """

        # Check if model is cached
        if self.models and path in self.models:
            return self.models[path]

        # Create new model
        model = self.loadmodel(path)

        # Store model in cache
        if self.models is not None and path:
            self.models[path] = model

        return model

    def index(self, documents, batchsize=500, checkpoint=None):
        """
        Converts a list of documents to a temporary file with embeddings arrays. Returns a tuple of document ids,
        number of dimensions and temporary file with embeddings.

        Args:
            documents: list of (id, data, tags)
            batchsize: index batch size
            checkpoint: optional checkpoint directory, enables indexing restart

        Returns:
            (ids, dimensions, batches, stream)
        """

        ids, dimensions, batches, stream = [], None, 0, None

        # Generate recovery config if checkpoint is set
        vectorsid = self.vectorsid() if checkpoint else None
        recovery = Recovery(checkpoint, vectorsid, self.loadembeddings) if checkpoint else None

        # Convert all documents to embedding arrays, stream embeddings to disk to control memory usage
        with self.spool(checkpoint, vectorsid) as output:
            stream = output.name
            batch = []
            for document in documents:
                batch.append(document)

                if len(batch) == batchsize:
                    # Convert batch to embeddings
                    uids, dimensions = self.batch(batch, output, recovery)
                    ids.extend(uids)
                    batches += 1

                    batch = []

            # Final batch
            if batch:
                uids, dimensions = self.batch(batch, output, recovery)
                ids.extend(uids)
                batches += 1

        return (ids, dimensions, batches, stream)

    def vectors(self, documents, batchsize=500, checkpoint=None, buffer=None, dtype=None):
        """
        Bulk encodes documents into vectors using index(). Returns the data as a mmap-ed array.

        Args:
            documents: list of (id, data, tags)
            batchsize: index batch size
            checkpoint: optional checkpoint directory, enables indexing restart
            buffer: file path used for memmap buffer
            dtype: dtype for buffer

        Returns:
            (ids, dimensions, embeddings)
        """

        # Consume stream and transform documents to vectors
        ids, dimensions, batches, stream = self.index(documents, batchsize, checkpoint)

        # Check that embeddings are available and load as a memmap
        embeddings = None
        if ids:
            # Write batches
            embeddings = np.memmap(buffer, dtype=dtype, shape=(len(ids), dimensions), mode="w+")
            with open(stream, "rb") as queue:
                x = 0
                for _ in range(batches):
                    batch = self.loadembeddings(queue)
                    embeddings[x : x + batch.shape[0]] = batch
                    x += batch.shape[0]

        # Remove temporary file (if checkpointing is disabled)
        if not checkpoint:
            os.remove(stream)

        return (ids, dimensions, embeddings)

    def close(self):
        """
        Closes this vectors instance.
        """

        self.model = None

    def transform(self, document):
        """
        Transforms document into an embeddings vector.

        Args:
            document: (id, data, tags)

        Returns:
            embeddings vector
        """

        # Prepare input document for vectors model and build embeddings
        return self.batchtransform([document])[0]

    def batchtransform(self, documents, category=None):
        """
        Transforms batch of documents into embeddings vectors.

        Args:
            documents: list of documents used to build embeddings
            category: category for instruction-based embeddings

        Returns:
            embeddings vectors
        """

        # Prepare input documents for vectors model
        documents = [self.prepare(data, category) for _, data, _ in documents]

        # Skip encoding data if it's already an array
        if documents and isinstance(documents[0], np.ndarray):
            return np.array(documents, dtype=np.float32)

        return self.vectorize(documents, category)

    def dot(self, queries, data):
        """
        Calculates the dot product similarity between queries and documents. This method
        assumes each of the inputs is normalized.

        Args:
            queries: queries
            data: search data

        Returns:
            dot product scores
        """

        return np.dot(queries, data.T).tolist()

    def vectorsid(self):
        """
        Generates vectors uid for this vectors instance.

        Returns:
            vectors uid
        """

        # Select config options that determine uniqueness
        select = ["path", "method", "tokenizer", "maxlength", "tokenize", "instructions", "dimensionality", "quantize"]
        config = {k: v for k, v in self.config.items() if k in select}
        config.update(self.config.get("vectors", {}))

        # Generate a deterministic UUID
        return str(uuid.uuid5(uuid.NAMESPACE_DNS, json.dumps(config, sort_keys=True)))

    def spool(self, checkpoint, vectorsid):
        """
        Opens a spool file for queuing generated vectors.

        Args:
            checkpoint: optional checkpoint directory, enables indexing restart
            vectorsid: vectors uid for current configuration

        Returns:
            vectors spool file
        """

        # Spool to vectors checkpoint file
        if checkpoint:
            os.makedirs(checkpoint, exist_ok=True)
            return open(f"{checkpoint}/{vectorsid}", "wb")

        # Spool to temporary file
        return tempfile.NamedTemporaryFile(mode="wb", suffix=".npy", delete=False)

    def batch(self, documents, output, recovery):
        """
        Builds a batch of embeddings.

        Args:
            documents: list of documents used to build embeddings
            output: output temp file to store embeddings
            recovery: optional recovery instance

        Returns:
            (ids, dimensions) list of ids and number of dimensions in embeddings
        """

        # Extract ids and prepare input documents for vectors model
        ids = [uid for uid, _, _ in documents]
        documents = [self.prepare(data, "data") for _, data, _ in documents]
        dimensions = None

        # Attempt to read embeddings from a recovery file
        embeddings = recovery() if recovery else None
        embeddings = self.vectorize(documents, "data") if embeddings is None else embeddings
        if embeddings is not None:
            dimensions = embeddings.shape[1]
            self.saveembeddings(output, embeddings)

        return (ids, dimensions)

    def prepare(self, data, category=None):
        """
        Prepares input data for vector model.

        Args:
            data: input data
            category: category for instruction-based embeddings

        Returns:
            data formatted for vector model
        """

        # Prepares tokens for the model
        data = self.tokens(data)

        # Default instruction category
        category = category if category else "query"

        # Prepend instructions, if applicable
        if self.instructions and category in self.instructions and isinstance(data, str):
            # Prepend category instruction
            data = f"{self.instructions[category]}{data}"

        return data

    def tokens(self, data):
        """
        Prepares data as tokens the model can accept.

        Args:
            data: input data

        Returns:
            tokens formatted for model
        """

        # Optional string tokenization
        if self.tokenize and isinstance(data, str):
            data = Tokenizer.tokenize(data)

        # Convert token list to string
        if isinstance(data, list):
            data = " ".join(data)

        return data

    def vectorize(self, data, category=None):
        """
        Runs data vectorization, which consists of the following steps.

        1. Encode data into vectors using underlying model
        2. Truncate vectors, if necessary
        3. Normalize vectors
        4. Quantize vectors, if necessary

        Args:
            data: input data
            category: category for instruction-based embeddings

        Returns:
            embeddings vectors
        """

        # Default instruction category
        category = category if category else "query"

        # Transform data into vectors
        embeddings = self.encode(data, category)

        if embeddings is not None:
            # Truncate embeddings, if necessary
            if self.dimensionality and self.dimensionality < embeddings.shape[1]:
                embeddings = self.truncate(embeddings)

            # Normalize data
            embeddings = self.normalize(embeddings)

            # Apply quantization, if necessary
            if self.qbits:
                embeddings = self.quantize(embeddings)

        return embeddings

    def loadembeddings(self, f):
        """
        Loads embeddings from file.

        Args:
            f: file to load from

        Returns:
            embeddings
        """

        return np.load(f, allow_pickle=False)

    def saveembeddings(self, f, embeddings):
        """
        Saves embeddings to output.

        Args:
            f: output file
            embeddings: embeddings to save
        """

        np.save(f, embeddings, allow_pickle=False)

    def truncate(self, embeddings):
        """
        Truncates embeddings to the configured dimensionality.

        This is only useful for models trained to store more important information in
        earlier dimensions such as Matryoshka Representation Learning (MRL).

        Args:
            embeddings: input embeddings

        Returns:
            truncated embeddings
        """

        return embeddings[:, : self.dimensionality]

    def normalize(self, embeddings):
        """
        Normalizes embeddings using L2 normalization. Operation applied directly on array.

        Args:
            embeddings: input embeddings

        Returns:
            embeddings
        """

        # Calculation is different for matrices vs vectors
        if len(embeddings.shape) > 1:
            embeddings /= np.linalg.norm(embeddings, axis=1)[:, np.newaxis]
        else:
            embeddings /= np.linalg.norm(embeddings)

        return embeddings

    def quantize(self, embeddings):
        """
        Quantizes embeddings using scalar quantization.

        Args:
            embeddings: input embeddings

        Returns:
            quantized embeddings
        """

        # Scale factor is midpoint in range
        factor = 2 ** (self.qbits - 1)

        # Quantize to uint8
        scalars = embeddings * factor
        scalars = scalars.clip(-factor, factor - 1) + factor
        scalars = scalars.astype(np.uint8)

        # Transform uint8 to bits
        bits = np.unpackbits(scalars.reshape(-1, 1), axis=1)

        # Remove unused bits (i.e. for 3-bit quantization, the leading 5 bits are removed)
        bits = bits[:, -self.qbits :]

        # Reshape using original data dimensions and pack bits into uint8 array
        return np.packbits(bits.reshape(embeddings.shape[0], embeddings.shape[1] * self.qbits), axis=1)
txtai/vectors/dense/__init__.py
ADDED
@@ -0,0 +1,12 @@
"""
Dense vectors imports
"""

from .external import External
from .factory import VectorsFactory
from .huggingface import HFVectors
from .litellm import LiteLLM
from .llama import LlamaCpp
from .m2v import Model2Vec
from .sbert import STVectors
from .words import WordVectors
txtai/vectors/dense/external.py
ADDED
@@ -0,0 +1,55 @@
"""
External module
"""

import types

import numpy as np

from ...util import Resolver

from ..base import Vectors


class External(Vectors):
    """
    Builds vectors using an external method. This can be a local function or an external API call.
    """

    def __init__(self, config, scoring, models):
        super().__init__(config, scoring, models)

        # Lookup and resolve transform function
        self.transform = self.resolve(config.get("transform"))

    def loadmodel(self, path):
        return None

    def encode(self, data, category=None):
        # Call external transform function, if available and data not already an array
        # Batching is handled by the external transform function
        if self.transform and data and not isinstance(data[0], np.ndarray):
            data = self.transform(data)

        # Cast to float32
        return data.astype(np.float32) if isinstance(data, np.ndarray) else np.array(data, dtype=np.float32)

    def resolve(self, transform):
        """
        Resolves a transform function.

        Args:
            transform: transform function

        Returns:
            resolved transform function
        """

        if transform:
            # Resolve transform instance, if necessary
            transform = Resolver()(transform) if transform and isinstance(transform, str) else transform

            # Get function or callable instance
            transform = transform if isinstance(transform, types.FunctionType) else transform()

        return transform