mseep-txtai 9.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mseep_txtai-9.1.1.dist-info/METADATA +262 -0
- mseep_txtai-9.1.1.dist-info/RECORD +251 -0
- mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
- mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
- mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
- txtai/__init__.py +16 -0
- txtai/agent/__init__.py +12 -0
- txtai/agent/base.py +54 -0
- txtai/agent/factory.py +39 -0
- txtai/agent/model.py +107 -0
- txtai/agent/placeholder.py +16 -0
- txtai/agent/tool/__init__.py +7 -0
- txtai/agent/tool/embeddings.py +69 -0
- txtai/agent/tool/factory.py +130 -0
- txtai/agent/tool/function.py +49 -0
- txtai/ann/__init__.py +7 -0
- txtai/ann/base.py +153 -0
- txtai/ann/dense/__init__.py +11 -0
- txtai/ann/dense/annoy.py +72 -0
- txtai/ann/dense/factory.py +76 -0
- txtai/ann/dense/faiss.py +233 -0
- txtai/ann/dense/hnsw.py +104 -0
- txtai/ann/dense/numpy.py +164 -0
- txtai/ann/dense/pgvector.py +323 -0
- txtai/ann/dense/sqlite.py +303 -0
- txtai/ann/dense/torch.py +38 -0
- txtai/ann/sparse/__init__.py +7 -0
- txtai/ann/sparse/factory.py +61 -0
- txtai/ann/sparse/ivfsparse.py +377 -0
- txtai/ann/sparse/pgsparse.py +56 -0
- txtai/api/__init__.py +18 -0
- txtai/api/application.py +134 -0
- txtai/api/authorization.py +53 -0
- txtai/api/base.py +159 -0
- txtai/api/cluster.py +295 -0
- txtai/api/extension.py +19 -0
- txtai/api/factory.py +40 -0
- txtai/api/responses/__init__.py +7 -0
- txtai/api/responses/factory.py +30 -0
- txtai/api/responses/json.py +56 -0
- txtai/api/responses/messagepack.py +51 -0
- txtai/api/route.py +41 -0
- txtai/api/routers/__init__.py +25 -0
- txtai/api/routers/agent.py +38 -0
- txtai/api/routers/caption.py +42 -0
- txtai/api/routers/embeddings.py +280 -0
- txtai/api/routers/entity.py +42 -0
- txtai/api/routers/extractor.py +28 -0
- txtai/api/routers/labels.py +47 -0
- txtai/api/routers/llm.py +61 -0
- txtai/api/routers/objects.py +42 -0
- txtai/api/routers/openai.py +191 -0
- txtai/api/routers/rag.py +61 -0
- txtai/api/routers/reranker.py +46 -0
- txtai/api/routers/segmentation.py +42 -0
- txtai/api/routers/similarity.py +48 -0
- txtai/api/routers/summary.py +46 -0
- txtai/api/routers/tabular.py +42 -0
- txtai/api/routers/textractor.py +42 -0
- txtai/api/routers/texttospeech.py +33 -0
- txtai/api/routers/transcription.py +42 -0
- txtai/api/routers/translation.py +46 -0
- txtai/api/routers/upload.py +36 -0
- txtai/api/routers/workflow.py +28 -0
- txtai/app/__init__.py +5 -0
- txtai/app/base.py +821 -0
- txtai/archive/__init__.py +9 -0
- txtai/archive/base.py +104 -0
- txtai/archive/compress.py +51 -0
- txtai/archive/factory.py +25 -0
- txtai/archive/tar.py +49 -0
- txtai/archive/zip.py +35 -0
- txtai/cloud/__init__.py +8 -0
- txtai/cloud/base.py +106 -0
- txtai/cloud/factory.py +70 -0
- txtai/cloud/hub.py +101 -0
- txtai/cloud/storage.py +125 -0
- txtai/console/__init__.py +5 -0
- txtai/console/__main__.py +22 -0
- txtai/console/base.py +264 -0
- txtai/data/__init__.py +10 -0
- txtai/data/base.py +138 -0
- txtai/data/labels.py +42 -0
- txtai/data/questions.py +135 -0
- txtai/data/sequences.py +48 -0
- txtai/data/texts.py +68 -0
- txtai/data/tokens.py +28 -0
- txtai/database/__init__.py +14 -0
- txtai/database/base.py +342 -0
- txtai/database/client.py +227 -0
- txtai/database/duckdb.py +150 -0
- txtai/database/embedded.py +76 -0
- txtai/database/encoder/__init__.py +8 -0
- txtai/database/encoder/base.py +37 -0
- txtai/database/encoder/factory.py +56 -0
- txtai/database/encoder/image.py +43 -0
- txtai/database/encoder/serialize.py +28 -0
- txtai/database/factory.py +77 -0
- txtai/database/rdbms.py +569 -0
- txtai/database/schema/__init__.py +6 -0
- txtai/database/schema/orm.py +99 -0
- txtai/database/schema/statement.py +98 -0
- txtai/database/sql/__init__.py +8 -0
- txtai/database/sql/aggregate.py +178 -0
- txtai/database/sql/base.py +189 -0
- txtai/database/sql/expression.py +404 -0
- txtai/database/sql/token.py +342 -0
- txtai/database/sqlite.py +57 -0
- txtai/embeddings/__init__.py +7 -0
- txtai/embeddings/base.py +1107 -0
- txtai/embeddings/index/__init__.py +14 -0
- txtai/embeddings/index/action.py +15 -0
- txtai/embeddings/index/autoid.py +92 -0
- txtai/embeddings/index/configuration.py +71 -0
- txtai/embeddings/index/documents.py +86 -0
- txtai/embeddings/index/functions.py +155 -0
- txtai/embeddings/index/indexes.py +199 -0
- txtai/embeddings/index/indexids.py +60 -0
- txtai/embeddings/index/reducer.py +104 -0
- txtai/embeddings/index/stream.py +67 -0
- txtai/embeddings/index/transform.py +205 -0
- txtai/embeddings/search/__init__.py +11 -0
- txtai/embeddings/search/base.py +344 -0
- txtai/embeddings/search/errors.py +9 -0
- txtai/embeddings/search/explain.py +120 -0
- txtai/embeddings/search/ids.py +61 -0
- txtai/embeddings/search/query.py +69 -0
- txtai/embeddings/search/scan.py +196 -0
- txtai/embeddings/search/terms.py +46 -0
- txtai/graph/__init__.py +10 -0
- txtai/graph/base.py +769 -0
- txtai/graph/factory.py +61 -0
- txtai/graph/networkx.py +275 -0
- txtai/graph/query.py +181 -0
- txtai/graph/rdbms.py +113 -0
- txtai/graph/topics.py +166 -0
- txtai/models/__init__.py +9 -0
- txtai/models/models.py +268 -0
- txtai/models/onnx.py +133 -0
- txtai/models/pooling/__init__.py +9 -0
- txtai/models/pooling/base.py +141 -0
- txtai/models/pooling/cls.py +28 -0
- txtai/models/pooling/factory.py +144 -0
- txtai/models/pooling/late.py +173 -0
- txtai/models/pooling/mean.py +33 -0
- txtai/models/pooling/muvera.py +164 -0
- txtai/models/registry.py +37 -0
- txtai/models/tokendetection.py +122 -0
- txtai/pipeline/__init__.py +17 -0
- txtai/pipeline/audio/__init__.py +11 -0
- txtai/pipeline/audio/audiomixer.py +58 -0
- txtai/pipeline/audio/audiostream.py +94 -0
- txtai/pipeline/audio/microphone.py +244 -0
- txtai/pipeline/audio/signal.py +186 -0
- txtai/pipeline/audio/texttoaudio.py +60 -0
- txtai/pipeline/audio/texttospeech.py +553 -0
- txtai/pipeline/audio/transcription.py +212 -0
- txtai/pipeline/base.py +23 -0
- txtai/pipeline/data/__init__.py +10 -0
- txtai/pipeline/data/filetohtml.py +206 -0
- txtai/pipeline/data/htmltomd.py +414 -0
- txtai/pipeline/data/segmentation.py +178 -0
- txtai/pipeline/data/tabular.py +155 -0
- txtai/pipeline/data/textractor.py +139 -0
- txtai/pipeline/data/tokenizer.py +112 -0
- txtai/pipeline/factory.py +77 -0
- txtai/pipeline/hfmodel.py +111 -0
- txtai/pipeline/hfpipeline.py +96 -0
- txtai/pipeline/image/__init__.py +7 -0
- txtai/pipeline/image/caption.py +55 -0
- txtai/pipeline/image/imagehash.py +90 -0
- txtai/pipeline/image/objects.py +80 -0
- txtai/pipeline/llm/__init__.py +11 -0
- txtai/pipeline/llm/factory.py +86 -0
- txtai/pipeline/llm/generation.py +173 -0
- txtai/pipeline/llm/huggingface.py +218 -0
- txtai/pipeline/llm/litellm.py +90 -0
- txtai/pipeline/llm/llama.py +152 -0
- txtai/pipeline/llm/llm.py +75 -0
- txtai/pipeline/llm/rag.py +477 -0
- txtai/pipeline/nop.py +14 -0
- txtai/pipeline/tensors.py +52 -0
- txtai/pipeline/text/__init__.py +13 -0
- txtai/pipeline/text/crossencoder.py +70 -0
- txtai/pipeline/text/entity.py +140 -0
- txtai/pipeline/text/labels.py +137 -0
- txtai/pipeline/text/lateencoder.py +103 -0
- txtai/pipeline/text/questions.py +48 -0
- txtai/pipeline/text/reranker.py +57 -0
- txtai/pipeline/text/similarity.py +83 -0
- txtai/pipeline/text/summary.py +98 -0
- txtai/pipeline/text/translation.py +298 -0
- txtai/pipeline/train/__init__.py +7 -0
- txtai/pipeline/train/hfonnx.py +196 -0
- txtai/pipeline/train/hftrainer.py +398 -0
- txtai/pipeline/train/mlonnx.py +63 -0
- txtai/scoring/__init__.py +12 -0
- txtai/scoring/base.py +188 -0
- txtai/scoring/bm25.py +29 -0
- txtai/scoring/factory.py +95 -0
- txtai/scoring/pgtext.py +181 -0
- txtai/scoring/sif.py +32 -0
- txtai/scoring/sparse.py +218 -0
- txtai/scoring/terms.py +499 -0
- txtai/scoring/tfidf.py +358 -0
- txtai/serialize/__init__.py +10 -0
- txtai/serialize/base.py +85 -0
- txtai/serialize/errors.py +9 -0
- txtai/serialize/factory.py +29 -0
- txtai/serialize/messagepack.py +42 -0
- txtai/serialize/pickle.py +98 -0
- txtai/serialize/serializer.py +46 -0
- txtai/util/__init__.py +7 -0
- txtai/util/resolver.py +32 -0
- txtai/util/sparsearray.py +62 -0
- txtai/util/template.py +16 -0
- txtai/vectors/__init__.py +8 -0
- txtai/vectors/base.py +476 -0
- txtai/vectors/dense/__init__.py +12 -0
- txtai/vectors/dense/external.py +55 -0
- txtai/vectors/dense/factory.py +121 -0
- txtai/vectors/dense/huggingface.py +44 -0
- txtai/vectors/dense/litellm.py +86 -0
- txtai/vectors/dense/llama.py +84 -0
- txtai/vectors/dense/m2v.py +67 -0
- txtai/vectors/dense/sbert.py +92 -0
- txtai/vectors/dense/words.py +211 -0
- txtai/vectors/recovery.py +57 -0
- txtai/vectors/sparse/__init__.py +7 -0
- txtai/vectors/sparse/base.py +90 -0
- txtai/vectors/sparse/factory.py +55 -0
- txtai/vectors/sparse/sbert.py +34 -0
- txtai/version.py +6 -0
- txtai/workflow/__init__.py +8 -0
- txtai/workflow/base.py +184 -0
- txtai/workflow/execute.py +99 -0
- txtai/workflow/factory.py +42 -0
- txtai/workflow/task/__init__.py +18 -0
- txtai/workflow/task/base.py +490 -0
- txtai/workflow/task/console.py +24 -0
- txtai/workflow/task/export.py +64 -0
- txtai/workflow/task/factory.py +89 -0
- txtai/workflow/task/file.py +28 -0
- txtai/workflow/task/image.py +36 -0
- txtai/workflow/task/retrieve.py +61 -0
- txtai/workflow/task/service.py +102 -0
- txtai/workflow/task/storage.py +110 -0
- txtai/workflow/task/stream.py +33 -0
- txtai/workflow/task/template.py +116 -0
- txtai/workflow/task/url.py +20 -0
- txtai/workflow/task/workflow.py +14 -0
@@ -0,0 +1,196 @@
|
|
1
|
+
"""
|
2
|
+
Scan module
|
3
|
+
"""
|
4
|
+
|
5
|
+
|
6
|
+
class Scan:
    """
    Runs the similar clauses found in parsed queries against index searches.
    """

    def __init__(self, search, limit, weights, index):
        """
        Builds a scan instance.

        Args:
            search: index search function, called as search(texts, candidates, weights, index)
            limit: maximum results
            weights: default hybrid score weights
            index: default index name
        """

        # Function that runs a batch of query texts against an index
        self.search = search

        # Query limit, used to derive the default number of candidates
        self.limit = limit

        # Default number of candidates
        self.candidates = None

        # Weights used when no clause in a batch sets them
        self.weights = weights

        # Index used when a clause does not name one
        self.index = index

    def __call__(self, queries, parameters):
        """
        Runs every similar clause in a list of parsed queries.

        Args:
            queries: list of parsed queries
            parameters: list of dicts of named parameters to bind to placeholders

        Returns:
            list of (query id, search result), one entry per similar clause, ordered by clause id
        """

        # Clause uid -> (query id, search result)
        matches = {}

        # Fallback number of candidates, only derived when a batch needs it
        fallback = None

        # Clauses are grouped by target index, each group is run as a single batch
        for name, clauses in self.parse(queries, parameters).items():
            # Largest explicit candidates value in the batch wins, otherwise use the fallback
            sizes = [clause.candidates for clause in clauses if clause.candidates]
            if sizes:
                size = max(sizes)
            else:
                if not fallback:
                    fallback = self.default(queries)

                size = fallback

            # Largest explicit weights value in the batch wins, otherwise use the instance default
            scales = [clause.weights for clause in clauses if clause.weights is not None]
            scale = max(scales) if scales else self.weights

            # Clauses without an index name go to the default index
            target = name or self.index

            # Run the batch and store each result under its clause uid
            texts = [clause.text for clause in clauses]
            for position, result in enumerate(self.search(texts, size, scale, target)):
                clause = clauses[position]
                matches[clause.uid] = (clause.qid, result)

        # Order by clause uid
        return [matches[uid] for uid in sorted(matches)]

    def parse(self, queries, parameters):
        """
        Builds clause objects for each similar clause in a list of parsed queries.

        Args:
            queries: list of parsed queries
            parameters: list of dicts of named parameters to bind to placeholders

        Returns:
            dict of index name to list of clauses
        """

        grouped, uid = {}, 0
        for qid, query in enumerate(queries):
            # Only queries with similar clauses produce index searches
            if "similar" not in query:
                continue

            for args in query["similar"]:
                # Swap placeholders for bound values when this query has parameters
                if parameters and parameters[qid]:
                    args = self.bind(args, parameters[qid])

                # Parse clause arguments and file the clause under its index
                clause = Clause(uid, qid, args)
                grouped.setdefault(clause.index, []).append(clause)

                # Each clause gets a unique, increasing id
                uid += 1

        return grouped

    def bind(self, similar, parameters):
        """
        Replaces ":name" placeholders in similar function call arguments with bound values.

        Args:
            similar: similar function call arguments
            parameters: bind parameters

        Returns:
            list of arguments, placeholders without a matching bind parameter are kept as is
        """

        def resolve(arg):
            # Only string arguments of the form ":name" with a matching parameter are replaced
            if isinstance(arg, str) and arg.startswith(":") and arg[1:] in parameters:
                return parameters[arg[1:]]

            return arg

        return [resolve(arg) for arg in similar]

    def default(self, queries):
        """
        Derives the default number of candidates, which is the number of results to bring back from
        index queries when a similar() clause does not set it.

        If the where clause of any query has more than one whitespace-separated token, the default is
        10x the query limit, which leaves room for additional filtering after the index query.
        Otherwise, the default is the query limit.

        Args:
            queries: list of queries

        Returns:
            default candidate list size
        """

        for query in queries:
            where = query.get("where")
            if where and len(where.split()) > 1:
                return self.limit * 10

        return self.limit
|
152
|
+
|
153
|
+
|
154
|
+
class Clause:
    """
    Parses and stores the arguments of a single similar() query clause.
    """

    def __init__(self, uid, qid, params):
        """
        Creates a new query clause.

        Args:
            uid: query clause id
            qid: id of the query this clause is a part of
            params: clause arguments, the first is the query text, the rest are optional and parsed by type
        """

        self.uid, self.qid = uid, qid
        self.text, self.index = params[0], None
        self.candidates, self.weights = None, None

        # Parse additional similar clause parameters
        if len(params) > 1:
            self.parse(params[1:])

    def parse(self, params):
        """
        Parses optional clause parameters into this instance. Each parameter is classified by its
        value: integers set the number of candidates, floats set the hybrid score weights and
        anything else is taken as the target index name.

        Args:
            params: query clause parameters
        """

        for param in params:
            text = param if isinstance(param, str) else None

            # isdecimal() only accepts characters int() and float() can convert, isdigit() also
            # accepts characters such as superscript digits, which made int() raise a ValueError
            if isinstance(param, int) or (text is not None and text.isdecimal()):
                # Number of query candidates
                self.candidates = int(param)

            # Strip at most one decimal point, otherwise a value like "1.2.3" passes the check
            # and float() raises a ValueError
            elif isinstance(param, float) or (text is not None and text.replace(".", "", 1).isdecimal()):
                # Hybrid score weights
                self.weights = float(param)

            else:
                # Target index
                self.index = param
|
@@ -0,0 +1,46 @@
|
|
1
|
+
"""
|
2
|
+
Terms module
|
3
|
+
"""
|
4
|
+
|
5
|
+
|
6
|
+
class Terms:
    """
    Reduces a query statement down to keyword terms. When a database is available, each query is parsed and
    the arguments of its similar clauses are joined into a single string. Otherwise, the original query is returned.
    """

    def __init__(self, embeddings):
        """
        Create a new terms action.

        Args:
            embeddings: embeddings instance
        """

        # Database used to parse queries, queries are returned unchanged when this is not set
        self.database = embeddings.database

    def __call__(self, queries):
        """
        Extracts keyword terms from a list of queries.

        Args:
            queries: list of queries

        Returns:
            list of queries reduced down to keyword term strings, a parsed query without
            similar clauses is reduced to an empty string
        """

        # Return original queries when there is no database to parse them with
        if not self.database:
            return queries

        terms = []
        for query in queries:
            # Parse query
            parse = self.database.parse(query)

            # Join arguments from similar clauses. Parsed queries do not always have a "similar" key,
            # reading it with a default avoids a KeyError for those queries.
            terms.append(" ".join(" ".join(s) for s in parse.get("similar", [])))

        return terms
|