mseep-txtai 9.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mseep_txtai-9.1.1.dist-info/METADATA +262 -0
- mseep_txtai-9.1.1.dist-info/RECORD +251 -0
- mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
- mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
- mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
- txtai/__init__.py +16 -0
- txtai/agent/__init__.py +12 -0
- txtai/agent/base.py +54 -0
- txtai/agent/factory.py +39 -0
- txtai/agent/model.py +107 -0
- txtai/agent/placeholder.py +16 -0
- txtai/agent/tool/__init__.py +7 -0
- txtai/agent/tool/embeddings.py +69 -0
- txtai/agent/tool/factory.py +130 -0
- txtai/agent/tool/function.py +49 -0
- txtai/ann/__init__.py +7 -0
- txtai/ann/base.py +153 -0
- txtai/ann/dense/__init__.py +11 -0
- txtai/ann/dense/annoy.py +72 -0
- txtai/ann/dense/factory.py +76 -0
- txtai/ann/dense/faiss.py +233 -0
- txtai/ann/dense/hnsw.py +104 -0
- txtai/ann/dense/numpy.py +164 -0
- txtai/ann/dense/pgvector.py +323 -0
- txtai/ann/dense/sqlite.py +303 -0
- txtai/ann/dense/torch.py +38 -0
- txtai/ann/sparse/__init__.py +7 -0
- txtai/ann/sparse/factory.py +61 -0
- txtai/ann/sparse/ivfsparse.py +377 -0
- txtai/ann/sparse/pgsparse.py +56 -0
- txtai/api/__init__.py +18 -0
- txtai/api/application.py +134 -0
- txtai/api/authorization.py +53 -0
- txtai/api/base.py +159 -0
- txtai/api/cluster.py +295 -0
- txtai/api/extension.py +19 -0
- txtai/api/factory.py +40 -0
- txtai/api/responses/__init__.py +7 -0
- txtai/api/responses/factory.py +30 -0
- txtai/api/responses/json.py +56 -0
- txtai/api/responses/messagepack.py +51 -0
- txtai/api/route.py +41 -0
- txtai/api/routers/__init__.py +25 -0
- txtai/api/routers/agent.py +38 -0
- txtai/api/routers/caption.py +42 -0
- txtai/api/routers/embeddings.py +280 -0
- txtai/api/routers/entity.py +42 -0
- txtai/api/routers/extractor.py +28 -0
- txtai/api/routers/labels.py +47 -0
- txtai/api/routers/llm.py +61 -0
- txtai/api/routers/objects.py +42 -0
- txtai/api/routers/openai.py +191 -0
- txtai/api/routers/rag.py +61 -0
- txtai/api/routers/reranker.py +46 -0
- txtai/api/routers/segmentation.py +42 -0
- txtai/api/routers/similarity.py +48 -0
- txtai/api/routers/summary.py +46 -0
- txtai/api/routers/tabular.py +42 -0
- txtai/api/routers/textractor.py +42 -0
- txtai/api/routers/texttospeech.py +33 -0
- txtai/api/routers/transcription.py +42 -0
- txtai/api/routers/translation.py +46 -0
- txtai/api/routers/upload.py +36 -0
- txtai/api/routers/workflow.py +28 -0
- txtai/app/__init__.py +5 -0
- txtai/app/base.py +821 -0
- txtai/archive/__init__.py +9 -0
- txtai/archive/base.py +104 -0
- txtai/archive/compress.py +51 -0
- txtai/archive/factory.py +25 -0
- txtai/archive/tar.py +49 -0
- txtai/archive/zip.py +35 -0
- txtai/cloud/__init__.py +8 -0
- txtai/cloud/base.py +106 -0
- txtai/cloud/factory.py +70 -0
- txtai/cloud/hub.py +101 -0
- txtai/cloud/storage.py +125 -0
- txtai/console/__init__.py +5 -0
- txtai/console/__main__.py +22 -0
- txtai/console/base.py +264 -0
- txtai/data/__init__.py +10 -0
- txtai/data/base.py +138 -0
- txtai/data/labels.py +42 -0
- txtai/data/questions.py +135 -0
- txtai/data/sequences.py +48 -0
- txtai/data/texts.py +68 -0
- txtai/data/tokens.py +28 -0
- txtai/database/__init__.py +14 -0
- txtai/database/base.py +342 -0
- txtai/database/client.py +227 -0
- txtai/database/duckdb.py +150 -0
- txtai/database/embedded.py +76 -0
- txtai/database/encoder/__init__.py +8 -0
- txtai/database/encoder/base.py +37 -0
- txtai/database/encoder/factory.py +56 -0
- txtai/database/encoder/image.py +43 -0
- txtai/database/encoder/serialize.py +28 -0
- txtai/database/factory.py +77 -0
- txtai/database/rdbms.py +569 -0
- txtai/database/schema/__init__.py +6 -0
- txtai/database/schema/orm.py +99 -0
- txtai/database/schema/statement.py +98 -0
- txtai/database/sql/__init__.py +8 -0
- txtai/database/sql/aggregate.py +178 -0
- txtai/database/sql/base.py +189 -0
- txtai/database/sql/expression.py +404 -0
- txtai/database/sql/token.py +342 -0
- txtai/database/sqlite.py +57 -0
- txtai/embeddings/__init__.py +7 -0
- txtai/embeddings/base.py +1107 -0
- txtai/embeddings/index/__init__.py +14 -0
- txtai/embeddings/index/action.py +15 -0
- txtai/embeddings/index/autoid.py +92 -0
- txtai/embeddings/index/configuration.py +71 -0
- txtai/embeddings/index/documents.py +86 -0
- txtai/embeddings/index/functions.py +155 -0
- txtai/embeddings/index/indexes.py +199 -0
- txtai/embeddings/index/indexids.py +60 -0
- txtai/embeddings/index/reducer.py +104 -0
- txtai/embeddings/index/stream.py +67 -0
- txtai/embeddings/index/transform.py +205 -0
- txtai/embeddings/search/__init__.py +11 -0
- txtai/embeddings/search/base.py +344 -0
- txtai/embeddings/search/errors.py +9 -0
- txtai/embeddings/search/explain.py +120 -0
- txtai/embeddings/search/ids.py +61 -0
- txtai/embeddings/search/query.py +69 -0
- txtai/embeddings/search/scan.py +196 -0
- txtai/embeddings/search/terms.py +46 -0
- txtai/graph/__init__.py +10 -0
- txtai/graph/base.py +769 -0
- txtai/graph/factory.py +61 -0
- txtai/graph/networkx.py +275 -0
- txtai/graph/query.py +181 -0
- txtai/graph/rdbms.py +113 -0
- txtai/graph/topics.py +166 -0
- txtai/models/__init__.py +9 -0
- txtai/models/models.py +268 -0
- txtai/models/onnx.py +133 -0
- txtai/models/pooling/__init__.py +9 -0
- txtai/models/pooling/base.py +141 -0
- txtai/models/pooling/cls.py +28 -0
- txtai/models/pooling/factory.py +144 -0
- txtai/models/pooling/late.py +173 -0
- txtai/models/pooling/mean.py +33 -0
- txtai/models/pooling/muvera.py +164 -0
- txtai/models/registry.py +37 -0
- txtai/models/tokendetection.py +122 -0
- txtai/pipeline/__init__.py +17 -0
- txtai/pipeline/audio/__init__.py +11 -0
- txtai/pipeline/audio/audiomixer.py +58 -0
- txtai/pipeline/audio/audiostream.py +94 -0
- txtai/pipeline/audio/microphone.py +244 -0
- txtai/pipeline/audio/signal.py +186 -0
- txtai/pipeline/audio/texttoaudio.py +60 -0
- txtai/pipeline/audio/texttospeech.py +553 -0
- txtai/pipeline/audio/transcription.py +212 -0
- txtai/pipeline/base.py +23 -0
- txtai/pipeline/data/__init__.py +10 -0
- txtai/pipeline/data/filetohtml.py +206 -0
- txtai/pipeline/data/htmltomd.py +414 -0
- txtai/pipeline/data/segmentation.py +178 -0
- txtai/pipeline/data/tabular.py +155 -0
- txtai/pipeline/data/textractor.py +139 -0
- txtai/pipeline/data/tokenizer.py +112 -0
- txtai/pipeline/factory.py +77 -0
- txtai/pipeline/hfmodel.py +111 -0
- txtai/pipeline/hfpipeline.py +96 -0
- txtai/pipeline/image/__init__.py +7 -0
- txtai/pipeline/image/caption.py +55 -0
- txtai/pipeline/image/imagehash.py +90 -0
- txtai/pipeline/image/objects.py +80 -0
- txtai/pipeline/llm/__init__.py +11 -0
- txtai/pipeline/llm/factory.py +86 -0
- txtai/pipeline/llm/generation.py +173 -0
- txtai/pipeline/llm/huggingface.py +218 -0
- txtai/pipeline/llm/litellm.py +90 -0
- txtai/pipeline/llm/llama.py +152 -0
- txtai/pipeline/llm/llm.py +75 -0
- txtai/pipeline/llm/rag.py +477 -0
- txtai/pipeline/nop.py +14 -0
- txtai/pipeline/tensors.py +52 -0
- txtai/pipeline/text/__init__.py +13 -0
- txtai/pipeline/text/crossencoder.py +70 -0
- txtai/pipeline/text/entity.py +140 -0
- txtai/pipeline/text/labels.py +137 -0
- txtai/pipeline/text/lateencoder.py +103 -0
- txtai/pipeline/text/questions.py +48 -0
- txtai/pipeline/text/reranker.py +57 -0
- txtai/pipeline/text/similarity.py +83 -0
- txtai/pipeline/text/summary.py +98 -0
- txtai/pipeline/text/translation.py +298 -0
- txtai/pipeline/train/__init__.py +7 -0
- txtai/pipeline/train/hfonnx.py +196 -0
- txtai/pipeline/train/hftrainer.py +398 -0
- txtai/pipeline/train/mlonnx.py +63 -0
- txtai/scoring/__init__.py +12 -0
- txtai/scoring/base.py +188 -0
- txtai/scoring/bm25.py +29 -0
- txtai/scoring/factory.py +95 -0
- txtai/scoring/pgtext.py +181 -0
- txtai/scoring/sif.py +32 -0
- txtai/scoring/sparse.py +218 -0
- txtai/scoring/terms.py +499 -0
- txtai/scoring/tfidf.py +358 -0
- txtai/serialize/__init__.py +10 -0
- txtai/serialize/base.py +85 -0
- txtai/serialize/errors.py +9 -0
- txtai/serialize/factory.py +29 -0
- txtai/serialize/messagepack.py +42 -0
- txtai/serialize/pickle.py +98 -0
- txtai/serialize/serializer.py +46 -0
- txtai/util/__init__.py +7 -0
- txtai/util/resolver.py +32 -0
- txtai/util/sparsearray.py +62 -0
- txtai/util/template.py +16 -0
- txtai/vectors/__init__.py +8 -0
- txtai/vectors/base.py +476 -0
- txtai/vectors/dense/__init__.py +12 -0
- txtai/vectors/dense/external.py +55 -0
- txtai/vectors/dense/factory.py +121 -0
- txtai/vectors/dense/huggingface.py +44 -0
- txtai/vectors/dense/litellm.py +86 -0
- txtai/vectors/dense/llama.py +84 -0
- txtai/vectors/dense/m2v.py +67 -0
- txtai/vectors/dense/sbert.py +92 -0
- txtai/vectors/dense/words.py +211 -0
- txtai/vectors/recovery.py +57 -0
- txtai/vectors/sparse/__init__.py +7 -0
- txtai/vectors/sparse/base.py +90 -0
- txtai/vectors/sparse/factory.py +55 -0
- txtai/vectors/sparse/sbert.py +34 -0
- txtai/version.py +6 -0
- txtai/workflow/__init__.py +8 -0
- txtai/workflow/base.py +184 -0
- txtai/workflow/execute.py +99 -0
- txtai/workflow/factory.py +42 -0
- txtai/workflow/task/__init__.py +18 -0
- txtai/workflow/task/base.py +490 -0
- txtai/workflow/task/console.py +24 -0
- txtai/workflow/task/export.py +64 -0
- txtai/workflow/task/factory.py +89 -0
- txtai/workflow/task/file.py +28 -0
- txtai/workflow/task/image.py +36 -0
- txtai/workflow/task/retrieve.py +61 -0
- txtai/workflow/task/service.py +102 -0
- txtai/workflow/task/storage.py +110 -0
- txtai/workflow/task/stream.py +33 -0
- txtai/workflow/task/template.py +116 -0
- txtai/workflow/task/url.py +20 -0
- txtai/workflow/task/workflow.py +14 -0
txtai/app/base.py
ADDED
@@ -0,0 +1,821 @@
|
|
1
|
+
"""
|
2
|
+
Application module
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
|
7
|
+
from multiprocessing.pool import ThreadPool
|
8
|
+
from threading import RLock
|
9
|
+
|
10
|
+
import yaml
|
11
|
+
|
12
|
+
from ..agent import Agent
|
13
|
+
from ..embeddings import Documents, Embeddings
|
14
|
+
from ..pipeline import PipelineFactory
|
15
|
+
from ..workflow import WorkflowFactory
|
16
|
+
|
17
|
+
|
18
|
+
# pylint: disable=R0904
|
19
|
+
class Application:
|
20
|
+
"""
|
21
|
+
Builds YAML-configured txtai applications.
|
22
|
+
"""
|
23
|
+
|
24
|
+
@staticmethod
|
25
|
+
def read(data):
|
26
|
+
"""
|
27
|
+
Reads a YAML configuration file.
|
28
|
+
|
29
|
+
Args:
|
30
|
+
data: input data
|
31
|
+
|
32
|
+
Returns:
|
33
|
+
yaml
|
34
|
+
"""
|
35
|
+
|
36
|
+
if isinstance(data, str):
|
37
|
+
if os.path.exists(data):
|
38
|
+
# Read yaml from file
|
39
|
+
with open(data, "r", encoding="utf-8") as f:
|
40
|
+
# Read configuration
|
41
|
+
return yaml.safe_load(f)
|
42
|
+
|
43
|
+
# Attempt to read yaml from input
|
44
|
+
data = yaml.safe_load(data)
|
45
|
+
if not isinstance(data, str):
|
46
|
+
return data
|
47
|
+
|
48
|
+
# File not found and input is not yaml, raise error
|
49
|
+
raise FileNotFoundError(f"Unable to load file '{data}'")
|
50
|
+
|
51
|
+
# Return unmodified
|
52
|
+
return data
|
53
|
+
|
54
|
+
def __init__(self, config, loaddata=True):
    """
    Builds an Application from configuration: pipelines, workflows, agents and the embeddings index.

    Args:
        config: application configuration, anything accepted by Application.read
        loaddata: If True (default), load existing index data, if available. Otherwise, only load models.
    """

    # Parsed configuration
    self.config = Application.read(config)

    # Buffered documents and embeddings index start out unset
    self.documents = None
    self.embeddings = None

    # Lock guarding embeddings operations
    self.lock = RLock()

    # ThreadPool for scheduled workflows, created on demand
    self.pool = None

    # Components are built in dependency order: pipelines, workflows, agents, then the index
    self.createpipelines()
    self.createworkflows()
    self.createagents()
    self.indexes(loaddata)
|
83
|
+
|
84
|
+
def __del__(self):
    """
    Closes the workflow thread pool, if one was created, when this object is garbage collected.
    """

    # The attribute may be missing if construction failed early
    pool = getattr(self, "pool", None)
    if pool:
        pool.close()
        self.pool = None
|
92
|
+
|
93
|
+
def createpipelines(self):
    """
    Creates all pipelines that have a configuration entry and stores them in self.pipelines.
    """

    # Pipeline instances by name
    self.pipelines = {}

    # Built-in pipeline names plus custom pipelines (configuration keys containing a ".")
    names = list(PipelineFactory.list().keys())
    names.extend(key for key in self.config if "." in key)

    # Pipelines that reference other pipelines are created last, in this fixed order
    order = {name: rank for rank, name in enumerate(["similarity", "extractor", "rag", "reranker"], 1)}
    names.sort(key=lambda name: order.get(name, 0))

    for name in names:
        # Only configured pipelines are created
        if name not in self.config:
            continue

        # An empty or missing section is treated as no parameters
        config = self.config[name] or {}

        # Replace the application placeholder with this instance, if requested
        if "application" in config:
            config["application"] = self

        if name in ["extractor", "rag"]:
            # Placeholder, replaced with the embeddings index once it is initialized
            if "similarity" not in config:
                config["similarity"] = None

            # Swap pipeline names for the pipeline instances created so far
            if config.get("similarity") in self.pipelines:
                config["similarity"] = self.pipelines[config["similarity"]]

            if config.get("path") in self.pipelines:
                config["path"] = self.pipelines[config["path"]]

        elif name == "similarity" and "path" not in config and "labels" in self.pipelines:
            # No path given, reuse the labels pipeline as the model
            config["model"] = self.pipelines["labels"]

        elif name == "reranker":
            # Embeddings are attached after the index is created
            config["embeddings"] = None
            config["similarity"] = self.pipelines["similarity"]

        self.pipelines[name] = PipelineFactory.create(config, name)
|
143
|
+
|
144
|
+
def createworkflows(self):
    """
    Creates all configured workflows, stores them in self.workflows and starts any configured schedules.
    """

    # Workflow instances by name
    self.workflows = {}

    # Nothing to do without a workflow section
    if "workflow" not in self.config:
        return

    for name, settings in self.config["workflow"].items():
        # Work on a copy of the workflow settings
        settings = settings.copy()

        # Replace task definitions with resolved tasks
        settings["tasks"] = [self.resolvetask(task) for task in settings["tasks"]]

        # Stream definition is resolved the same way as a task
        if "stream" in settings:
            settings["stream"] = self.resolvetask(settings["stream"])

        # Schedule settings are not passed to the workflow factory
        schedule = settings.pop("schedule", None)

        self.workflows[name] = WorkflowFactory.create(settings, name)

        if schedule:
            # Pool is created with the first scheduled workflow
            if not self.pool:
                self.pool = ThreadPool()

            self.pool.apply_async(self.workflows[name].schedule, kwds=schedule)
|
178
|
+
|
179
|
+
def createagents(self):
    """
    Creates all configured agents and stores them in self.agents.

    Each agent is given the application's "llm" function. Tool entries that are dicts with a
    "target" have that target resolved through self.function.
    """

    # Agent instances by name
    self.agents = {}

    # Nothing to do without an agent section
    if "agent" not in self.config:
        return

    for name, settings in self.config["agent"].items():
        # Shallow copy, top-level keys can be replaced without touching self.config
        settings = settings.copy()

        # Resolve LLM
        settings["llm"] = self.function("llm")

        # Resolve tools into new dicts. The copy above is shallow, so writing the resolved target
        # into the original tool dict would leave callables inside self.config.
        if "tools" in settings:
            settings["tools"] = [
                {**tool, "target": self.function(tool["target"])} if isinstance(tool, dict) and "target" in tool else tool
                for tool in settings["tools"]
            ]

        # Create agent
        self.agents[name] = Agent(**settings)
|
203
|
+
|
204
|
+
def indexes(self, loaddata):
    """
    Loads or creates the embeddings index and attaches it to pipelines that use it.

    Args:
        loaddata: If True, load existing index data, if available. Otherwise, only create from configuration.
    """

    # Embeddings configuration with application functions resolved
    settings = self.config.get("embeddings")
    if settings:
        settings = self.resolveconfig(settings.copy())

    path, cloud = self.config.get("path"), self.config.get("cloud")

    if loaddata and Embeddings().exists(path, cloud):
        # Start from an empty instance and load the stored index
        self.embeddings = Embeddings()

        # Only resolved functions and transform are passed as configuration overrides
        overrides = {key: settings[key] for key in ["functions", "transform"] if key in settings} if settings else None
        self.embeddings.load(path, cloud, overrides)

    elif "embeddings" in self.config:
        # New embeddings built from configuration
        self.embeddings = Embeddings(settings)

    # Extractor / RAG pipelines configured with a None similarity use the embeddings index
    for key in ["extractor", "rag"]:
        pipeline = self.pipelines.get(key)
        options = self.config.get(key)

        if pipeline and options is not None and options["similarity"] is None:
            pipeline.similarity = self.embeddings

    # Reranker always gets the current embeddings
    if "reranker" in self.pipelines:
        self.pipelines["reranker"].embeddings = self.embeddings
|
245
|
+
|
246
|
+
def resolvetask(self, task):
    """
    Resolves callable functions for a task.

    The input is never modified: dict tasks are copied before resolved callables and overrides
    are written, so the caller's configuration keeps its original action names and can be
    resolved again.

    Args:
        task: input task config, either a dict or the shorthand action (str or list of actions)

    Returns:
        new task dict with action, initialize and finalize resolved to callables
    """

    # Expand task shorthand syntax, otherwise work on a copy of the task
    task = {"action": task} if isinstance(task, (str, list)) else task.copy()

    if "action" in task:
        action = task["action"]
        single = not isinstance(action, list)

        resolved = []
        for name in [action] if single else action:
            if name in ["index", "upsert"]:
                # Add queue action to buffer documents to index
                resolved.append(self.add)

                # Override and disable unpacking for indexing actions
                task["unpack"] = False

                # Add finalize to trigger indexing
                task["finalize"] = self.upsert if name == "upsert" else self.index
            elif name == "search":
                resolved.append(self.batchsearch)
            elif name == "transform":
                # Transform vectors
                resolved.append(self.batchtransform)

                # Override and disable one-to-many transformations
                task["onetomany"] = False
            else:
                # Resolve action to callable function
                resolved.append(self.function(name))

        # A single action stays a single callable, a list stays a list
        task["action"] = resolved[0] if single else resolved

    # Resolve initializer and finalizer given by name
    for hook in ["initialize", "finalize"]:
        if hook in task and isinstance(task[hook], str):
            task[hook] = self.function(task[hook])

    return task
|
296
|
+
|
297
|
+
def resolveconfig(self, config):
    """
    Replaces function references in an embeddings configuration with callables.

    Args:
        config: embeddings config, updated in place

    Returns:
        the same config with "functions" and "transform" resolved
    """

    def resolve(entry):
        """
        Resolves a single functions entry, returning the entry unchanged when it can't be resolved.
        """

        try:
            if not isinstance(entry, dict):
                return self.function(entry)

            # Dict entries keep their other keys, only "function" is replaced on a copy
            clone = entry.copy()
            clone["function"] = self.function(clone["function"])
            return clone

        # pylint: disable=W0703
        except Exception:
            # Not a resolvable function, pipeline or workflow - further resolution will happen in embeddings
            return entry

    if "functions" in config:
        config["functions"] = [resolve(entry) for entry in config["functions"]]

    # Transform must resolve, errors are not suppressed here
    if "transform" in config:
        config["transform"] = self.function(config["transform"])

    return config
|
334
|
+
|
335
|
+
def function(self, function):
    """
    Looks up a callable by name.

    Args:
        function: function name

    Returns:
        matching pipeline, otherwise matching workflow, otherwise the result of PipelineFactory.create
    """

    if function in self.pipelines:
        # Pipelines take precedence
        target = self.pipelines[function]
    elif function in self.workflows:
        # Then workflows
        target = self.workflows[function]
    else:
        # Last resort, let the pipeline factory resolve the name
        target = PipelineFactory.create({}, function)

    return target
|
356
|
+
|
357
|
+
def search(self, query, limit=10, weights=None, index=None, parameters=None, graph=False):
    """
    Runs an embeddings search for a single query.

    Args:
        query: input query
        limit: maximum results
        weights: hybrid score weights, if applicable
        index: index name, if applicable
        parameters: dict of named parameters to bind to placeholders
        graph: return graph results if True

    Returns:
        None if there is no embeddings index. Embeddings results as-is when graph is True. Otherwise
        a list where (id, score) tuples are converted to {id: value, score: value} and other rows are unchanged.
    """

    if not self.embeddings:
        return None

    # The search itself runs under the lock
    with self.lock:
        hits = self.embeddings.search(query, limit, weights, index, parameters, graph)

    if graph:
        return hits

    # Unpack (id, score) tuples, rows that are not tuples pass through
    rows = []
    for hit in hits:
        rows.append({"id": hit[0], "score": float(hit[1])} if isinstance(hit, tuple) else hit)

    return rows
|
382
|
+
|
383
|
+
def batchsearch(self, queries, limit=10, weights=None, index=None, parameters=None, graph=False):
    """
    Runs an embeddings search for a list of queries.

    Args:
        queries: input queries
        limit: maximum results
        weights: hybrid score weights, if applicable
        index: index name, if applicable
        parameters: list of dicts of named parameters to bind to placeholders
        graph: return graph results if True

    Returns:
        None if there is no embeddings index. Otherwise one entry per query: the embeddings result as-is
        when graph is True, else a list where (id, score) tuples are converted to {id: value, score: value}.
    """

    if not self.embeddings:
        return None

    # The search itself runs under the lock
    with self.lock:
        batches = self.embeddings.batchsearch(queries, limit, weights, index, parameters, graph)

    # Unpack (id, score) tuples per query, rows that are not tuples pass through
    return [
        hits if graph else [{"id": hit[0], "score": float(hit[1])} if isinstance(hit, tuple) else hit for hit in hits]
        for hits in batches
    ]
|
411
|
+
|
412
|
+
def add(self, documents):
    """
    Adds a batch of documents for indexing.

    Args:
        documents: iterable of {id: value, data: value, tags: value}

    Returns:
        input documents. A one-shot iterator (such as a generator) is returned as the list of
        rows that was buffered, since the iterator itself is consumed by buffering.

    Raises:
        ReadOnlyError: if the index is not writable
    """

    # Raise error if index is not writable
    if not self.config.get("writable"):
        raise ReadOnlyError("Attempting to add documents to a read-only index (writable != True)")

    if self.embeddings:
        # Buffering consumes the input. An iterator would be handed back exhausted, so it is
        # materialized once here and the materialized rows are returned instead.
        if iter(documents) is documents:
            documents = list(documents)

        with self.lock:
            # Create document buffer if not already open
            if not self.documents:
                self.documents = Documents()

            # Buffer a copy of the batch
            self.documents.add(list(documents))

    # Return input documents
    return documents
|
438
|
+
|
439
|
+
def addobject(self, data, uid, field):
    """
    Builds a batch of object documents and adds it for indexing.

    Args:
        data: object content
        uid: optional list of corresponding uids
        field: optional field to set

    Returns:
        result of add for the built documents

    Raises:
        ReadOnlyError: if the index is not writable
    """

    # Raise error if index is not writable
    if not self.config.get("writable"):
        raise ReadOnlyError("Attempting to add documents to a read-only index (writable != True)")

    rows = []
    for position, content in enumerate(data):
        if field and uid:
            # Dict row with id and content stored under field
            rows.append({"id": uid[position], field: content})
        elif field:
            # Dict row without id
            rows.append({field: content})
        elif uid:
            # (id, content) tuple
            rows.append((uid[position], content))
        else:
            # Content only
            rows.append(content)

    return self.add(rows)
|
468
|
+
|
469
|
+
def index(self):
    """
    Builds a new embeddings index from the buffered documents, then clears the buffer.

    Raises:
        ReadOnlyError: if the index is not writable
    """

    # Raise error if index is not writable
    if not self.config.get("writable"):
        raise ReadOnlyError("Attempting to index a read-only index (writable != True)")

    # Nothing to do without an embeddings instance and buffered documents
    if not (self.embeddings and self.documents):
        return

    with self.lock:
        # Re-initialize embeddings without loading existing index data
        self.indexes(False)

        # Scoring index is built first when term weighting is enabled
        if self.embeddings.isweighted():
            self.embeddings.score(self.documents)

        # Build embeddings index
        self.embeddings.index(self.documents)

        # Persist only when a path is configured, otherwise the index stays in memory
        target = self.config.get("path")
        if target:
            self.embeddings.save(target, self.config.get("cloud"))

        # Reset document stream
        self.documents.close()
        self.documents = None
|
497
|
+
|
498
|
+
def upsert(self):
    """
    Upserts the buffered documents into the embeddings index, then clears the buffer.

    Raises:
        ReadOnlyError: if the index is not writable
    """

    # Raise error if index is not writable
    if not self.config.get("writable"):
        raise ReadOnlyError("Attempting to upsert a read-only index (writable != True)")

    # Nothing to do without an embeddings instance and buffered documents
    if not (self.embeddings and self.documents):
        return

    with self.lock:
        # Run upsert
        self.embeddings.upsert(self.documents)

        # Persist only when a path is configured, otherwise the index stays in memory
        target = self.config.get("path")
        if target:
            self.embeddings.save(target, self.config.get("cloud"))

        # Reset document stream
        self.documents.close()
        self.documents = None
|
519
|
+
|
520
|
+
def delete(self, ids):
    """
    Deletes ids from the embeddings index.

    Args:
        ids: list of ids to delete

    Returns:
        ids deleted as returned by the embeddings index, None if there is no embeddings index

    Raises:
        ReadOnlyError: if the index is not writable
    """

    # Raise error if index is not writable
    if not self.config.get("writable"):
        raise ReadOnlyError("Attempting to delete from a read-only index (writable != True)")

    if not self.embeddings:
        return None

    with self.lock:
        # Run delete operation
        removed = self.embeddings.delete(ids)

        # Persist only when a path is configured, otherwise the index stays in memory
        target = self.config.get("path")
        if target:
            self.embeddings.save(target, self.config.get("cloud"))

        return removed
|
548
|
+
|
549
|
+
def reindex(self, config, function=None):
    """
    Rebuilds the embeddings index with a new configuration.

    Args:
        config: new config
        function: optional function to prepare content for indexing, a name is resolved through self.function

    Raises:
        ReadOnlyError: if the index is not writable
    """

    # Raise error if index is not writable
    if not self.config.get("writable"):
        raise ReadOnlyError("Attempting to reindex a read-only index (writable != True)")

    if not self.embeddings:
        return

    with self.lock:
        # Function names are resolved to callables, anything else is passed through
        if function and isinstance(function, str):
            function = self.function(function)

        # Reindex
        self.embeddings.reindex(config, function)

        # Persist only when a path is configured, otherwise the index stays in memory
        target = self.config.get("path")
        if target:
            self.embeddings.save(target, self.config.get("cloud"))
|
573
|
+
|
574
|
+
def count(self):
    """
    Gets the element count reported by the embeddings index.

    Returns:
        result of embeddings count, None if there is no embeddings index
    """

    return self.embeddings.count() if self.embeddings else None
|
586
|
+
|
587
|
+
def similarity(self, query, texts):
    """
    Scores a list of text against a query.

    Args:
        query: query text
        texts: list of text

    Returns:
        list of {id: value, score: value} built from the (id, score) pairs returned by the scorer,
        None if neither a similarity pipeline nor an embeddings index is available
    """

    if "similarity" in self.pipelines:
        # Similarity pipeline takes precedence
        pairs = self.pipelines["similarity"](query, texts)
    elif self.embeddings:
        # Fall back to the embeddings model
        pairs = self.embeddings.similarity(query, texts)
    else:
        return None

    return [{"id": uid, "score": float(score)} for uid, score in pairs]
|
608
|
+
|
609
|
+
def batchsimilarity(self, queries, texts):
    """
    Scores a list of text against each query in a list of queries.

    Args:
        queries: queries text
        texts: list of text

    Returns:
        list of {id: value, score: value} per query built from the (id, score) pairs returned by the
        scorer, None if neither a similarity pipeline nor an embeddings index is available
    """

    if "similarity" in self.pipelines:
        # Similarity pipeline takes precedence
        batches = self.pipelines["similarity"](queries, texts)
    elif self.embeddings:
        # Fall back to the embeddings model
        batches = self.embeddings.batchsimilarity(queries, texts)
    else:
        return None

    return [[{"id": uid, "score": float(score)} for uid, score in pairs] for pairs in batches]
|
630
|
+
|
631
|
+
def explain(self, query, texts=None, limit=10):
    """
    Explains how much each input token in text contributes to a query.

    The call is delegated to the embeddings instance while holding self.lock.

    Args:
        query: query text
        texts: optional list of text, otherwise runs search query
        limit: optional limit if texts is None

    Returns:
        list of dict per input text where a higher token scores represents higher importance relative to the query,
        None if there is no embeddings instance
    """

    # Nothing to explain without an embeddings instance
    if not self.embeddings:
        return None

    with self.lock:
        explanation = self.embeddings.explain(query, texts, limit)

    return explanation
|
649
|
+
|
650
|
+
def batchexplain(self, queries, texts=None, limit=10):
    """
    Explains how much each input token in text contributes to each query in a list of queries.

    The call is delegated to the embeddings instance while holding self.lock.

    Args:
        queries: list of query text
        texts: optional list of text, otherwise runs search queries
        limit: optional limit if texts is None

    Returns:
        list of dict per input text per query where a higher token scores represents higher importance relative to the query,
        None if there is no embeddings instance
    """

    # Nothing to explain without an embeddings instance
    if not self.embeddings:
        return None

    with self.lock:
        explanations = self.embeddings.batchexplain(queries, texts, limit)

    return explanations
|
668
|
+
|
669
|
+
def transform(self, text, category=None, index=None):
    """
    Converts text into an embeddings array of native floats.

    Args:
        text: input text
        category: category for instruction-based embeddings
        index: index name, if applicable

    Returns:
        embeddings array as a list of float, None if there is no embeddings instance
    """

    # No embeddings instance, nothing to transform with
    if not self.embeddings:
        return None

    vector = self.embeddings.transform(text, category, index)

    # Cast each component to a native float
    return list(map(float, vector))
|
686
|
+
|
687
|
+
def batchtransform(self, texts, category=None, index=None):
    """
    Converts a list of text into embeddings arrays of native floats.

    Args:
        texts: list of text
        category: category for instruction-based embeddings
        index: index name, if applicable

    Returns:
        embeddings arrays as a list of list of float, None if there is no embeddings instance
    """

    # No embeddings instance, nothing to transform with
    if not self.embeddings:
        return None

    vectors = self.embeddings.batchtransform(texts, category, index)

    # Cast each component of each row to a native float
    return [list(map(float, row)) for row in vectors]
|
704
|
+
|
705
|
+
def extract(self, queue, texts=None):
    """
    Runs the "extractor" pipeline to answer a list of questions.

    Both an embeddings instance and an "extractor" pipeline must be available.

    Args:
        queue: list of {name: value, query: value, question: value, snippet: value}
        texts: optional list of text

    Returns:
        list of {name: value, answer: value}, None if the embeddings instance or
        the extractor pipeline is missing
    """

    # Both an embeddings instance and an extractor pipeline are required
    if not self.embeddings or "extractor" not in self.pipelines:
        return None

    # Run extractor and hand back its results unchanged
    return self.pipelines["extractor"](queue, texts)
|
725
|
+
|
726
|
+
def label(self, text, labels):
    """
    Classifies text against a list of labels using the "labels" pipeline.

    Args:
        text: text|list
        labels: list of labels

    Returns:
        list of {id: value, score: value} when text is a string, otherwise one such
        list per text element. id is the index in labels. None if there is no
        labels pipeline.
    """

    # No labels pipeline configured
    if "labels" not in self.pipelines:
        return None

    def todicts(rows):
        # Convert (id, score) pairs into dicts with native float scores
        return [{"id": uid, "score": float(score)} for uid, score in rows]

    outputs = self.pipelines["labels"](text, labels)

    # A single string yields a flat list, a list of text yields one list per element
    if isinstance(text, str):
        return todicts(outputs)

    return [todicts(rows) for rows in outputs]
|
748
|
+
|
749
|
+
def pipeline(self, name, *args, **kwargs):
    """
    Runs a named pipeline with the supplied arguments.

    Args:
        name: pipeline name
        args: pipeline positional arguments
        kwargs: pipeline keyword arguments

    Returns:
        pipeline results, None if no pipeline is registered under name
    """

    # Backwards compatibility: a single tuple argument is unpacked into positional arguments
    if len(args) == 1 and isinstance(args[0], tuple):
        args = args[0]

    # Unknown pipeline
    if name not in self.pipelines:
        return None

    target = self.pipelines[name]
    return target(*args, **kwargs)
|
769
|
+
|
770
|
+
def workflow(self, name, elements):
    """
    Runs a named workflow over a collection of elements.

    List elements are converted to tuples before the workflow is called. Input
    that has both __len__ and __getitem__ is passed on as a list, any other
    iterable is passed on lazily as a generator.

    Args:
        name: workflow name
        elements: elements to process

    Returns:
        processed elements
    """

    def convert(element):
        # Lists become tuples, everything else passes through untouched
        if isinstance(element, list):
            return tuple(element)
        return element

    sized = hasattr(elements, "__len__") and hasattr(elements, "__getitem__")
    if sized:
        # Sized input: materialize as a list
        elements = [convert(element) for element in elements]
    else:
        # Unsized input: keep it lazy
        elements = (convert(element) for element in elements)

    # Execute workflow
    runner = self.workflows[name]
    return runner(elements)
|
791
|
+
|
792
|
+
def agent(self, name, *args, **kwargs):
    """
    Runs a named agent with the supplied arguments.

    Args:
        name: agent name
        args: agent positional arguments
        kwargs: agent keyword arguments

    Returns:
        agent results, None if no agent is registered under name
    """

    # Unknown agent
    if name not in self.agents:
        return None

    target = self.agents[name]
    return target(*args, **kwargs)
|
806
|
+
|
807
|
+
def wait(self):
    """
    Shuts down the threadpool, if one exists, and blocks until it completes.

    The pool reference is cleared afterwards.
    """

    pool = self.pool

    # No pool to wait on
    if not pool:
        return

    # Stop accepting work, wait for completion, then drop the reference
    pool.close()
    pool.join()
    self.pool = None
|
816
|
+
|
817
|
+
|
818
|
+
class ReadOnlyError(Exception):
    """
    Raised when a modification is attempted on a read-only index.
    """
|