mseep-txtai 9.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251)
  1. mseep_txtai-9.1.1.dist-info/METADATA +262 -0
  2. mseep_txtai-9.1.1.dist-info/RECORD +251 -0
  3. mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
  4. mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
  5. mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
  6. txtai/__init__.py +16 -0
  7. txtai/agent/__init__.py +12 -0
  8. txtai/agent/base.py +54 -0
  9. txtai/agent/factory.py +39 -0
  10. txtai/agent/model.py +107 -0
  11. txtai/agent/placeholder.py +16 -0
  12. txtai/agent/tool/__init__.py +7 -0
  13. txtai/agent/tool/embeddings.py +69 -0
  14. txtai/agent/tool/factory.py +130 -0
  15. txtai/agent/tool/function.py +49 -0
  16. txtai/ann/__init__.py +7 -0
  17. txtai/ann/base.py +153 -0
  18. txtai/ann/dense/__init__.py +11 -0
  19. txtai/ann/dense/annoy.py +72 -0
  20. txtai/ann/dense/factory.py +76 -0
  21. txtai/ann/dense/faiss.py +233 -0
  22. txtai/ann/dense/hnsw.py +104 -0
  23. txtai/ann/dense/numpy.py +164 -0
  24. txtai/ann/dense/pgvector.py +323 -0
  25. txtai/ann/dense/sqlite.py +303 -0
  26. txtai/ann/dense/torch.py +38 -0
  27. txtai/ann/sparse/__init__.py +7 -0
  28. txtai/ann/sparse/factory.py +61 -0
  29. txtai/ann/sparse/ivfsparse.py +377 -0
  30. txtai/ann/sparse/pgsparse.py +56 -0
  31. txtai/api/__init__.py +18 -0
  32. txtai/api/application.py +134 -0
  33. txtai/api/authorization.py +53 -0
  34. txtai/api/base.py +159 -0
  35. txtai/api/cluster.py +295 -0
  36. txtai/api/extension.py +19 -0
  37. txtai/api/factory.py +40 -0
  38. txtai/api/responses/__init__.py +7 -0
  39. txtai/api/responses/factory.py +30 -0
  40. txtai/api/responses/json.py +56 -0
  41. txtai/api/responses/messagepack.py +51 -0
  42. txtai/api/route.py +41 -0
  43. txtai/api/routers/__init__.py +25 -0
  44. txtai/api/routers/agent.py +38 -0
  45. txtai/api/routers/caption.py +42 -0
  46. txtai/api/routers/embeddings.py +280 -0
  47. txtai/api/routers/entity.py +42 -0
  48. txtai/api/routers/extractor.py +28 -0
  49. txtai/api/routers/labels.py +47 -0
  50. txtai/api/routers/llm.py +61 -0
  51. txtai/api/routers/objects.py +42 -0
  52. txtai/api/routers/openai.py +191 -0
  53. txtai/api/routers/rag.py +61 -0
  54. txtai/api/routers/reranker.py +46 -0
  55. txtai/api/routers/segmentation.py +42 -0
  56. txtai/api/routers/similarity.py +48 -0
  57. txtai/api/routers/summary.py +46 -0
  58. txtai/api/routers/tabular.py +42 -0
  59. txtai/api/routers/textractor.py +42 -0
  60. txtai/api/routers/texttospeech.py +33 -0
  61. txtai/api/routers/transcription.py +42 -0
  62. txtai/api/routers/translation.py +46 -0
  63. txtai/api/routers/upload.py +36 -0
  64. txtai/api/routers/workflow.py +28 -0
  65. txtai/app/__init__.py +5 -0
  66. txtai/app/base.py +821 -0
  67. txtai/archive/__init__.py +9 -0
  68. txtai/archive/base.py +104 -0
  69. txtai/archive/compress.py +51 -0
  70. txtai/archive/factory.py +25 -0
  71. txtai/archive/tar.py +49 -0
  72. txtai/archive/zip.py +35 -0
  73. txtai/cloud/__init__.py +8 -0
  74. txtai/cloud/base.py +106 -0
  75. txtai/cloud/factory.py +70 -0
  76. txtai/cloud/hub.py +101 -0
  77. txtai/cloud/storage.py +125 -0
  78. txtai/console/__init__.py +5 -0
  79. txtai/console/__main__.py +22 -0
  80. txtai/console/base.py +264 -0
  81. txtai/data/__init__.py +10 -0
  82. txtai/data/base.py +138 -0
  83. txtai/data/labels.py +42 -0
  84. txtai/data/questions.py +135 -0
  85. txtai/data/sequences.py +48 -0
  86. txtai/data/texts.py +68 -0
  87. txtai/data/tokens.py +28 -0
  88. txtai/database/__init__.py +14 -0
  89. txtai/database/base.py +342 -0
  90. txtai/database/client.py +227 -0
  91. txtai/database/duckdb.py +150 -0
  92. txtai/database/embedded.py +76 -0
  93. txtai/database/encoder/__init__.py +8 -0
  94. txtai/database/encoder/base.py +37 -0
  95. txtai/database/encoder/factory.py +56 -0
  96. txtai/database/encoder/image.py +43 -0
  97. txtai/database/encoder/serialize.py +28 -0
  98. txtai/database/factory.py +77 -0
  99. txtai/database/rdbms.py +569 -0
  100. txtai/database/schema/__init__.py +6 -0
  101. txtai/database/schema/orm.py +99 -0
  102. txtai/database/schema/statement.py +98 -0
  103. txtai/database/sql/__init__.py +8 -0
  104. txtai/database/sql/aggregate.py +178 -0
  105. txtai/database/sql/base.py +189 -0
  106. txtai/database/sql/expression.py +404 -0
  107. txtai/database/sql/token.py +342 -0
  108. txtai/database/sqlite.py +57 -0
  109. txtai/embeddings/__init__.py +7 -0
  110. txtai/embeddings/base.py +1107 -0
  111. txtai/embeddings/index/__init__.py +14 -0
  112. txtai/embeddings/index/action.py +15 -0
  113. txtai/embeddings/index/autoid.py +92 -0
  114. txtai/embeddings/index/configuration.py +71 -0
  115. txtai/embeddings/index/documents.py +86 -0
  116. txtai/embeddings/index/functions.py +155 -0
  117. txtai/embeddings/index/indexes.py +199 -0
  118. txtai/embeddings/index/indexids.py +60 -0
  119. txtai/embeddings/index/reducer.py +104 -0
  120. txtai/embeddings/index/stream.py +67 -0
  121. txtai/embeddings/index/transform.py +205 -0
  122. txtai/embeddings/search/__init__.py +11 -0
  123. txtai/embeddings/search/base.py +344 -0
  124. txtai/embeddings/search/errors.py +9 -0
  125. txtai/embeddings/search/explain.py +120 -0
  126. txtai/embeddings/search/ids.py +61 -0
  127. txtai/embeddings/search/query.py +69 -0
  128. txtai/embeddings/search/scan.py +196 -0
  129. txtai/embeddings/search/terms.py +46 -0
  130. txtai/graph/__init__.py +10 -0
  131. txtai/graph/base.py +769 -0
  132. txtai/graph/factory.py +61 -0
  133. txtai/graph/networkx.py +275 -0
  134. txtai/graph/query.py +181 -0
  135. txtai/graph/rdbms.py +113 -0
  136. txtai/graph/topics.py +166 -0
  137. txtai/models/__init__.py +9 -0
  138. txtai/models/models.py +268 -0
  139. txtai/models/onnx.py +133 -0
  140. txtai/models/pooling/__init__.py +9 -0
  141. txtai/models/pooling/base.py +141 -0
  142. txtai/models/pooling/cls.py +28 -0
  143. txtai/models/pooling/factory.py +144 -0
  144. txtai/models/pooling/late.py +173 -0
  145. txtai/models/pooling/mean.py +33 -0
  146. txtai/models/pooling/muvera.py +164 -0
  147. txtai/models/registry.py +37 -0
  148. txtai/models/tokendetection.py +122 -0
  149. txtai/pipeline/__init__.py +17 -0
  150. txtai/pipeline/audio/__init__.py +11 -0
  151. txtai/pipeline/audio/audiomixer.py +58 -0
  152. txtai/pipeline/audio/audiostream.py +94 -0
  153. txtai/pipeline/audio/microphone.py +244 -0
  154. txtai/pipeline/audio/signal.py +186 -0
  155. txtai/pipeline/audio/texttoaudio.py +60 -0
  156. txtai/pipeline/audio/texttospeech.py +553 -0
  157. txtai/pipeline/audio/transcription.py +212 -0
  158. txtai/pipeline/base.py +23 -0
  159. txtai/pipeline/data/__init__.py +10 -0
  160. txtai/pipeline/data/filetohtml.py +206 -0
  161. txtai/pipeline/data/htmltomd.py +414 -0
  162. txtai/pipeline/data/segmentation.py +178 -0
  163. txtai/pipeline/data/tabular.py +155 -0
  164. txtai/pipeline/data/textractor.py +139 -0
  165. txtai/pipeline/data/tokenizer.py +112 -0
  166. txtai/pipeline/factory.py +77 -0
  167. txtai/pipeline/hfmodel.py +111 -0
  168. txtai/pipeline/hfpipeline.py +96 -0
  169. txtai/pipeline/image/__init__.py +7 -0
  170. txtai/pipeline/image/caption.py +55 -0
  171. txtai/pipeline/image/imagehash.py +90 -0
  172. txtai/pipeline/image/objects.py +80 -0
  173. txtai/pipeline/llm/__init__.py +11 -0
  174. txtai/pipeline/llm/factory.py +86 -0
  175. txtai/pipeline/llm/generation.py +173 -0
  176. txtai/pipeline/llm/huggingface.py +218 -0
  177. txtai/pipeline/llm/litellm.py +90 -0
  178. txtai/pipeline/llm/llama.py +152 -0
  179. txtai/pipeline/llm/llm.py +75 -0
  180. txtai/pipeline/llm/rag.py +477 -0
  181. txtai/pipeline/nop.py +14 -0
  182. txtai/pipeline/tensors.py +52 -0
  183. txtai/pipeline/text/__init__.py +13 -0
  184. txtai/pipeline/text/crossencoder.py +70 -0
  185. txtai/pipeline/text/entity.py +140 -0
  186. txtai/pipeline/text/labels.py +137 -0
  187. txtai/pipeline/text/lateencoder.py +103 -0
  188. txtai/pipeline/text/questions.py +48 -0
  189. txtai/pipeline/text/reranker.py +57 -0
  190. txtai/pipeline/text/similarity.py +83 -0
  191. txtai/pipeline/text/summary.py +98 -0
  192. txtai/pipeline/text/translation.py +298 -0
  193. txtai/pipeline/train/__init__.py +7 -0
  194. txtai/pipeline/train/hfonnx.py +196 -0
  195. txtai/pipeline/train/hftrainer.py +398 -0
  196. txtai/pipeline/train/mlonnx.py +63 -0
  197. txtai/scoring/__init__.py +12 -0
  198. txtai/scoring/base.py +188 -0
  199. txtai/scoring/bm25.py +29 -0
  200. txtai/scoring/factory.py +95 -0
  201. txtai/scoring/pgtext.py +181 -0
  202. txtai/scoring/sif.py +32 -0
  203. txtai/scoring/sparse.py +218 -0
  204. txtai/scoring/terms.py +499 -0
  205. txtai/scoring/tfidf.py +358 -0
  206. txtai/serialize/__init__.py +10 -0
  207. txtai/serialize/base.py +85 -0
  208. txtai/serialize/errors.py +9 -0
  209. txtai/serialize/factory.py +29 -0
  210. txtai/serialize/messagepack.py +42 -0
  211. txtai/serialize/pickle.py +98 -0
  212. txtai/serialize/serializer.py +46 -0
  213. txtai/util/__init__.py +7 -0
  214. txtai/util/resolver.py +32 -0
  215. txtai/util/sparsearray.py +62 -0
  216. txtai/util/template.py +16 -0
  217. txtai/vectors/__init__.py +8 -0
  218. txtai/vectors/base.py +476 -0
  219. txtai/vectors/dense/__init__.py +12 -0
  220. txtai/vectors/dense/external.py +55 -0
  221. txtai/vectors/dense/factory.py +121 -0
  222. txtai/vectors/dense/huggingface.py +44 -0
  223. txtai/vectors/dense/litellm.py +86 -0
  224. txtai/vectors/dense/llama.py +84 -0
  225. txtai/vectors/dense/m2v.py +67 -0
  226. txtai/vectors/dense/sbert.py +92 -0
  227. txtai/vectors/dense/words.py +211 -0
  228. txtai/vectors/recovery.py +57 -0
  229. txtai/vectors/sparse/__init__.py +7 -0
  230. txtai/vectors/sparse/base.py +90 -0
  231. txtai/vectors/sparse/factory.py +55 -0
  232. txtai/vectors/sparse/sbert.py +34 -0
  233. txtai/version.py +6 -0
  234. txtai/workflow/__init__.py +8 -0
  235. txtai/workflow/base.py +184 -0
  236. txtai/workflow/execute.py +99 -0
  237. txtai/workflow/factory.py +42 -0
  238. txtai/workflow/task/__init__.py +18 -0
  239. txtai/workflow/task/base.py +490 -0
  240. txtai/workflow/task/console.py +24 -0
  241. txtai/workflow/task/export.py +64 -0
  242. txtai/workflow/task/factory.py +89 -0
  243. txtai/workflow/task/file.py +28 -0
  244. txtai/workflow/task/image.py +36 -0
  245. txtai/workflow/task/retrieve.py +61 -0
  246. txtai/workflow/task/service.py +102 -0
  247. txtai/workflow/task/storage.py +110 -0
  248. txtai/workflow/task/stream.py +33 -0
  249. txtai/workflow/task/template.py +116 -0
  250. txtai/workflow/task/url.py +20 -0
  251. txtai/workflow/task/workflow.py +14 -0
txtai/app/base.py ADDED
@@ -0,0 +1,821 @@
1
+ """
2
+ Application module
3
+ """
4
+
5
+ import os
6
+
7
+ from multiprocessing.pool import ThreadPool
8
+ from threading import RLock
9
+
10
+ import yaml
11
+
12
+ from ..agent import Agent
13
+ from ..embeddings import Documents, Embeddings
14
+ from ..pipeline import PipelineFactory
15
+ from ..workflow import WorkflowFactory
16
+
17
+
18
+ # pylint: disable=R0904
19
+ class Application:
20
+ """
21
+ Builds YAML-configured txtai applications.
22
+ """
23
+
24
+ @staticmethod
25
+ def read(data):
26
+ """
27
+ Reads a YAML configuration file.
28
+
29
+ Args:
30
+ data: input data
31
+
32
+ Returns:
33
+ yaml
34
+ """
35
+
36
+ if isinstance(data, str):
37
+ if os.path.exists(data):
38
+ # Read yaml from file
39
+ with open(data, "r", encoding="utf-8") as f:
40
+ # Read configuration
41
+ return yaml.safe_load(f)
42
+
43
+ # Attempt to read yaml from input
44
+ data = yaml.safe_load(data)
45
+ if not isinstance(data, str):
46
+ return data
47
+
48
+ # File not found and input is not yaml, raise error
49
+ raise FileNotFoundError(f"Unable to load file '{data}'")
50
+
51
+ # Return unmodified
52
+ return data
53
+
54
def __init__(self, config, loaddata=True):
    """
    Builds an Application from configuration. Pipelines, workflows and agents are created first,
    followed by the embeddings index.

    Args:
        config: index configuration
        loaddata: If True (default), load existing index data, if available. Otherwise, only load models.
    """

    # Parsed configuration
    self.config = Application.read(config)

    # Buffered documents and embeddings index, both set later
    self.documents = None
    self.embeddings = None

    # Guards embeddings access so only one thread updates the index at a time
    self.lock = RLock()

    # Created on demand when a workflow is scheduled
    self.pool = None

    # Components are built in dependency order: pipelines, workflows, agents, embeddings
    self.createpipelines()
    self.createworkflows()
    self.createagents()
    self.indexes(loaddata)
83
+
84
def __del__(self):
    """
    Closes the thread pool, if one exists, when this object is garbage collected.
    """

    # The attribute may be missing if construction failed before it was assigned
    pool = getattr(self, "pool", None)
    if pool:
        pool.close()
        self.pool = None
92
+
93
def createpipelines(self):
    """
    Creates every pipeline that has an entry in the application configuration and
    stores the instances in self.pipelines, keyed by name.
    """

    # Pipeline instances by name
    self.pipelines = {}

    # Candidate names: factory defaults plus custom pipelines (configuration keys containing a ".")
    names = list(PipelineFactory.list().keys())
    names.extend(key for key in self.config if "." in key)

    # Pipelines that reference other pipelines are created last, in this order
    dependent = ["similarity", "extractor", "rag", "reranker"]

    def rank(name):
        return dependent.index(name) + 1 if name in dependent else 0

    names.sort(key=rank)

    for name in names:
        # Only create pipelines that are configured
        if name not in self.config:
            continue

        # Empty configuration is replaced with a fresh dict
        config = self.config[name] or {}

        # Add application reference, if requested
        if "application" in config:
            config["application"] = self

        if name in ("extractor", "rag"):
            # Placeholder, replaced with the embeddings index once it is initialized
            config.setdefault("similarity", None)

            # Swap names of already created pipelines for the pipeline instances
            for field in ("similarity", "path"):
                if config.get(field) in self.pipelines:
                    config[field] = self.pipelines[config[field]]

        elif name == "similarity" and "path" not in config and "labels" in self.pipelines:
            # No path set, use the labels pipeline as the model
            config["model"] = self.pipelines["labels"]

        elif name == "reranker":
            # Embeddings are attached after the index is created
            config["embeddings"] = None
            config["similarity"] = self.pipelines["similarity"]

        self.pipelines[name] = PipelineFactory.create(config, name)
143
+
144
def createworkflows(self):
    """
    Creates the workflows defined under the "workflow" configuration key and stores
    them in self.workflows. Workflows with a "schedule" entry are submitted to a thread pool.
    """

    # Workflow instances by name
    self.workflows = {}

    if "workflow" not in self.config:
        return

    for name, settings in self.config["workflow"].items():
        # Work on a shallow copy of the workflow configuration
        settings = settings.copy()

        # Resolve task actions to callables
        settings["tasks"] = [self.resolvetask(task) for task in settings["tasks"]]

        # Resolve stream function
        if "stream" in settings:
            settings["stream"] = self.resolvetask(settings["stream"])

        # Scheduler settings are not part of the workflow definition
        schedule = settings.pop("schedule", None)

        workflow = WorkflowFactory.create(settings, name)
        self.workflows[name] = workflow

        if schedule:
            # Pool is created lazily, only when something is scheduled
            if not self.pool:
                self.pool = ThreadPool()

            self.pool.apply_async(workflow.schedule, kwds=schedule)
178
+
179
def createagents(self):
    """
    Creates the agents defined under the "agent" configuration key and stores them in self.agents.
    """

    # Agent instances by name
    self.agents = {}

    if "agent" not in self.config:
        return

    for name, settings in self.config["agent"].items():
        # Work on a shallow copy of the agent configuration
        settings = settings.copy()

        # LLM is always resolved through the application "llm" function
        settings["llm"] = self.function("llm")

        # Tools declared as dicts with a "target" get the target resolved to a callable
        for tool in settings.get("tools", []):
            if isinstance(tool, dict) and "target" in tool:
                tool["target"] = self.function(tool["target"])

        self.agents[name] = Agent(**settings)
203
+
204
def indexes(self, loaddata):
    """
    Initialize an embeddings index and attach it to the pipelines that depend on it.

    Args:
        loaddata: If True (default), load existing index data, if available. Otherwise, only load models.
    """

    # Get embeddings configuration
    config = self.config.get("embeddings")
    if config:
        # Resolve application functions in embeddings config
        config = self.resolveconfig(config.copy())

    # Load embeddings index if loaddata and index exists
    if loaddata and Embeddings().exists(self.config.get("path"), self.config.get("cloud")):
        # Initialize empty embeddings
        self.embeddings = Embeddings()

        # Pass path and cloud settings. Set application functions as config overrides.
        self.embeddings.load(
            self.config.get("path"),
            self.config.get("cloud"),
            {key: config[key] for key in ["functions", "transform"] if key in config} if config else None,
        )

    elif "embeddings" in self.config:
        # Create new embeddings with config
        self.embeddings = Embeddings(config)

    # If an extractor/rag pipeline is defined and no similarity instance was configured, set to embeddings index
    for key in ["extractor", "rag"]:
        pipeline = self.pipelines.get(key)

        # createpipelines replaces an empty or None pipeline config with a fresh dict, so the
        # "similarity" placeholder is not always written back to self.config. Treat a missing
        # config or missing key the same as an explicit None placeholder.
        config = self.config.get(key) or {}

        if pipeline and config.get("similarity") is None:
            pipeline.similarity = self.embeddings

    # Attach embeddings to reranker
    if "reranker" in self.pipelines:
        self.pipelines["reranker"].embeddings = self.embeddings
245
+
246
def resolvetask(self, task):
    """
    Resolves action, initialize and finalize entries of a task configuration into callables.
    A dict input is modified in place.

    Args:
        task: input task config, either a dict or the shorthand form (action name or list of action names)

    Returns:
        task config with resolved callables
    """

    # Expand shorthand syntax into a task dict
    if isinstance(task, (str, list)):
        task = {"action": task}

    if "action" in task:
        action = task["action"]
        single = not isinstance(action, list)

        resolved = []
        for name in [action] if single else action:
            if name in ("index", "upsert"):
                # Buffer documents with add, indexing is triggered by the finalizer
                resolved.append(self.add)

                # Override and disable unpacking for indexing actions
                task["unpack"] = False
                task["finalize"] = self.upsert if name == "upsert" else self.index
            elif name == "search":
                resolved.append(self.batchsearch)
            elif name == "transform":
                resolved.append(self.batchtransform)

                # Override and disable one-to-many transformations
                task["onetomany"] = False
            else:
                # Anything else is looked up as a pipeline, workflow or function
                resolved.append(self.function(name))

        # Keep the shape of the input: single action in, single callable out
        task["action"] = resolved[0] if single else resolved

    # Initializer and finalizer names are resolved the same way as actions
    for hook in ("initialize", "finalize"):
        if hook in task and isinstance(task[hook], str):
            task[hook] = self.function(task[hook])

    return task
296
+
297
def resolveconfig(self, config):
    """
    Resolves the "functions" and "transform" entries of an embeddings configuration
    into callables. The input dict is modified in place.

    Args:
        config: embeddings config

    Returns:
        resolved config
    """

    def resolve(entry):
        """
        Resolves a single functions entry, returning the entry unchanged when resolution fails.
        """

        try:
            if isinstance(entry, dict):
                # Dict form, resolve the "function" field on a copy
                resolved = entry.copy()
                resolved["function"] = self.function(resolved["function"])
                return resolved

            return self.function(entry)

        # pylint: disable=W0703
        except Exception:
            # Not a resolvable function, pipeline or workflow - further resolution will happen in embeddings
            return entry

    if "functions" in config:
        config["functions"] = [resolve(fn) for fn in config["functions"]]

    if "transform" in config:
        # Transform function must resolve, errors propagate
        config["transform"] = self.function(config["transform"])

    return config
334
+
335
def function(self, function):
    """
    Looks up a callable by name. Pipelines are checked first, then workflows. Anything
    else is passed to the pipeline factory.

    Args:
        function: function name

    Returns:
        resolved function
    """

    registry = self.pipelines
    if function not in registry:
        registry = self.workflows
        if function not in registry:
            # Neither a pipeline nor a workflow, attempt to resolve action as a callable function
            return PipelineFactory.create({}, function)

    return registry[function]
356
+
357
def search(self, query, limit=10, weights=None, index=None, parameters=None, graph=False):
    """
    Runs an embeddings search for the input query.

    Args:
        query: input query
        limit: maximum results
        weights: hybrid score weights, if applicable
        index: index name, if applicable
        parameters: dict of named parameters to bind to placeholders
        graph: return graph results if True

    Returns:
        None when there is no embeddings index. Otherwise the unmodified search results when graph is True,
        or a list where (id, score) tuples are converted to {id: value, score: value} and other rows are kept as is.
    """

    if not self.embeddings:
        return None

    with self.lock:
        found = self.embeddings.search(query, limit, weights, index, parameters, graph)

    if graph:
        return found

    rows = []
    for row in found:
        # Index-only searches return (id, score) tuples, convert those to dicts
        if isinstance(row, tuple):
            row = {"id": row[0], "score": float(row[1])}

        rows.append(row)

    return rows
382
+
383
def batchsearch(self, queries, limit=10, weights=None, index=None, parameters=None, graph=False):
    """
    Runs an embeddings search for each input query.

    Args:
        queries: input queries
        limit: maximum results
        weights: hybrid score weights, if applicable
        index: index name, if applicable
        parameters: list of dicts of named parameters to bind to placeholders
        graph: return graph results if True

    Returns:
        None when there is no embeddings index. Otherwise a list with one entry per query: the unmodified
        result when graph is True, or a list where (id, score) tuples are converted to {id: value, score: value}.
    """

    if not self.embeddings:
        return None

    with self.lock:
        batches = self.embeddings.batchsearch(queries, limit, weights, index, parameters, graph)

    def unpack(rows):
        # Index-only searches return (id, score) tuples, convert those to dicts
        return [{"id": r[0], "score": float(r[1])} if isinstance(r, tuple) else r for r in rows]

    return [rows if graph else unpack(rows) for rows in batches]
411
+
412
def add(self, documents):
    """
    Buffers a batch of documents for a later index or upsert call.

    Args:
        documents: list of {id: value, data: value, tags: value}

    Returns:
        the input documents object

    Raises:
        ReadOnlyError: if the "writable" configuration setting is not enabled
    """

    writable = self.config.get("writable")
    if not writable:
        raise ReadOnlyError("Attempting to add documents to a read-only index (writable != True)")

    if self.embeddings:
        with self.lock:
            # Start a new document buffer if one is not already open
            if not self.documents:
                self.documents = Documents()

            batch = list(documents)
            self.documents.add(batch)

    return documents
438
+
439
def addobject(self, data, uid, field):
    """
    Builds a batch of documents from raw content and passes it to add.

    Args:
        data: object content
        uid: optional list of corresponding uids
        field: optional field to set

    Returns:
        result of add for the built documents

    Raises:
        ReadOnlyError: if the "writable" configuration setting is not enabled
    """

    writable = self.config.get("writable")
    if not writable:
        raise ReadOnlyError("Attempting to add documents to a read-only index (writable != True)")

    def build(position, content):
        # Dict row when a field name is given, (uid, content) tuple when only uids are given, else raw content
        if field:
            return {"id": uid[position], field: content} if uid else {field: content}

        return (uid[position], content) if uid else content

    return self.add([build(x, content) for x, content in enumerate(data)])
468
+
469
def index(self):
    """
    Builds a new embeddings index from the buffered documents. Does nothing when there
    is no embeddings index or no buffered documents.

    Raises:
        ReadOnlyError: if the "writable" configuration setting is not enabled
    """

    writable = self.config.get("writable")
    if not writable:
        raise ReadOnlyError("Attempting to index a read-only index (writable != True)")

    if not (self.embeddings and self.documents):
        return

    with self.lock:
        # Reset index, self.embeddings is read again afterwards since this may replace it
        self.indexes(False)
        embeddings, documents = self.embeddings, self.documents

        # Build scoring index if term weighting is enabled
        if embeddings.isweighted():
            embeddings.score(documents)

        embeddings.index(documents)

        # Without a path this is a memory-only index and nothing is saved
        path = self.config.get("path")
        if path:
            embeddings.save(path, self.config.get("cloud"))

        # Reset document stream
        documents.close()
        self.documents = None
497
+
498
def upsert(self):
    """
    Upserts the buffered documents into the embeddings index. Does nothing when there
    is no embeddings index or no buffered documents.

    Raises:
        ReadOnlyError: if the "writable" configuration setting is not enabled
    """

    writable = self.config.get("writable")
    if not writable:
        raise ReadOnlyError("Attempting to upsert a read-only index (writable != True)")

    if not (self.embeddings and self.documents):
        return

    with self.lock:
        self.embeddings.upsert(self.documents)

        # Without a path this is a memory-only index and nothing is saved
        path = self.config.get("path")
        if path:
            self.embeddings.save(path, self.config.get("cloud"))

        # Reset document stream
        self.documents.close()
        self.documents = None
519
+
520
def delete(self, ids):
    """
    Deletes ids from the embeddings index and saves the index when a path is configured.

    Args:
        ids: list of ids to delete

    Returns:
        ids deleted or None when there is no embeddings index

    Raises:
        ReadOnlyError: if the "writable" configuration setting is not enabled
    """

    writable = self.config.get("writable")
    if not writable:
        raise ReadOnlyError("Attempting to delete from a read-only index (writable != True)")

    if not self.embeddings:
        return None

    with self.lock:
        removed = self.embeddings.delete(ids)

        # Without a path this is a memory-only index and nothing is saved
        path = self.config.get("path")
        if path:
            self.embeddings.save(path, self.config.get("cloud"))

        return removed
548
+
549
def reindex(self, config, function=None):
    """
    Rebuilds the embeddings index with a new configuration and saves it when a path is configured.
    Does nothing when there is no embeddings index.

    Args:
        config: new config
        function: optional function to prepare content for indexing, string names are resolved to callables

    Raises:
        ReadOnlyError: if the "writable" configuration setting is not enabled
    """

    writable = self.config.get("writable")
    if not writable:
        raise ReadOnlyError("Attempting to reindex a read-only index (writable != True)")

    if not self.embeddings:
        return

    with self.lock:
        # Non-empty string names are looked up, anything else is passed through
        if function and isinstance(function, str):
            function = self.function(function)

        self.embeddings.reindex(config, function)

        # Without a path this is a memory-only index and nothing is saved
        path = self.config.get("path")
        if path:
            self.embeddings.save(path, self.config.get("cloud"))
573
+
574
def count(self):
    """
    Gets the number of elements in the embeddings index.

    Returns:
        number of elements in embeddings index or None when there is no embeddings index
    """

    return self.embeddings.count() if self.embeddings else None
586
+
587
def similarity(self, query, texts):
    """
    Scores a list of text against a query. The similarity pipeline is used when one is
    configured, otherwise the embeddings index.

    Args:
        query: query text
        texts: list of text

    Returns:
        list of {id: value, score: value} or None when neither a similarity pipeline nor an embeddings index is available
    """

    # Pick the scoring function, the similarity pipeline takes precedence
    if "similarity" in self.pipelines:
        scorer = self.pipelines["similarity"]
    elif self.embeddings:
        scorer = self.embeddings.similarity
    else:
        return None

    return [{"id": uid, "score": float(score)} for uid, score in scorer(query, texts)]
608
+
609
def batchsimilarity(self, queries, texts):
    """
    Scores a list of text against each query. The similarity pipeline is used when one is
    configured, otherwise the embeddings index.

    Args:
        queries: queries text
        texts: list of text

    Returns:
        list of {id: value, score: value} per query or None when neither a similarity pipeline
        nor an embeddings index is available
    """

    # Pick the scoring function, the similarity pipeline takes precedence
    if "similarity" in self.pipelines:
        scorer = self.pipelines["similarity"]
    elif self.embeddings:
        scorer = self.embeddings.batchsimilarity
    else:
        return None

    results = []
    for scores in scorer(queries, texts):
        results.append([{"id": uid, "score": float(score)} for uid, score in scores])

    return results
630
+
631
def explain(self, query, texts=None, limit=10):
    """
    Runs the embeddings explain method for a query.

    Args:
        query: query text
        texts: optional list of text, otherwise runs search query
        limit: optional limit if texts is None

    Returns:
        embeddings explain results or None when there is no embeddings index
    """

    if not self.embeddings:
        return None

    with self.lock:
        return self.embeddings.explain(query, texts, limit)
649
+
650
def batchexplain(self, queries, texts=None, limit=10):
    """
    Runs the embeddings batchexplain method for a list of queries.

    Args:
        queries: queries text
        texts: optional list of text, otherwise runs search queries
        limit: optional limit if texts is None

    Returns:
        embeddings batchexplain results or None when there is no embeddings index
    """

    if not self.embeddings:
        return None

    with self.lock:
        return self.embeddings.batchexplain(queries, texts, limit)
668
+
669
def transform(self, text, category=None, index=None):
    """
    Transforms text into an embeddings array, returned as a list of Python floats.

    Args:
        text: input text
        category: category for instruction-based embeddings
        index: index name, if applicable

    Returns:
        embeddings array as a list of floats or None when there is no embeddings index
    """

    if not self.embeddings:
        return None

    vector = self.embeddings.transform(text, category, index)
    return list(map(float, vector))
686
+
687
def batchtransform(self, texts, category=None, index=None):
    """
    Transforms a list of text into embeddings arrays, each returned as a list of Python floats.

    Args:
        texts: list of text
        category: category for instruction-based embeddings
        index: index name, if applicable

    Returns:
        embeddings arrays as lists of floats or None when there is no embeddings index
    """

    if not self.embeddings:
        return None

    vectors = self.embeddings.batchtransform(texts, category, index)
    return [list(map(float, vector)) for vector in vectors]
704
+
705
def extract(self, queue, texts=None):
    """
    Runs the extractor pipeline for a list of questions.

    Args:
        queue: list of {name: value, query: value, question: value, snippet: value}
        texts: optional list of text

    Returns:
        extractor pipeline results or None when there is no embeddings index or no extractor pipeline
    """

    if not (self.embeddings and "extractor" in self.pipelines):
        return None

    extractor = self.pipelines["extractor"]
    return extractor(queue, texts)
725
+
726
def label(self, text, labels):
    """
    Runs the labels pipeline for text and a list of labels and converts the (id, score)
    results to dicts.

    Args:
        text: text|list
        labels: list of labels

    Returns:
        list of {id: value, score: value} for a string input, list of those lists for any other input,
        None when there is no labels pipeline
    """

    if "labels" not in self.pipelines:
        return None

    def rows(scores):
        return [{"id": uid, "score": float(score)} for uid, score in scores]

    outputs = self.pipelines["labels"](text, labels)

    # Single string gets a flat list, otherwise one list per text element
    if isinstance(text, str):
        return rows(outputs)

    return [rows(scores) for scores in outputs]
748
+
749
def pipeline(self, name, *args, **kwargs):
    """
    Runs a pipeline by name.

    Args:
        name: pipeline name
        args: pipeline positional arguments
        kwargs: pipeline keyword arguments

    Returns:
        pipeline results or None when there is no pipeline with this name
    """

    # Backwards compatible with previous pipeline function arguments: a single tuple is
    # expanded into the positional arguments
    if len(args) == 1 and isinstance(args[0], tuple):
        args = args[0]

    if name not in self.pipelines:
        return None

    return self.pipelines[name](*args, **kwargs)
769
+
770
def workflow(self, name, elements):
    """
    Runs a workflow by name. List elements are converted to tuples before the workflow is called.

    Args:
        name: workflow name
        elements: elements to process

    Returns:
        processed elements
    """

    # Lazily convert list elements to tuples
    converted = (tuple(element) if isinstance(element, list) else element for element in elements)

    # Sized, indexable input is passed on as a list. Anything else stays a generator.
    if hasattr(elements, "__len__") and hasattr(elements, "__getitem__"):
        converted = list(converted)

    return self.workflows[name](converted)
791
+
792
def agent(self, name, *args, **kwargs):
    """
    Runs an agent by name.

    Args:
        name: agent name
        args: agent positional arguments
        kwargs: agent keyword arguments

    Returns:
        agent results or None when there is no agent with this name
    """

    if name not in self.agents:
        return None

    return self.agents[name](*args, **kwargs)
806
+
807
def wait(self):
    """
    Closes the thread pool, if one exists, and blocks until it completes.
    """

    pool = self.pool
    if not pool:
        return

    pool.close()
    pool.join()
    self.pool = None
816
+
817
+
818
class ReadOnlyError(Exception):
    """
    Raised by Application write operations (add, index, upsert, delete, reindex)
    when the "writable" configuration setting is not enabled.
    """