mseep_txtai-9.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251)
  1. mseep_txtai-9.1.1.dist-info/METADATA +262 -0
  2. mseep_txtai-9.1.1.dist-info/RECORD +251 -0
  3. mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
  4. mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
  5. mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
  6. txtai/__init__.py +16 -0
  7. txtai/agent/__init__.py +12 -0
  8. txtai/agent/base.py +54 -0
  9. txtai/agent/factory.py +39 -0
  10. txtai/agent/model.py +107 -0
  11. txtai/agent/placeholder.py +16 -0
  12. txtai/agent/tool/__init__.py +7 -0
  13. txtai/agent/tool/embeddings.py +69 -0
  14. txtai/agent/tool/factory.py +130 -0
  15. txtai/agent/tool/function.py +49 -0
  16. txtai/ann/__init__.py +7 -0
  17. txtai/ann/base.py +153 -0
  18. txtai/ann/dense/__init__.py +11 -0
  19. txtai/ann/dense/annoy.py +72 -0
  20. txtai/ann/dense/factory.py +76 -0
  21. txtai/ann/dense/faiss.py +233 -0
  22. txtai/ann/dense/hnsw.py +104 -0
  23. txtai/ann/dense/numpy.py +164 -0
  24. txtai/ann/dense/pgvector.py +323 -0
  25. txtai/ann/dense/sqlite.py +303 -0
  26. txtai/ann/dense/torch.py +38 -0
  27. txtai/ann/sparse/__init__.py +7 -0
  28. txtai/ann/sparse/factory.py +61 -0
  29. txtai/ann/sparse/ivfsparse.py +377 -0
  30. txtai/ann/sparse/pgsparse.py +56 -0
  31. txtai/api/__init__.py +18 -0
  32. txtai/api/application.py +134 -0
  33. txtai/api/authorization.py +53 -0
  34. txtai/api/base.py +159 -0
  35. txtai/api/cluster.py +295 -0
  36. txtai/api/extension.py +19 -0
  37. txtai/api/factory.py +40 -0
  38. txtai/api/responses/__init__.py +7 -0
  39. txtai/api/responses/factory.py +30 -0
  40. txtai/api/responses/json.py +56 -0
  41. txtai/api/responses/messagepack.py +51 -0
  42. txtai/api/route.py +41 -0
  43. txtai/api/routers/__init__.py +25 -0
  44. txtai/api/routers/agent.py +38 -0
  45. txtai/api/routers/caption.py +42 -0
  46. txtai/api/routers/embeddings.py +280 -0
  47. txtai/api/routers/entity.py +42 -0
  48. txtai/api/routers/extractor.py +28 -0
  49. txtai/api/routers/labels.py +47 -0
  50. txtai/api/routers/llm.py +61 -0
  51. txtai/api/routers/objects.py +42 -0
  52. txtai/api/routers/openai.py +191 -0
  53. txtai/api/routers/rag.py +61 -0
  54. txtai/api/routers/reranker.py +46 -0
  55. txtai/api/routers/segmentation.py +42 -0
  56. txtai/api/routers/similarity.py +48 -0
  57. txtai/api/routers/summary.py +46 -0
  58. txtai/api/routers/tabular.py +42 -0
  59. txtai/api/routers/textractor.py +42 -0
  60. txtai/api/routers/texttospeech.py +33 -0
  61. txtai/api/routers/transcription.py +42 -0
  62. txtai/api/routers/translation.py +46 -0
  63. txtai/api/routers/upload.py +36 -0
  64. txtai/api/routers/workflow.py +28 -0
  65. txtai/app/__init__.py +5 -0
  66. txtai/app/base.py +821 -0
  67. txtai/archive/__init__.py +9 -0
  68. txtai/archive/base.py +104 -0
  69. txtai/archive/compress.py +51 -0
  70. txtai/archive/factory.py +25 -0
  71. txtai/archive/tar.py +49 -0
  72. txtai/archive/zip.py +35 -0
  73. txtai/cloud/__init__.py +8 -0
  74. txtai/cloud/base.py +106 -0
  75. txtai/cloud/factory.py +70 -0
  76. txtai/cloud/hub.py +101 -0
  77. txtai/cloud/storage.py +125 -0
  78. txtai/console/__init__.py +5 -0
  79. txtai/console/__main__.py +22 -0
  80. txtai/console/base.py +264 -0
  81. txtai/data/__init__.py +10 -0
  82. txtai/data/base.py +138 -0
  83. txtai/data/labels.py +42 -0
  84. txtai/data/questions.py +135 -0
  85. txtai/data/sequences.py +48 -0
  86. txtai/data/texts.py +68 -0
  87. txtai/data/tokens.py +28 -0
  88. txtai/database/__init__.py +14 -0
  89. txtai/database/base.py +342 -0
  90. txtai/database/client.py +227 -0
  91. txtai/database/duckdb.py +150 -0
  92. txtai/database/embedded.py +76 -0
  93. txtai/database/encoder/__init__.py +8 -0
  94. txtai/database/encoder/base.py +37 -0
  95. txtai/database/encoder/factory.py +56 -0
  96. txtai/database/encoder/image.py +43 -0
  97. txtai/database/encoder/serialize.py +28 -0
  98. txtai/database/factory.py +77 -0
  99. txtai/database/rdbms.py +569 -0
  100. txtai/database/schema/__init__.py +6 -0
  101. txtai/database/schema/orm.py +99 -0
  102. txtai/database/schema/statement.py +98 -0
  103. txtai/database/sql/__init__.py +8 -0
  104. txtai/database/sql/aggregate.py +178 -0
  105. txtai/database/sql/base.py +189 -0
  106. txtai/database/sql/expression.py +404 -0
  107. txtai/database/sql/token.py +342 -0
  108. txtai/database/sqlite.py +57 -0
  109. txtai/embeddings/__init__.py +7 -0
  110. txtai/embeddings/base.py +1107 -0
  111. txtai/embeddings/index/__init__.py +14 -0
  112. txtai/embeddings/index/action.py +15 -0
  113. txtai/embeddings/index/autoid.py +92 -0
  114. txtai/embeddings/index/configuration.py +71 -0
  115. txtai/embeddings/index/documents.py +86 -0
  116. txtai/embeddings/index/functions.py +155 -0
  117. txtai/embeddings/index/indexes.py +199 -0
  118. txtai/embeddings/index/indexids.py +60 -0
  119. txtai/embeddings/index/reducer.py +104 -0
  120. txtai/embeddings/index/stream.py +67 -0
  121. txtai/embeddings/index/transform.py +205 -0
  122. txtai/embeddings/search/__init__.py +11 -0
  123. txtai/embeddings/search/base.py +344 -0
  124. txtai/embeddings/search/errors.py +9 -0
  125. txtai/embeddings/search/explain.py +120 -0
  126. txtai/embeddings/search/ids.py +61 -0
  127. txtai/embeddings/search/query.py +69 -0
  128. txtai/embeddings/search/scan.py +196 -0
  129. txtai/embeddings/search/terms.py +46 -0
  130. txtai/graph/__init__.py +10 -0
  131. txtai/graph/base.py +769 -0
  132. txtai/graph/factory.py +61 -0
  133. txtai/graph/networkx.py +275 -0
  134. txtai/graph/query.py +181 -0
  135. txtai/graph/rdbms.py +113 -0
  136. txtai/graph/topics.py +166 -0
  137. txtai/models/__init__.py +9 -0
  138. txtai/models/models.py +268 -0
  139. txtai/models/onnx.py +133 -0
  140. txtai/models/pooling/__init__.py +9 -0
  141. txtai/models/pooling/base.py +141 -0
  142. txtai/models/pooling/cls.py +28 -0
  143. txtai/models/pooling/factory.py +144 -0
  144. txtai/models/pooling/late.py +173 -0
  145. txtai/models/pooling/mean.py +33 -0
  146. txtai/models/pooling/muvera.py +164 -0
  147. txtai/models/registry.py +37 -0
  148. txtai/models/tokendetection.py +122 -0
  149. txtai/pipeline/__init__.py +17 -0
  150. txtai/pipeline/audio/__init__.py +11 -0
  151. txtai/pipeline/audio/audiomixer.py +58 -0
  152. txtai/pipeline/audio/audiostream.py +94 -0
  153. txtai/pipeline/audio/microphone.py +244 -0
  154. txtai/pipeline/audio/signal.py +186 -0
  155. txtai/pipeline/audio/texttoaudio.py +60 -0
  156. txtai/pipeline/audio/texttospeech.py +553 -0
  157. txtai/pipeline/audio/transcription.py +212 -0
  158. txtai/pipeline/base.py +23 -0
  159. txtai/pipeline/data/__init__.py +10 -0
  160. txtai/pipeline/data/filetohtml.py +206 -0
  161. txtai/pipeline/data/htmltomd.py +414 -0
  162. txtai/pipeline/data/segmentation.py +178 -0
  163. txtai/pipeline/data/tabular.py +155 -0
  164. txtai/pipeline/data/textractor.py +139 -0
  165. txtai/pipeline/data/tokenizer.py +112 -0
  166. txtai/pipeline/factory.py +77 -0
  167. txtai/pipeline/hfmodel.py +111 -0
  168. txtai/pipeline/hfpipeline.py +96 -0
  169. txtai/pipeline/image/__init__.py +7 -0
  170. txtai/pipeline/image/caption.py +55 -0
  171. txtai/pipeline/image/imagehash.py +90 -0
  172. txtai/pipeline/image/objects.py +80 -0
  173. txtai/pipeline/llm/__init__.py +11 -0
  174. txtai/pipeline/llm/factory.py +86 -0
  175. txtai/pipeline/llm/generation.py +173 -0
  176. txtai/pipeline/llm/huggingface.py +218 -0
  177. txtai/pipeline/llm/litellm.py +90 -0
  178. txtai/pipeline/llm/llama.py +152 -0
  179. txtai/pipeline/llm/llm.py +75 -0
  180. txtai/pipeline/llm/rag.py +477 -0
  181. txtai/pipeline/nop.py +14 -0
  182. txtai/pipeline/tensors.py +52 -0
  183. txtai/pipeline/text/__init__.py +13 -0
  184. txtai/pipeline/text/crossencoder.py +70 -0
  185. txtai/pipeline/text/entity.py +140 -0
  186. txtai/pipeline/text/labels.py +137 -0
  187. txtai/pipeline/text/lateencoder.py +103 -0
  188. txtai/pipeline/text/questions.py +48 -0
  189. txtai/pipeline/text/reranker.py +57 -0
  190. txtai/pipeline/text/similarity.py +83 -0
  191. txtai/pipeline/text/summary.py +98 -0
  192. txtai/pipeline/text/translation.py +298 -0
  193. txtai/pipeline/train/__init__.py +7 -0
  194. txtai/pipeline/train/hfonnx.py +196 -0
  195. txtai/pipeline/train/hftrainer.py +398 -0
  196. txtai/pipeline/train/mlonnx.py +63 -0
  197. txtai/scoring/__init__.py +12 -0
  198. txtai/scoring/base.py +188 -0
  199. txtai/scoring/bm25.py +29 -0
  200. txtai/scoring/factory.py +95 -0
  201. txtai/scoring/pgtext.py +181 -0
  202. txtai/scoring/sif.py +32 -0
  203. txtai/scoring/sparse.py +218 -0
  204. txtai/scoring/terms.py +499 -0
  205. txtai/scoring/tfidf.py +358 -0
  206. txtai/serialize/__init__.py +10 -0
  207. txtai/serialize/base.py +85 -0
  208. txtai/serialize/errors.py +9 -0
  209. txtai/serialize/factory.py +29 -0
  210. txtai/serialize/messagepack.py +42 -0
  211. txtai/serialize/pickle.py +98 -0
  212. txtai/serialize/serializer.py +46 -0
  213. txtai/util/__init__.py +7 -0
  214. txtai/util/resolver.py +32 -0
  215. txtai/util/sparsearray.py +62 -0
  216. txtai/util/template.py +16 -0
  217. txtai/vectors/__init__.py +8 -0
  218. txtai/vectors/base.py +476 -0
  219. txtai/vectors/dense/__init__.py +12 -0
  220. txtai/vectors/dense/external.py +55 -0
  221. txtai/vectors/dense/factory.py +121 -0
  222. txtai/vectors/dense/huggingface.py +44 -0
  223. txtai/vectors/dense/litellm.py +86 -0
  224. txtai/vectors/dense/llama.py +84 -0
  225. txtai/vectors/dense/m2v.py +67 -0
  226. txtai/vectors/dense/sbert.py +92 -0
  227. txtai/vectors/dense/words.py +211 -0
  228. txtai/vectors/recovery.py +57 -0
  229. txtai/vectors/sparse/__init__.py +7 -0
  230. txtai/vectors/sparse/base.py +90 -0
  231. txtai/vectors/sparse/factory.py +55 -0
  232. txtai/vectors/sparse/sbert.py +34 -0
  233. txtai/version.py +6 -0
  234. txtai/workflow/__init__.py +8 -0
  235. txtai/workflow/base.py +184 -0
  236. txtai/workflow/execute.py +99 -0
  237. txtai/workflow/factory.py +42 -0
  238. txtai/workflow/task/__init__.py +18 -0
  239. txtai/workflow/task/base.py +490 -0
  240. txtai/workflow/task/console.py +24 -0
  241. txtai/workflow/task/export.py +64 -0
  242. txtai/workflow/task/factory.py +89 -0
  243. txtai/workflow/task/file.py +28 -0
  244. txtai/workflow/task/image.py +36 -0
  245. txtai/workflow/task/retrieve.py +61 -0
  246. txtai/workflow/task/service.py +102 -0
  247. txtai/workflow/task/storage.py +110 -0
  248. txtai/workflow/task/stream.py +33 -0
  249. txtai/workflow/task/template.py +116 -0
  250. txtai/workflow/task/url.py +20 -0
  251. txtai/workflow/task/workflow.py +14 -0
txtai/embeddings/index/reducer.py
@@ -0,0 +1,104 @@
+"""
+Reducer module
+"""
+
+from zipfile import BadZipFile
+
+# Conditionally import dimensionality reduction libraries as they aren't installed by default
+try:
+    import skops.io as sio
+
+    from sklearn.decomposition import TruncatedSVD
+
+    REDUCER = True
+except ImportError:
+    REDUCER = False
+
+from ...serialize import SerializeFactory
+
+
+class Reducer:
+    """
+    LSA dimensionality reduction model
+    """
+
+    def __init__(self, embeddings=None, components=None):
+        """
+        Creates a dimensionality reduction model.
+
+        Args:
+            embeddings: input embeddings matrix
+            components: number of model components
+        """
+
+        if not REDUCER:
+            raise ImportError('Dimensionality reduction is not available - install "vectors" extra to enable')
+
+        self.model = self.build(embeddings, components) if embeddings is not None and components else None
+
+    def __call__(self, embeddings):
+        """
+        Applies a dimensionality reduction model to embeddings, removing the top n principal components. Operation applied
+        directly on array.
+
+        Args:
+            embeddings: input embeddings matrix
+        """
+
+        pc = self.model.components_
+        factor = embeddings.dot(pc.transpose())
+
+        # Apply LSA model
+        # Calculation is different if n_components = 1
+        if pc.shape[0] == 1:
+            embeddings -= factor * pc
+        elif len(embeddings.shape) > 1:
+            # Apply model on a row-wise basis to limit memory usage
+            for x in range(embeddings.shape[0]):
+                embeddings[x] -= factor[x].dot(pc)
+        else:
+            # Single embedding
+            embeddings -= factor.dot(pc)
+
+    def build(self, embeddings, components):
+        """
+        Builds an LSA model. This model is used to remove the principal component within embeddings. This helps to
+        smooth out noisy embeddings (common words with less value).
+
+        Args:
+            embeddings: input embeddings matrix
+            components: number of model components
+
+        Returns:
+            LSA model
+        """
+
+        model = TruncatedSVD(n_components=components, random_state=0)
+        model.fit(embeddings)
+
+        return model
+
+    def load(self, path):
+        """
+        Loads a Reducer object from path.
+
+        Args:
+            path: directory path to load model
+        """
+
+        # Dimensionality reduction
+        try:
+            self.model = sio.load(path)
+        except (BadZipFile, KeyError):
+            # Backwards compatible support for pickled models
+            self.model = SerializeFactory.create("pickle").load(path)
+
+    def save(self, path):
+        """
+        Saves a Reducer object to path.
+
+        Args:
+            path: directory path to save model
+        """
+
+        sio.dump(self.model, path)
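
For context, a minimal usage sketch of the Reducer above (hypothetical random data; assumes the optional skops and scikit-learn dependencies from the "vectors" extra are installed):

import numpy as np

from txtai.embeddings.index import Reducer

# Hypothetical input: 100 random 10-dimensional embeddings
embeddings = np.random.rand(100, 10).astype(np.float32)

# Fit an LSA model with one component, then remove that principal component in place
reducer = Reducer(embeddings, components=1)
reducer(embeddings)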
txtai/embeddings/index/stream.py
@@ -0,0 +1,67 @@
+"""
+Stream module
+"""
+
+from .autoid import AutoId
+from .transform import Action
+
+
+class Stream:
+    """
+    Yields input documents as standard (id, data, tags) tuples.
+    """
+
+    def __init__(self, embeddings, action=None):
+        """
+        Creates a new stream.
+
+        Args:
+            embeddings: embeddings instance
+            action: optional index action
+        """
+
+        self.embeddings = embeddings
+        self.action = action
+
+        # Alias embeddings attributes
+        self.config = embeddings.config
+
+        # Get config parameters
+        self.offset = self.config.get("offset", 0) if action == Action.UPSERT else 0
+        autoid = self.config.get("autoid", self.offset)
+
+        # Create autoid generator, reset int sequence if this isn't an UPSERT
+        autoid = 0 if isinstance(autoid, int) and action != Action.UPSERT else autoid
+        self.autoid = AutoId(autoid)
+
+    def __call__(self, documents):
+        """
+        Yields (id, data, tags) tuples from a stream of documents.
+
+        Args:
+            documents: input documents
+        """
+
+        # Iterate over documents and yield standard (id, data, tags) tuples
+        for document in documents:
+            if isinstance(document, dict):
+                # Create (id, data, tags) tuple from dictionary
+                document = document.get("id"), document, document.get("tags")
+            elif isinstance(document, tuple):
+                # Create (id, data, tags) tuple
+                document = document if len(document) >= 3 else (document[0], document[1], None)
+            else:
+                # Create (id, data, tags) tuple with empty fields
+                document = None, document, None
+
+            # Set autoid if the action is set
+            if self.action and document[0] is None:
+                document = (self.autoid(document[1]), document[1], document[2])
+
+            # Yield (id, data, tags) tuple
+            yield document
+
+        # Save autoid sequence if used
+        current = self.autoid.current()
+        if self.action and current:
+            self.config["autoid"] = current
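
A short sketch of the normalization behavior (assumes a bare Embeddings instance is sufficient here; Stream is normally driven internally during indexing):

from txtai.embeddings import Embeddings
from txtai.embeddings.index import Action, Stream

stream = Stream(Embeddings(), Action.INDEX)

# Strings, tuples and dicts are all normalized to (id, data, tags) tuples;
# missing ids are filled in by the AutoId sequence
documents = ["plain text", ("id1", "tuple text"), {"id": "id2", "text": "dict text", "tags": "a"}]
for uid, data, tags in stream(documents):
    print(uid, data, tags)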
txtai/embeddings/index/transform.py
@@ -0,0 +1,205 @@
+"""
+Transform module
+"""
+
+import numpy as np
+
+from .action import Action
+
+
+class Transform:
+    """
+    Executes a transform. Processes a stream of documents, loads batches into enabled data stores and vectorizes documents.
+    """
+
+    def __init__(self, embeddings, action, checkpoint=None):
+        """
+        Creates a new transform.
+
+        Args:
+            embeddings: embeddings instance
+            action: index action
+            checkpoint: optional checkpoint directory, enables indexing restart
+        """
+
+        self.embeddings = embeddings
+        self.action = action
+        self.checkpoint = checkpoint
+
+        # Alias embeddings attributes
+        self.config = embeddings.config
+        self.delete = embeddings.delete
+        self.model = embeddings.model
+        self.database = embeddings.database
+        self.graph = embeddings.graph
+        self.indexes = embeddings.indexes
+        self.scoring = embeddings.scoring if embeddings.issparse() else None
+
+        # Get config parameters
+        self.offset = embeddings.config.get("offset", 0) if action == Action.UPSERT else 0
+        self.batch = embeddings.config.get("batch", 1024)
+
+        # Scalar quantization
+        quantize = embeddings.config.get("quantize")
+        self.qbits = quantize if isinstance(quantize, int) and not isinstance(quantize, bool) else None
+
+        # Transform columns
+        columns = embeddings.config.get("columns", {})
+        self.text = columns.get("text", "text")
+        self.object = columns.get("object", "object")
+
+        # Check if top-level indexing is enabled for this embeddings
+        self.indexing = embeddings.model or embeddings.scoring
+
+        # List of deleted ids with this action
+        self.deletes = set()
+
+    def __call__(self, documents, buffer):
+        """
+        Processes an iterable collection of documents, handles any iterable including generators.
+
+        This method loads a stream of documents into enabled data stores and vectorizes documents into an embeddings array.
+
+        Args:
+            documents: iterable of (id, data, tags)
+            buffer: file path used for memmap buffer
+
+        Returns:
+            (document ids, dimensions, embeddings)
+        """
+
+        # Return parameters
+        ids, dimensions, embeddings = None, None, None
+
+        if self.model:
+            ids, dimensions, embeddings = self.vectors(documents, buffer)
+        else:
+            ids = self.ids(documents)
+
+        return (ids, dimensions, embeddings)
+
+    def vectors(self, documents, buffer):
+        """
+        Runs a vectors transform operation when dense indexing is enabled.
+
+        Args:
+            documents: iterable of (id, data, tags)
+            buffer: file path used for memmap buffer
+
+        Returns:
+            (document ids, dimensions, embeddings)
+        """
+
+        # Determine dtype
+        dtype = np.uint8 if self.qbits else np.float32
+
+        # Transform documents into vectors
+        return self.model.vectors(self.stream(documents), self.batch, self.checkpoint, buffer, dtype)
+
+    def ids(self, documents):
+        """
+        Runs an ids transform operation when dense indexing is disabled.
+
+        Args:
+            documents: iterable of (id, data, tags)
+
+        Returns:
+            document ids
+        """
+
+        # Consume stream and extract ids
+        ids = []
+        for uid, _, _ in self.stream(documents):
+            ids.append(uid)
+
+        # Save offset when dense indexing is disabled
+        self.config["offset"] = self.offset
+
+        return ids
+
+    def stream(self, documents):
+        """
+        This method does two things:
+
+        1. Filter and yield data to vectorize
+        2. Batch and load original documents into enabled data stores (database, graph, scoring)
+
+        Documents are yielded for vectorization if one of the following is True:
+            - dict with a text or object field
+            - not a dict
+
+        Otherwise, documents are only batched and inserted into data stores.
+
+        Args:
+            documents: iterable collection (id, data, tags)
+        """
+
+        # Batch and index offset. Index offset increments by count of documents streamed for vectorization
+        batch, offset = [], 0
+
+        # Iterate and process documents stream
+        for document in documents:
+            if isinstance(document[1], dict):
+                # Set text field to uid when top-level indexing is disabled and text empty
+                if not self.indexing and not document[1].get(self.text):
+                    document[1][self.text] = str(document[0])
+
+                if self.text in document[1]:
+                    yield (document[0], document[1][self.text], document[2])
+                    offset += 1
+                elif self.object in document[1]:
+                    yield (document[0], document[1][self.object], document[2])
+                    offset += 1
+            else:
+                yield document
+                offset += 1
+
+            # Batch document
+            batch.append(document)
+            if len(batch) == self.batch:
+                self.load(batch, offset)
+                batch, offset = [], 0
+
+        # Final batch
+        if batch:
+            self.load(batch, offset)
+
+    def load(self, batch, offset):
+        """
+        Loads a document batch. This method deletes existing ids from an embeddings index and
+        loads into enabled data stores (database, graph, scoring).
+
+        Args:
+            batch: list of (id, data, tags)
+            offset: index offset for batch
+        """
+
+        # Delete from embeddings index first (which deletes from underlying indexes and datastores) if this is an upsert
+        if self.action == Action.UPSERT:
+            # Get list of ids not yet seen and deleted
+            deletes = [uid for uid, _, _ in batch if uid not in self.deletes]
+            if deletes:
+                # Execute delete
+                self.delete(deletes)
+
+                # Save deleted ids as a delete must only occur once per action
+                self.deletes.update(deletes)
+
+        # Load batch into database except if this is a reindex
+        if self.database and self.action != Action.REINDEX:
+            self.database.insert(batch, self.offset)
+
+        # Load batch into scoring
+        if self.scoring:
+            self.scoring.insert(batch, self.offset, self.checkpoint)
+
+        # Load batch into subindex documents stream
+        if self.indexes:
+            self.indexes.insert(batch, self.offset, self.checkpoint)
+
+        # Load batch into graph
+        if self.graph:
+            self.graph.insert(batch, self.offset)
+
+        # Increment offset
+        self.offset += offset
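
Transform is driven internally by Embeddings.index and Embeddings.upsert; a minimal end-to-end sketch (the model path is an example; assumes sentence-transformers is installed):

from txtai import Embeddings

# content=True enables the document database, so the full dict is stored alongside the vector index
embeddings = Embeddings(path="sentence-transformers/all-MiniLM-L6-v2", content=True)
embeddings.index([{"id": "doc1", "text": "the text field is vectorized", "extra": "other fields are only stored"}])

print(embeddings.search("vectorized", 1))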
txtai/embeddings/search/__init__.py
@@ -0,0 +1,11 @@
+"""
+Search imports
+"""
+
+from .base import Search
+from .errors import *
+from .explain import Explain
+from .ids import Ids
+from .query import Query
+from .scan import Scan
+from .terms import Terms