mseep-txtai 9.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251) hide show
  1. mseep_txtai-9.1.1.dist-info/METADATA +262 -0
  2. mseep_txtai-9.1.1.dist-info/RECORD +251 -0
  3. mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
  4. mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
  5. mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
  6. txtai/__init__.py +16 -0
  7. txtai/agent/__init__.py +12 -0
  8. txtai/agent/base.py +54 -0
  9. txtai/agent/factory.py +39 -0
  10. txtai/agent/model.py +107 -0
  11. txtai/agent/placeholder.py +16 -0
  12. txtai/agent/tool/__init__.py +7 -0
  13. txtai/agent/tool/embeddings.py +69 -0
  14. txtai/agent/tool/factory.py +130 -0
  15. txtai/agent/tool/function.py +49 -0
  16. txtai/ann/__init__.py +7 -0
  17. txtai/ann/base.py +153 -0
  18. txtai/ann/dense/__init__.py +11 -0
  19. txtai/ann/dense/annoy.py +72 -0
  20. txtai/ann/dense/factory.py +76 -0
  21. txtai/ann/dense/faiss.py +233 -0
  22. txtai/ann/dense/hnsw.py +104 -0
  23. txtai/ann/dense/numpy.py +164 -0
  24. txtai/ann/dense/pgvector.py +323 -0
  25. txtai/ann/dense/sqlite.py +303 -0
  26. txtai/ann/dense/torch.py +38 -0
  27. txtai/ann/sparse/__init__.py +7 -0
  28. txtai/ann/sparse/factory.py +61 -0
  29. txtai/ann/sparse/ivfsparse.py +377 -0
  30. txtai/ann/sparse/pgsparse.py +56 -0
  31. txtai/api/__init__.py +18 -0
  32. txtai/api/application.py +134 -0
  33. txtai/api/authorization.py +53 -0
  34. txtai/api/base.py +159 -0
  35. txtai/api/cluster.py +295 -0
  36. txtai/api/extension.py +19 -0
  37. txtai/api/factory.py +40 -0
  38. txtai/api/responses/__init__.py +7 -0
  39. txtai/api/responses/factory.py +30 -0
  40. txtai/api/responses/json.py +56 -0
  41. txtai/api/responses/messagepack.py +51 -0
  42. txtai/api/route.py +41 -0
  43. txtai/api/routers/__init__.py +25 -0
  44. txtai/api/routers/agent.py +38 -0
  45. txtai/api/routers/caption.py +42 -0
  46. txtai/api/routers/embeddings.py +280 -0
  47. txtai/api/routers/entity.py +42 -0
  48. txtai/api/routers/extractor.py +28 -0
  49. txtai/api/routers/labels.py +47 -0
  50. txtai/api/routers/llm.py +61 -0
  51. txtai/api/routers/objects.py +42 -0
  52. txtai/api/routers/openai.py +191 -0
  53. txtai/api/routers/rag.py +61 -0
  54. txtai/api/routers/reranker.py +46 -0
  55. txtai/api/routers/segmentation.py +42 -0
  56. txtai/api/routers/similarity.py +48 -0
  57. txtai/api/routers/summary.py +46 -0
  58. txtai/api/routers/tabular.py +42 -0
  59. txtai/api/routers/textractor.py +42 -0
  60. txtai/api/routers/texttospeech.py +33 -0
  61. txtai/api/routers/transcription.py +42 -0
  62. txtai/api/routers/translation.py +46 -0
  63. txtai/api/routers/upload.py +36 -0
  64. txtai/api/routers/workflow.py +28 -0
  65. txtai/app/__init__.py +5 -0
  66. txtai/app/base.py +821 -0
  67. txtai/archive/__init__.py +9 -0
  68. txtai/archive/base.py +104 -0
  69. txtai/archive/compress.py +51 -0
  70. txtai/archive/factory.py +25 -0
  71. txtai/archive/tar.py +49 -0
  72. txtai/archive/zip.py +35 -0
  73. txtai/cloud/__init__.py +8 -0
  74. txtai/cloud/base.py +106 -0
  75. txtai/cloud/factory.py +70 -0
  76. txtai/cloud/hub.py +101 -0
  77. txtai/cloud/storage.py +125 -0
  78. txtai/console/__init__.py +5 -0
  79. txtai/console/__main__.py +22 -0
  80. txtai/console/base.py +264 -0
  81. txtai/data/__init__.py +10 -0
  82. txtai/data/base.py +138 -0
  83. txtai/data/labels.py +42 -0
  84. txtai/data/questions.py +135 -0
  85. txtai/data/sequences.py +48 -0
  86. txtai/data/texts.py +68 -0
  87. txtai/data/tokens.py +28 -0
  88. txtai/database/__init__.py +14 -0
  89. txtai/database/base.py +342 -0
  90. txtai/database/client.py +227 -0
  91. txtai/database/duckdb.py +150 -0
  92. txtai/database/embedded.py +76 -0
  93. txtai/database/encoder/__init__.py +8 -0
  94. txtai/database/encoder/base.py +37 -0
  95. txtai/database/encoder/factory.py +56 -0
  96. txtai/database/encoder/image.py +43 -0
  97. txtai/database/encoder/serialize.py +28 -0
  98. txtai/database/factory.py +77 -0
  99. txtai/database/rdbms.py +569 -0
  100. txtai/database/schema/__init__.py +6 -0
  101. txtai/database/schema/orm.py +99 -0
  102. txtai/database/schema/statement.py +98 -0
  103. txtai/database/sql/__init__.py +8 -0
  104. txtai/database/sql/aggregate.py +178 -0
  105. txtai/database/sql/base.py +189 -0
  106. txtai/database/sql/expression.py +404 -0
  107. txtai/database/sql/token.py +342 -0
  108. txtai/database/sqlite.py +57 -0
  109. txtai/embeddings/__init__.py +7 -0
  110. txtai/embeddings/base.py +1107 -0
  111. txtai/embeddings/index/__init__.py +14 -0
  112. txtai/embeddings/index/action.py +15 -0
  113. txtai/embeddings/index/autoid.py +92 -0
  114. txtai/embeddings/index/configuration.py +71 -0
  115. txtai/embeddings/index/documents.py +86 -0
  116. txtai/embeddings/index/functions.py +155 -0
  117. txtai/embeddings/index/indexes.py +199 -0
  118. txtai/embeddings/index/indexids.py +60 -0
  119. txtai/embeddings/index/reducer.py +104 -0
  120. txtai/embeddings/index/stream.py +67 -0
  121. txtai/embeddings/index/transform.py +205 -0
  122. txtai/embeddings/search/__init__.py +11 -0
  123. txtai/embeddings/search/base.py +344 -0
  124. txtai/embeddings/search/errors.py +9 -0
  125. txtai/embeddings/search/explain.py +120 -0
  126. txtai/embeddings/search/ids.py +61 -0
  127. txtai/embeddings/search/query.py +69 -0
  128. txtai/embeddings/search/scan.py +196 -0
  129. txtai/embeddings/search/terms.py +46 -0
  130. txtai/graph/__init__.py +10 -0
  131. txtai/graph/base.py +769 -0
  132. txtai/graph/factory.py +61 -0
  133. txtai/graph/networkx.py +275 -0
  134. txtai/graph/query.py +181 -0
  135. txtai/graph/rdbms.py +113 -0
  136. txtai/graph/topics.py +166 -0
  137. txtai/models/__init__.py +9 -0
  138. txtai/models/models.py +268 -0
  139. txtai/models/onnx.py +133 -0
  140. txtai/models/pooling/__init__.py +9 -0
  141. txtai/models/pooling/base.py +141 -0
  142. txtai/models/pooling/cls.py +28 -0
  143. txtai/models/pooling/factory.py +144 -0
  144. txtai/models/pooling/late.py +173 -0
  145. txtai/models/pooling/mean.py +33 -0
  146. txtai/models/pooling/muvera.py +164 -0
  147. txtai/models/registry.py +37 -0
  148. txtai/models/tokendetection.py +122 -0
  149. txtai/pipeline/__init__.py +17 -0
  150. txtai/pipeline/audio/__init__.py +11 -0
  151. txtai/pipeline/audio/audiomixer.py +58 -0
  152. txtai/pipeline/audio/audiostream.py +94 -0
  153. txtai/pipeline/audio/microphone.py +244 -0
  154. txtai/pipeline/audio/signal.py +186 -0
  155. txtai/pipeline/audio/texttoaudio.py +60 -0
  156. txtai/pipeline/audio/texttospeech.py +553 -0
  157. txtai/pipeline/audio/transcription.py +212 -0
  158. txtai/pipeline/base.py +23 -0
  159. txtai/pipeline/data/__init__.py +10 -0
  160. txtai/pipeline/data/filetohtml.py +206 -0
  161. txtai/pipeline/data/htmltomd.py +414 -0
  162. txtai/pipeline/data/segmentation.py +178 -0
  163. txtai/pipeline/data/tabular.py +155 -0
  164. txtai/pipeline/data/textractor.py +139 -0
  165. txtai/pipeline/data/tokenizer.py +112 -0
  166. txtai/pipeline/factory.py +77 -0
  167. txtai/pipeline/hfmodel.py +111 -0
  168. txtai/pipeline/hfpipeline.py +96 -0
  169. txtai/pipeline/image/__init__.py +7 -0
  170. txtai/pipeline/image/caption.py +55 -0
  171. txtai/pipeline/image/imagehash.py +90 -0
  172. txtai/pipeline/image/objects.py +80 -0
  173. txtai/pipeline/llm/__init__.py +11 -0
  174. txtai/pipeline/llm/factory.py +86 -0
  175. txtai/pipeline/llm/generation.py +173 -0
  176. txtai/pipeline/llm/huggingface.py +218 -0
  177. txtai/pipeline/llm/litellm.py +90 -0
  178. txtai/pipeline/llm/llama.py +152 -0
  179. txtai/pipeline/llm/llm.py +75 -0
  180. txtai/pipeline/llm/rag.py +477 -0
  181. txtai/pipeline/nop.py +14 -0
  182. txtai/pipeline/tensors.py +52 -0
  183. txtai/pipeline/text/__init__.py +13 -0
  184. txtai/pipeline/text/crossencoder.py +70 -0
  185. txtai/pipeline/text/entity.py +140 -0
  186. txtai/pipeline/text/labels.py +137 -0
  187. txtai/pipeline/text/lateencoder.py +103 -0
  188. txtai/pipeline/text/questions.py +48 -0
  189. txtai/pipeline/text/reranker.py +57 -0
  190. txtai/pipeline/text/similarity.py +83 -0
  191. txtai/pipeline/text/summary.py +98 -0
  192. txtai/pipeline/text/translation.py +298 -0
  193. txtai/pipeline/train/__init__.py +7 -0
  194. txtai/pipeline/train/hfonnx.py +196 -0
  195. txtai/pipeline/train/hftrainer.py +398 -0
  196. txtai/pipeline/train/mlonnx.py +63 -0
  197. txtai/scoring/__init__.py +12 -0
  198. txtai/scoring/base.py +188 -0
  199. txtai/scoring/bm25.py +29 -0
  200. txtai/scoring/factory.py +95 -0
  201. txtai/scoring/pgtext.py +181 -0
  202. txtai/scoring/sif.py +32 -0
  203. txtai/scoring/sparse.py +218 -0
  204. txtai/scoring/terms.py +499 -0
  205. txtai/scoring/tfidf.py +358 -0
  206. txtai/serialize/__init__.py +10 -0
  207. txtai/serialize/base.py +85 -0
  208. txtai/serialize/errors.py +9 -0
  209. txtai/serialize/factory.py +29 -0
  210. txtai/serialize/messagepack.py +42 -0
  211. txtai/serialize/pickle.py +98 -0
  212. txtai/serialize/serializer.py +46 -0
  213. txtai/util/__init__.py +7 -0
  214. txtai/util/resolver.py +32 -0
  215. txtai/util/sparsearray.py +62 -0
  216. txtai/util/template.py +16 -0
  217. txtai/vectors/__init__.py +8 -0
  218. txtai/vectors/base.py +476 -0
  219. txtai/vectors/dense/__init__.py +12 -0
  220. txtai/vectors/dense/external.py +55 -0
  221. txtai/vectors/dense/factory.py +121 -0
  222. txtai/vectors/dense/huggingface.py +44 -0
  223. txtai/vectors/dense/litellm.py +86 -0
  224. txtai/vectors/dense/llama.py +84 -0
  225. txtai/vectors/dense/m2v.py +67 -0
  226. txtai/vectors/dense/sbert.py +92 -0
  227. txtai/vectors/dense/words.py +211 -0
  228. txtai/vectors/recovery.py +57 -0
  229. txtai/vectors/sparse/__init__.py +7 -0
  230. txtai/vectors/sparse/base.py +90 -0
  231. txtai/vectors/sparse/factory.py +55 -0
  232. txtai/vectors/sparse/sbert.py +34 -0
  233. txtai/version.py +6 -0
  234. txtai/workflow/__init__.py +8 -0
  235. txtai/workflow/base.py +184 -0
  236. txtai/workflow/execute.py +99 -0
  237. txtai/workflow/factory.py +42 -0
  238. txtai/workflow/task/__init__.py +18 -0
  239. txtai/workflow/task/base.py +490 -0
  240. txtai/workflow/task/console.py +24 -0
  241. txtai/workflow/task/export.py +64 -0
  242. txtai/workflow/task/factory.py +89 -0
  243. txtai/workflow/task/file.py +28 -0
  244. txtai/workflow/task/image.py +36 -0
  245. txtai/workflow/task/retrieve.py +61 -0
  246. txtai/workflow/task/service.py +102 -0
  247. txtai/workflow/task/storage.py +110 -0
  248. txtai/workflow/task/stream.py +33 -0
  249. txtai/workflow/task/template.py +116 -0
  250. txtai/workflow/task/url.py +20 -0
  251. txtai/workflow/task/workflow.py +14 -0
@@ -0,0 +1,569 @@
1
+ """
2
+ RDBMS module
3
+ """
4
+
5
+ import datetime
6
+ import json
7
+
8
+ from .base import Database
9
+ from .schema import Statement
10
+
11
+
12
+ # pylint: disable=R0904
13
+ class RDBMS(Database):
14
+ """
15
+ Base relational database class. A relational database uses SQL to insert, update, delete and select from a
16
+ database instance.
17
+ """
18
+
19
def __init__(self, config):
    """
    Creates a new Database.

    The connection and cursor both start as None. They are assigned later by session().

    Args:
        config: database configuration parameters, passed through to the parent constructor
    """

    super().__init__(config)

    # Database connection and cursor, unset until a session is started
    self.connection = None
    self.cursor = None
32
+
33
def load(self, path):
    """
    Loads an existing database by starting a session for path.

    Args:
        path: path passed to session()
    """

    # Load an existing database. Thread locking must be handled externally.
    self.session(path)
36
+
37
def insert(self, documents, index=0):
    """
    Inserts documents, objects and sections.

    Dictionaries are routed through loaddocument() and the returned value becomes the section content.
    Lists are joined with spaces into section text. Any other non-string value is passed to loadobject()
    and its section is stored with no text. Entries that resolve to None do not create a section and do
    not advance the index.

    Args:
        documents: iterable of (uid, document, tags)
        index: index id assigned to the first stored section, incremented for each stored section
    """

    # Make sure a session exists before writing
    self.initialize()

    # One UTC timestamp shared by every row written in this call
    entry = datetime.datetime.now(datetime.timezone.utc)

    for uid, content, tags in documents:
        # Dictionaries are stored as documents, the return value feeds the sections table
        section = self.loaddocument(uid, content, tags, entry) if isinstance(content, dict) else content

        # Nothing to store as a section
        if section is None:
            continue

        if isinstance(section, list):
            # Tokens to text
            section = " ".join(section)
        elif not isinstance(section, str):
            # Hand off to object storage, section text is cleared whether or not the object is saved
            self.loadobject(uid, section, tags, entry)
            section = None

        self.loadsection(index, uid, section, tags, entry)
        index += 1

    # Post processing hook
    self.finalize()
67
+
68
def delete(self, ids):
    """
    Deletes documents, objects and sections for the given ids. No-op when there is no connection.

    Args:
        ids: ids to delete
    """

    if not self.connection:
        return

    # Stage the ids in the batch table
    self.batch(ids=ids)

    # Remove matching rows from each table, in the same order as before
    for statement in (Statement.DELETE_DOCUMENTS, Statement.DELETE_OBJECTS, Statement.DELETE_SECTIONS):
        self.cursor.execute(statement)
77
+
78
def reindex(self, config):
    """
    Generator that rebuilds the sections table under a new configuration and streams rows for reindexing.

    Sections are copied into a working table and streamed back as (uid, data, tags). Once the stream is
    exhausted, the working table replaces the sections table. Yields nothing when there is no connection.

    Args:
        config: new configuration

    Yields:
        (uid, data, tags) where data is the decoded object, the document data or the section text
    """

    if not self.connection:
        return

    # Apply new configuration
    self.configure(config)

    # Text column expression under the new configuration
    column = self.resolve(self.text)

    # Working table for the rebuilt sections
    table = self.reindexstart()

    # Copy rows, then stream them back
    self.cursor.execute(Statement.COPY_SECTIONS % (table, column))
    self.cursor.execute(Statement.STREAM_SECTIONS % table)

    for uid, text, data, obj, tags in self.rows():
        # Object rows: no text, object support enabled and an object present
        if not text and self.encoder and obj:
            yield (uid, self.encoder.decode(obj), tags)
            continue

        # Parse JSON data when it is a non-empty string
        if data and isinstance(data, str):
            data = json.loads(data)

        # Prefer data, fall back to section text
        yield (uid, data if data else text, tags)

    # Swap the working table in as the sections table
    self.cursor.execute(Statement.DROP_SECTIONS)
    self.cursor.execute(Statement.RENAME_SECTIONS % table)

    # Finish reindex operation
    self.reindexend(table)
110
+
111
def save(self, path):
    """
    Commits the current connection, if one is open.

    Args:
        path: not used by this implementation
    """

    if not self.connection:
        return

    self.connection.commit()
114
+
115
def close(self):
    """
    Closes the current connection, if one is open.
    """

    if not self.connection:
        return

    self.connection.close()
119
+
120
def ids(self, ids):
    """
    Runs the SELECT_IDS statement against the given ids.

    Args:
        ids: ids to load into the batch table before querying

    Returns:
        all rows returned by the statement
    """

    # Batch ids and run query
    self.batch(ids=ids)
    self.cursor.execute(Statement.SELECT_IDS)

    # Format and return results
    return self.cursor.fetchall()
127
+
128
def count(self):
    """
    Runs the COUNT_IDS statement.

    Returns:
        first column of the first row returned by the statement
    """

    self.cursor.execute(Statement.COUNT_IDS)
    return self.cursor.fetchone()[0]
131
+
132
def resolve(self, name, alias=None):
    """
    Resolves a query column name, or builds an alias clause when alias is set.

    Args:
        name: column name or expression
        alias: optional alias

    Returns:
        resolved column expression
    """

    # Section columns take an "s." prefix, the second group is used as-is
    prefixed = ("indexid", "id", "tags", "entry")
    unprefixed = ("data", "object", "score", "text")

    if alias:
        # No alias clause when it repeats the name or is a standard section column
        redundant = name == alias or alias in prefixed
        return name if redundant else f'{name} as "{alias}"'

    # Configured expressions take priority
    expressions = self.expressions
    if expressions and name in expressions:
        return expressions[name]

    # Already resolved: json expression or prefixed section column
    if name.startswith(self.jsonprefix()) or name in [f"s.{column}" for column in prefixed]:
        return name

    lowered = name.lower()
    if lowered in prefixed:
        return f"s.{name}"
    if lowered in unprefixed:
        return name

    # Everything else is read from the documents JSON data
    return self.jsoncolumn(name)
164
+
165
def embed(self, similarity, batch):
    """
    Loads one batch of similarity results and returns the ids clause placeholder for it.

    Args:
        similarity: similarity results, indexed by batch, each a list of (indexid, score)
        batch: batch index

    Returns:
        ids clause for this batch
    """

    # Index ids for this batch of similarity results
    indexids = [indexid for indexid, _ in similarity[batch]]
    self.batch(indexids=indexids, batch=batch)

    # Scores for every batch are loaded once, along with the first batch
    if not batch:
        self.scores(similarity)

    return Statement.IDS_CLAUSE % batch
175
+
176
+ # pylint: disable=R0912
177
def query(self, query, limit, parameters, indexids):
    """
    Builds a SQL statement from parsed query components, runs it and maps the results.

    Args:
        query: dictionary of parsed query clauses
        limit: default result limit, used when the query has no limit of its own
        parameters: bind parameters, passed to execute when set
        indexids: when True, selects and returns (indexid, score) pairs

    Returns:
        list of (indexid, score) when indexids is set, otherwise list of result dictionaries
    """

    # Query components
    select = query.get("select", self.defaults())
    where, groupby, having = query.get("where"), query.get("groupby"), query.get("having")
    orderby, qlimit, offset = query.get("orderby"), query.get("limit"), query.get("offset")
    similarity = query.get("similar")

    # Only "indexid, score" is selected for index id queries
    if indexids:
        select = f"{self.resolve('indexid')}, {self.resolve('score')}"

    # Assemble statement text, clauses are appended only when present
    parts = [Statement.TABLE_CLAUSE % select]
    for keyword, clause in (("WHERE", where), ("GROUP BY", groupby), ("HAVING", having), ("ORDER BY", orderby)):
        if clause is not None:
            parts.append(f" {keyword} {clause}")

    # Fall back to score ordering when similarity scores exist and no order was given
    if similarity and orderby is None:
        parts.append(" ORDER BY score DESC")

    # Query limit wins over the default limit
    if qlimit is not None or limit:
        parts.append(f" LIMIT {qlimit if qlimit else limit}")

    if offset is not None:
        parts.append(f" OFFSET {offset}")

    sql = "".join(parts)

    # No similar clauses, clear any previously loaded scores
    if not similarity:
        self.scores(None)

    # User queries go through execute, which has the common user query handling logic
    args = (sql, parameters) if parameters else (sql,)
    self.execute(self.cursor.execute, *args)

    # Column names for the executed query
    columns = [description[0] for description in self.cursor.description]

    results = []
    for row in self.rows():
        result = {}

        # With duplicate column names, keep the first one that has a value
        for position, column in enumerate(columns):
            if result.get(column) is None:
                value = row[position]

                # Object column is decoded when object support is enabled
                result[column] = self.encoder.decode(value) if self.encoder and column == self.object else value

        results.append(result)

    # Transform results, if necessary
    if indexids:
        return [(result["indexid"], result["score"]) for result in results]

    return results
241
+
242
def initialize(self):
    """
    Starts a session and creates the initial table schema, unless a connection already exists.
    """

    # Already connected, nothing to do
    if self.connection:
        return

    # New database session. Thread locking must be handled externally.
    self.session()

    # Initial table schema
    self.createtables()
253
+
254
def session(self, path=None, connection=None):
    """
    Starts a new database session.

    An existing connection takes priority. Otherwise a connection is opened for path when path is set,
    or with no arguments when it is not.

    Args:
        path: path to database file
        connection: existing connection to use
    """

    # Pick the connection source
    if connection:
        self.connection = connection
    elif path:
        self.connection = self.connect(path)
    else:
        self.connection = self.connect()

    self.cursor = self.getcursor()

    # Session scoped setup: custom functions, then temporary tables
    self.addfunctions()
    self.createbatch()
    self.createscores()
273
+
274
def createtables(self):
    """
    Creates the initial table schema.

    Runs the documents, objects and sections create statements, then the sections index statement,
    on the current cursor. The sections statement is a template filled in with the name "sections".
    """

    self.cursor.execute(Statement.CREATE_DOCUMENTS)
    self.cursor.execute(Statement.CREATE_OBJECTS)
    self.cursor.execute(Statement.CREATE_SECTIONS % "sections")
    self.cursor.execute(Statement.CREATE_SECTIONS_INDEX)
283
+
284
def finalize(self):
    """
    Post processing logic run after inserting a batch of documents. This default implementation does nothing.
    """
288
+
289
def loaddocument(self, uid, document, tags, entry):
    """
    Splits a document dictionary into its stored parts and returns the value to use for the section.

    The object field is removed and the remaining fields, if any, are inserted as JSON. When the document
    has both text and an object, the object is also stored here since the caller only receives the text.

    Args:
        uid: unique id
        document: input document dictionary, not modified
        tags: document tags
        entry: generated entry date

    Returns:
        section value - text when present, otherwise the object (None when neither is present)
    """

    # Work on a copy, the caller's dictionary is left untouched
    fields = document.copy()

    # Pull the object field out of the fields stored as JSON
    obj = fields.pop(self.object, None)

    # Remaining fields are stored as a JSON document
    if fields:
        self.insertdocument(uid, json.dumps(fields, allow_nan=False), tags, entry)

    # No text, the object (if any) becomes the section value
    if self.text not in fields:
        return obj

    # Text wins as the section value, so the object has to be stored here
    if obj:
        self.loadobject(uid, obj, tags, entry)

    return fields[self.text]
319
+
320
def insertdocument(self, uid, data, tags, entry):
    """
    Inserts a document by running the INSERT_DOCUMENT statement.

    Args:
        uid: unique id
        data: document data
        tags: document tags
        entry: generated entry date
    """

    # Values are bound in this order: uid, data, tags, entry
    self.cursor.execute(Statement.INSERT_DOCUMENT, [uid, data, tags, entry])
332
+
333
def loadobject(self, uid, obj, tags, entry):
    """
    Applies pre-processing logic and inserts an object. No-op when no encoder is set.

    Args:
        uid: unique id
        obj: input object
        tags: object tags
        entry: generated entry date
    """

    # Objects are only stored when object support is enabled
    if not self.encoder:
        return

    encoded = self.encoder.encode(obj)
    self.insertobject(uid, encoded, tags, entry)
347
+
348
def insertobject(self, uid, data, tags, entry):
    """
    Inserts an object by running the INSERT_OBJECT statement.

    Args:
        uid: unique id
        data: encoded data
        tags: object tags
        entry: generated entry date
    """

    # Values are bound in this order: uid, data, tags, entry
    self.cursor.execute(Statement.INSERT_OBJECT, [uid, data, tags, entry])
360
+
361
def loadsection(self, index, uid, text, tags, entry):
    """
    Applies pre-processing logic and inserts a section. This implementation passes all arguments
    unchanged to insertsection.

    Args:
        index: index id
        uid: unique id
        text: section text
        tags: section tags
        entry: generated entry date
    """

    self.insertsection(index, uid, text, tags, entry)
374
+
375
def insertsection(self, index, uid, text, tags, entry):
    """
    Inserts a section by running the INSERT_SECTION statement.

    Args:
        index: index id
        uid: unique id
        text: section text
        tags: section tags
        entry: generated entry date
    """

    # Save text section. Values are bound in this order: index, uid, text, tags, entry
    self.cursor.execute(Statement.INSERT_SECTION, [index, uid, text, tags, entry])
389
+
390
def reindexstart(self):
    """
    Starts a reindex operation by creating the working sections table.

    Returns:
        temporary working table name
    """

    # Working table name
    name = "rebuild"

    # Create new table to hold reordered sections, using the same template as the sections table
    self.cursor.execute(Statement.CREATE_SECTIONS % name)

    return name
405
+
406
+ # pylint: disable=W0613
407
def reindexend(self, name):
    """
    Ends a reindex operation by running the sections index create statement.

    Args:
        name: working table name, not used by this implementation
    """

    self.cursor.execute(Statement.CREATE_SECTIONS_INDEX)
416
+
417
def batch(self, indexids=None, ids=None, batch=None):
    """
    Loads ids to a temporary batch table for efficient query processing.

    Args:
        indexids: list of indexids
        ids: list of ids
        batch: batch index, used when statement has multiple subselects
    """

    # Existing rows are cleared when no batch index is given or for batch 0
    clear = not batch
    if clear:
        self.cursor.execute(Statement.DELETE_BATCH)

    # Load the new ids
    self.insertbatch(indexids, ids, batch)
433
+
434
def createbatch(self):
    """
    Creates temporary batch table by running the CREATE_BATCH statement.
    """

    # Create or Replace temporary batch table
    self.cursor.execute(Statement.CREATE_BATCH)
441
+
442
def insertbatch(self, indexids, ids, batch):
    """
    Inserts batch of ids. Each id is stored along with the batch index.

    Args:
        indexids: list of indexids, skipped when empty or None
        ids: list of ids, converted to strings before insert, skipped when empty or None
        batch: batch index
    """

    if indexids:
        rows = [(indexid, batch) for indexid in indexids]
        self.cursor.executemany(Statement.INSERT_BATCH_INDEXID, rows)

    if ids:
        rows = [(str(uid), batch) for uid in ids]
        self.cursor.executemany(Statement.INSERT_BATCH_ID, rows)
451
+
452
def scores(self, similarity):
    """
    Loads a batch of similarity scores to a temporary table for efficient query processing.
    Existing scores are always deleted first.

    Args:
        similarity: similarity results as a list of [(indexid, score)] lists, or None to only clear scores
    """

    # Clear existing scores
    self.cursor.execute(Statement.DELETE_SCORES)

    if not similarity:
        return

    # Collect every score per index id, an id can appear under multiple similar() clauses
    grouped = {}
    for results in similarity:
        for indexid, score in results:
            grouped.setdefault(indexid, []).append(score)

    # Add scores
    self.insertscores(grouped)
474
+
475
def createscores(self):
    """
    Creates temporary scores table by running the CREATE_SCORES statement.
    """

    # Create or Replace temporary scores table
    self.cursor.execute(Statement.CREATE_SCORES)
482
+
483
def insertscores(self, scores):
    """
    Inserts a batch of scores. The mean of each id's score list is stored.

    Args:
        scores: dictionary of {indexid: [scores]}, skipped when empty or None
    """

    if not scores:
        return

    # Average scores by id
    rows = [(indexid, sum(values) / len(values)) for indexid, values in scores.items()]
    self.cursor.executemany(Statement.INSERT_SCORE, rows)
494
+
495
def defaults(self):
    """
    Returns the default columns used when there is no select clause.

    Returns:
        comma separated string of default columns
    """

    return "s.id, text, score"
504
+
505
def connect(self, path=None):
    """
    Creates a new database connection. Not implemented in this base class.

    Args:
        path: path to database file

    Returns:
        connection

    Raises:
        NotImplementedError: always, in this base class
    """

    raise NotImplementedError
517
+
518
def getcursor(self):
    """
    Opens a cursor for current connection. Not implemented in this base class.

    Returns:
        cursor

    Raises:
        NotImplementedError: always, in this base class
    """

    raise NotImplementedError
527
+
528
def jsonprefix(self):
    """
    Returns json column prefix to test for. Not implemented in this base class.

    Returns:
        dynamic column prefix

    Raises:
        NotImplementedError: always, in this base class
    """

    raise NotImplementedError
537
+
538
def jsoncolumn(self, name):
    """
    Builds a json extract column expression for name. Not implemented in this base class.

    Args:
        name: column name

    Returns:
        dynamic column expression

    Raises:
        NotImplementedError: always, in this base class
    """

    raise NotImplementedError
550
+
551
def rows(self):
    """
    Returns current cursor row iterator for last executed query. Not implemented in this base class.

    Returns:
        iterable collection of rows

    Raises:
        NotImplementedError: always, in this base class
    """

    raise NotImplementedError
563
+
564
def addfunctions(self):
    """
    Adds custom functions in current connection. Not implemented in this base class.

    Raises:
        NotImplementedError: always, in this base class
    """

    raise NotImplementedError
@@ -0,0 +1,6 @@
1
+ """
2
+ Schema imports
3
+ """
4
+
5
+ from .orm import *
6
+ from .statement import Statement
@@ -0,0 +1,99 @@
1
+ """
2
+ ORM Module
3
+ """
4
+
5
+ # Conditional import
6
+ try:
7
+ from sqlalchemy import Column, DateTime, Float, JSON, Integer, LargeBinary, String, Text
8
+ from sqlalchemy.orm import DeclarativeBase
9
+
10
+ ORM = True
11
+ except ImportError:
12
+ ORM = False
13
+
14
+
15
+ # Standard database schema using object relational mapping (ORM).
16
if ORM:

    def idcolumn():
        """
        Creates an id column. This method creates an unbounded text field for platforms that support it.

        The type is a 512 character string, with a Text variant for the "sqlite" and "postgresql" dialects.

        Returns:
            id column definition
        """

        return String(512).with_variant(Text(), "sqlite", "postgresql")

    class Base(DeclarativeBase):
        """
        Base mapping. Parent class for all table mappings in this module.
        """

    class Batch(Base):
        """
        Batch temporary table mapping.
        """

        __tablename__ = "batch"

        # Adds the TEMPORARY prefix to the table definition
        __table_args__ = {"prefixes": ["TEMPORARY"]}

        # Auto-incrementing primary key
        autoid = Column(Integer, primary_key=True, autoincrement=True)
        indexid = Column(Integer)
        id = Column(idcolumn())
        batch = Column(Integer)

    class Score(Base):
        """
        Scores temporary table mapping.
        """

        __tablename__ = "scores"

        # Adds the TEMPORARY prefix to the table definition
        __table_args__ = {"prefixes": ["TEMPORARY"]}

        # Primary key, auto-increment is disabled
        indexid = Column(Integer, primary_key=True, autoincrement=False)
        score = Column(Float)

    class Document(Base):
        """
        Documents table mapping.
        """

        __tablename__ = "documents"

        id = Column(idcolumn(), primary_key=True)
        data = Column(JSON)
        tags = Column(Text)

        # Timezone-aware entry date
        entry = Column(DateTime(timezone=True))

    class Object(Base):
        """
        Objects table mapping.
        """

        __tablename__ = "objects"

        id = Column(idcolumn(), primary_key=True)
        object = Column(LargeBinary)
        tags = Column(Text)

        # Timezone-aware entry date
        entry = Column(DateTime(timezone=True))

    class SectionBase(Base):
        """
        Generic sections table mapping. Allows multiple section table names for reindexing.
        """

        # Abstract mapping, this class sets no table name of its own
        __abstract__ = True

        # Primary key, auto-increment is disabled
        indexid = Column(Integer, primary_key=True, autoincrement=False)

        # Indexed id column
        id = Column(idcolumn(), index=True)
        text = Column(Text)
        tags = Column(Text)

        # Timezone-aware entry date
        entry = Column(DateTime(timezone=True))

    class Section(SectionBase):
        """
        Section table mapping. Binds the generic sections columns to the "sections" table.
        """

        __tablename__ = "sections"