mseep-txtai 9.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mseep_txtai-9.1.1.dist-info/METADATA +262 -0
- mseep_txtai-9.1.1.dist-info/RECORD +251 -0
- mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
- mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
- mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
- txtai/__init__.py +16 -0
- txtai/agent/__init__.py +12 -0
- txtai/agent/base.py +54 -0
- txtai/agent/factory.py +39 -0
- txtai/agent/model.py +107 -0
- txtai/agent/placeholder.py +16 -0
- txtai/agent/tool/__init__.py +7 -0
- txtai/agent/tool/embeddings.py +69 -0
- txtai/agent/tool/factory.py +130 -0
- txtai/agent/tool/function.py +49 -0
- txtai/ann/__init__.py +7 -0
- txtai/ann/base.py +153 -0
- txtai/ann/dense/__init__.py +11 -0
- txtai/ann/dense/annoy.py +72 -0
- txtai/ann/dense/factory.py +76 -0
- txtai/ann/dense/faiss.py +233 -0
- txtai/ann/dense/hnsw.py +104 -0
- txtai/ann/dense/numpy.py +164 -0
- txtai/ann/dense/pgvector.py +323 -0
- txtai/ann/dense/sqlite.py +303 -0
- txtai/ann/dense/torch.py +38 -0
- txtai/ann/sparse/__init__.py +7 -0
- txtai/ann/sparse/factory.py +61 -0
- txtai/ann/sparse/ivfsparse.py +377 -0
- txtai/ann/sparse/pgsparse.py +56 -0
- txtai/api/__init__.py +18 -0
- txtai/api/application.py +134 -0
- txtai/api/authorization.py +53 -0
- txtai/api/base.py +159 -0
- txtai/api/cluster.py +295 -0
- txtai/api/extension.py +19 -0
- txtai/api/factory.py +40 -0
- txtai/api/responses/__init__.py +7 -0
- txtai/api/responses/factory.py +30 -0
- txtai/api/responses/json.py +56 -0
- txtai/api/responses/messagepack.py +51 -0
- txtai/api/route.py +41 -0
- txtai/api/routers/__init__.py +25 -0
- txtai/api/routers/agent.py +38 -0
- txtai/api/routers/caption.py +42 -0
- txtai/api/routers/embeddings.py +280 -0
- txtai/api/routers/entity.py +42 -0
- txtai/api/routers/extractor.py +28 -0
- txtai/api/routers/labels.py +47 -0
- txtai/api/routers/llm.py +61 -0
- txtai/api/routers/objects.py +42 -0
- txtai/api/routers/openai.py +191 -0
- txtai/api/routers/rag.py +61 -0
- txtai/api/routers/reranker.py +46 -0
- txtai/api/routers/segmentation.py +42 -0
- txtai/api/routers/similarity.py +48 -0
- txtai/api/routers/summary.py +46 -0
- txtai/api/routers/tabular.py +42 -0
- txtai/api/routers/textractor.py +42 -0
- txtai/api/routers/texttospeech.py +33 -0
- txtai/api/routers/transcription.py +42 -0
- txtai/api/routers/translation.py +46 -0
- txtai/api/routers/upload.py +36 -0
- txtai/api/routers/workflow.py +28 -0
- txtai/app/__init__.py +5 -0
- txtai/app/base.py +821 -0
- txtai/archive/__init__.py +9 -0
- txtai/archive/base.py +104 -0
- txtai/archive/compress.py +51 -0
- txtai/archive/factory.py +25 -0
- txtai/archive/tar.py +49 -0
- txtai/archive/zip.py +35 -0
- txtai/cloud/__init__.py +8 -0
- txtai/cloud/base.py +106 -0
- txtai/cloud/factory.py +70 -0
- txtai/cloud/hub.py +101 -0
- txtai/cloud/storage.py +125 -0
- txtai/console/__init__.py +5 -0
- txtai/console/__main__.py +22 -0
- txtai/console/base.py +264 -0
- txtai/data/__init__.py +10 -0
- txtai/data/base.py +138 -0
- txtai/data/labels.py +42 -0
- txtai/data/questions.py +135 -0
- txtai/data/sequences.py +48 -0
- txtai/data/texts.py +68 -0
- txtai/data/tokens.py +28 -0
- txtai/database/__init__.py +14 -0
- txtai/database/base.py +342 -0
- txtai/database/client.py +227 -0
- txtai/database/duckdb.py +150 -0
- txtai/database/embedded.py +76 -0
- txtai/database/encoder/__init__.py +8 -0
- txtai/database/encoder/base.py +37 -0
- txtai/database/encoder/factory.py +56 -0
- txtai/database/encoder/image.py +43 -0
- txtai/database/encoder/serialize.py +28 -0
- txtai/database/factory.py +77 -0
- txtai/database/rdbms.py +569 -0
- txtai/database/schema/__init__.py +6 -0
- txtai/database/schema/orm.py +99 -0
- txtai/database/schema/statement.py +98 -0
- txtai/database/sql/__init__.py +8 -0
- txtai/database/sql/aggregate.py +178 -0
- txtai/database/sql/base.py +189 -0
- txtai/database/sql/expression.py +404 -0
- txtai/database/sql/token.py +342 -0
- txtai/database/sqlite.py +57 -0
- txtai/embeddings/__init__.py +7 -0
- txtai/embeddings/base.py +1107 -0
- txtai/embeddings/index/__init__.py +14 -0
- txtai/embeddings/index/action.py +15 -0
- txtai/embeddings/index/autoid.py +92 -0
- txtai/embeddings/index/configuration.py +71 -0
- txtai/embeddings/index/documents.py +86 -0
- txtai/embeddings/index/functions.py +155 -0
- txtai/embeddings/index/indexes.py +199 -0
- txtai/embeddings/index/indexids.py +60 -0
- txtai/embeddings/index/reducer.py +104 -0
- txtai/embeddings/index/stream.py +67 -0
- txtai/embeddings/index/transform.py +205 -0
- txtai/embeddings/search/__init__.py +11 -0
- txtai/embeddings/search/base.py +344 -0
- txtai/embeddings/search/errors.py +9 -0
- txtai/embeddings/search/explain.py +120 -0
- txtai/embeddings/search/ids.py +61 -0
- txtai/embeddings/search/query.py +69 -0
- txtai/embeddings/search/scan.py +196 -0
- txtai/embeddings/search/terms.py +46 -0
- txtai/graph/__init__.py +10 -0
- txtai/graph/base.py +769 -0
- txtai/graph/factory.py +61 -0
- txtai/graph/networkx.py +275 -0
- txtai/graph/query.py +181 -0
- txtai/graph/rdbms.py +113 -0
- txtai/graph/topics.py +166 -0
- txtai/models/__init__.py +9 -0
- txtai/models/models.py +268 -0
- txtai/models/onnx.py +133 -0
- txtai/models/pooling/__init__.py +9 -0
- txtai/models/pooling/base.py +141 -0
- txtai/models/pooling/cls.py +28 -0
- txtai/models/pooling/factory.py +144 -0
- txtai/models/pooling/late.py +173 -0
- txtai/models/pooling/mean.py +33 -0
- txtai/models/pooling/muvera.py +164 -0
- txtai/models/registry.py +37 -0
- txtai/models/tokendetection.py +122 -0
- txtai/pipeline/__init__.py +17 -0
- txtai/pipeline/audio/__init__.py +11 -0
- txtai/pipeline/audio/audiomixer.py +58 -0
- txtai/pipeline/audio/audiostream.py +94 -0
- txtai/pipeline/audio/microphone.py +244 -0
- txtai/pipeline/audio/signal.py +186 -0
- txtai/pipeline/audio/texttoaudio.py +60 -0
- txtai/pipeline/audio/texttospeech.py +553 -0
- txtai/pipeline/audio/transcription.py +212 -0
- txtai/pipeline/base.py +23 -0
- txtai/pipeline/data/__init__.py +10 -0
- txtai/pipeline/data/filetohtml.py +206 -0
- txtai/pipeline/data/htmltomd.py +414 -0
- txtai/pipeline/data/segmentation.py +178 -0
- txtai/pipeline/data/tabular.py +155 -0
- txtai/pipeline/data/textractor.py +139 -0
- txtai/pipeline/data/tokenizer.py +112 -0
- txtai/pipeline/factory.py +77 -0
- txtai/pipeline/hfmodel.py +111 -0
- txtai/pipeline/hfpipeline.py +96 -0
- txtai/pipeline/image/__init__.py +7 -0
- txtai/pipeline/image/caption.py +55 -0
- txtai/pipeline/image/imagehash.py +90 -0
- txtai/pipeline/image/objects.py +80 -0
- txtai/pipeline/llm/__init__.py +11 -0
- txtai/pipeline/llm/factory.py +86 -0
- txtai/pipeline/llm/generation.py +173 -0
- txtai/pipeline/llm/huggingface.py +218 -0
- txtai/pipeline/llm/litellm.py +90 -0
- txtai/pipeline/llm/llama.py +152 -0
- txtai/pipeline/llm/llm.py +75 -0
- txtai/pipeline/llm/rag.py +477 -0
- txtai/pipeline/nop.py +14 -0
- txtai/pipeline/tensors.py +52 -0
- txtai/pipeline/text/__init__.py +13 -0
- txtai/pipeline/text/crossencoder.py +70 -0
- txtai/pipeline/text/entity.py +140 -0
- txtai/pipeline/text/labels.py +137 -0
- txtai/pipeline/text/lateencoder.py +103 -0
- txtai/pipeline/text/questions.py +48 -0
- txtai/pipeline/text/reranker.py +57 -0
- txtai/pipeline/text/similarity.py +83 -0
- txtai/pipeline/text/summary.py +98 -0
- txtai/pipeline/text/translation.py +298 -0
- txtai/pipeline/train/__init__.py +7 -0
- txtai/pipeline/train/hfonnx.py +196 -0
- txtai/pipeline/train/hftrainer.py +398 -0
- txtai/pipeline/train/mlonnx.py +63 -0
- txtai/scoring/__init__.py +12 -0
- txtai/scoring/base.py +188 -0
- txtai/scoring/bm25.py +29 -0
- txtai/scoring/factory.py +95 -0
- txtai/scoring/pgtext.py +181 -0
- txtai/scoring/sif.py +32 -0
- txtai/scoring/sparse.py +218 -0
- txtai/scoring/terms.py +499 -0
- txtai/scoring/tfidf.py +358 -0
- txtai/serialize/__init__.py +10 -0
- txtai/serialize/base.py +85 -0
- txtai/serialize/errors.py +9 -0
- txtai/serialize/factory.py +29 -0
- txtai/serialize/messagepack.py +42 -0
- txtai/serialize/pickle.py +98 -0
- txtai/serialize/serializer.py +46 -0
- txtai/util/__init__.py +7 -0
- txtai/util/resolver.py +32 -0
- txtai/util/sparsearray.py +62 -0
- txtai/util/template.py +16 -0
- txtai/vectors/__init__.py +8 -0
- txtai/vectors/base.py +476 -0
- txtai/vectors/dense/__init__.py +12 -0
- txtai/vectors/dense/external.py +55 -0
- txtai/vectors/dense/factory.py +121 -0
- txtai/vectors/dense/huggingface.py +44 -0
- txtai/vectors/dense/litellm.py +86 -0
- txtai/vectors/dense/llama.py +84 -0
- txtai/vectors/dense/m2v.py +67 -0
- txtai/vectors/dense/sbert.py +92 -0
- txtai/vectors/dense/words.py +211 -0
- txtai/vectors/recovery.py +57 -0
- txtai/vectors/sparse/__init__.py +7 -0
- txtai/vectors/sparse/base.py +90 -0
- txtai/vectors/sparse/factory.py +55 -0
- txtai/vectors/sparse/sbert.py +34 -0
- txtai/version.py +6 -0
- txtai/workflow/__init__.py +8 -0
- txtai/workflow/base.py +184 -0
- txtai/workflow/execute.py +99 -0
- txtai/workflow/factory.py +42 -0
- txtai/workflow/task/__init__.py +18 -0
- txtai/workflow/task/base.py +490 -0
- txtai/workflow/task/console.py +24 -0
- txtai/workflow/task/export.py +64 -0
- txtai/workflow/task/factory.py +89 -0
- txtai/workflow/task/file.py +28 -0
- txtai/workflow/task/image.py +36 -0
- txtai/workflow/task/retrieve.py +61 -0
- txtai/workflow/task/service.py +102 -0
- txtai/workflow/task/storage.py +110 -0
- txtai/workflow/task/stream.py +33 -0
- txtai/workflow/task/template.py +116 -0
- txtai/workflow/task/url.py +20 -0
- txtai/workflow/task/workflow.py +14 -0
txtai/database/rdbms.py
ADDED
@@ -0,0 +1,569 @@
|
|
1
|
+
"""
|
2
|
+
RDBMS module
|
3
|
+
"""
|
4
|
+
|
5
|
+
import datetime
|
6
|
+
import json
|
7
|
+
|
8
|
+
from .base import Database
|
9
|
+
from .schema import Statement
|
10
|
+
|
11
|
+
|
12
|
+
# pylint: disable=R0904
class RDBMS(Database):
    """
    Base relational database class. A relational database uses SQL to insert, update, delete and select from a
    database instance.
    """

    def __init__(self, config):
        """
        Creates a new Database.

        Args:
            config: database configuration parameters
        """

        super().__init__(config)

        # Database connection and cursor - created lazily via initialize()/session()
        self.connection = None
        self.cursor = None

    def load(self, path):
        """
        Loads an existing database file.

        Args:
            path: path to database file
        """

        # Load an existing database. Thread locking must be handled externally.
        self.session(path)

    def insert(self, documents, index=0):
        """
        Inserts a batch of documents into the documents, objects and sections tables.

        Args:
            documents: iterable of (uid, document, tags) tuples; document can be a dict, list of tokens,
                       string or binary object
            index: starting index id for inserted sections
        """

        # Initialize connection if not open
        self.initialize()

        # Get entry date
        entry = datetime.datetime.now(datetime.timezone.utc)

        # Insert documents
        for uid, document, tags in documents:
            if isinstance(document, dict):
                # Insert document and use return value for sections table
                document = self.loaddocument(uid, document, tags, entry)

            if document is not None:
                if isinstance(document, list):
                    # Join tokens to text
                    document = " ".join(document)
                elif not isinstance(document, str):
                    # If object support is enabled, save object
                    self.loadobject(uid, document, tags, entry)

                    # Clear section text for objects, even when objects aren't inserted
                    document = None

            # Save text section
            self.loadsection(index, uid, document, tags, entry)
            index += 1

        # Post processing logic
        self.finalize()

    def delete(self, ids):
        """
        Deletes documents, objects and sections matching ids. No-op when no connection is open.

        Args:
            ids: list of ids to delete
        """

        if self.connection:
            # Batch ids
            self.batch(ids=ids)

            # Delete all documents, objects and sections by id
            self.cursor.execute(Statement.DELETE_DOCUMENTS)
            self.cursor.execute(Statement.DELETE_OBJECTS)
            self.cursor.execute(Statement.DELETE_SECTIONS)

    def reindex(self, config):
        """
        Rebuilds the sections table using a new configuration and yields content for reindexing.
        Generator - no work happens until consumed. No-op when no connection is open.

        Args:
            config: new configuration

        Yields:
            (id, data, tags) tuples to reindex
        """

        if self.connection:
            # Set new configuration
            self.configure(config)

            # Resolve text column
            select = self.resolve(self.text)

            # Initialize reindex operation
            name = self.reindexstart()

            # Copy data over
            self.cursor.execute(Statement.COPY_SECTIONS % (name, select))

            # Stream new results
            self.cursor.execute(Statement.STREAM_SECTIONS % name)
            for uid, text, data, obj, tags in self.rows():
                if not text and self.encoder and obj:
                    yield (uid, self.encoder.decode(obj), tags)
                else:
                    # Read JSON data, if provided
                    data = json.loads(data) if data and isinstance(data, str) else data

                    # Stream data if available, otherwise use section text
                    yield (uid, data if data else text, tags)

            # Swap as new table
            self.cursor.execute(Statement.DROP_SECTIONS)
            self.cursor.execute(Statement.RENAME_SECTIONS % name)

            # Finish reindex operation
            self.reindexend(name)

    def save(self, path):
        """
        Commits pending changes. path is unused in this base implementation; subclasses may use it.

        Args:
            path: path to save database to
        """

        if self.connection:
            self.connection.commit()

    def close(self):
        """
        Closes the database connection, if open.
        """

        # Close connection
        if self.connection:
            self.connection.close()

    def ids(self, ids):
        """
        Looks up rows matching a list of ids using the SELECT_IDS statement.

        Args:
            ids: list of ids

        Returns:
            list of matching rows
        """

        # Batch ids and run query
        self.batch(ids=ids)
        self.cursor.execute(Statement.SELECT_IDS)

        # Format and return results
        return self.cursor.fetchall()

    def count(self):
        """
        Returns the row count reported by the COUNT_IDS statement.

        Returns:
            total count
        """

        self.cursor.execute(Statement.COUNT_IDS)
        return self.cursor.fetchone()[0]

    def resolve(self, name, alias=None):
        """
        Resolves a column name into a SQL expression.

        Args:
            name: column name
            alias: optional alias to apply

        Returns:
            resolved SQL expression
        """

        # Standard column names
        sections = ["indexid", "id", "tags", "entry"]
        noprefix = ["data", "object", "score", "text"]

        # Alias expression
        if alias:
            # Skip if name matches alias or alias is a standard column name
            if name == alias or alias in sections:
                return name

            # Build alias clause
            return f'{name} as "{alias}"'

        # Resolve expression
        if self.expressions and name in self.expressions:
            return self.expressions[name]

        # Name is already resolved, skip
        if name.startswith(self.jsonprefix()) or any(f"s.{s}" == name for s in sections):
            return name

        # Standard columns - need prefixes
        if name.lower() in sections:
            return f"s.{name}"

        # Standard columns - no prefixes
        if name.lower() in noprefix:
            return name

        # Other columns come from documents.data JSON
        return self.jsoncolumn(name)

    def embed(self, similarity, batch):
        """
        Loads a batch of similarity results into the batch table.

        Args:
            similarity: list of batches of (indexid, score) results
            batch: batch index

        Returns:
            ids clause placeholder for this batch
        """

        # Load similarity results id batch
        self.batch(indexids=[i for i, _ in similarity[batch]], batch=batch)

        # Average and load all similarity scores with first batch
        if not batch:
            self.scores(similarity)

        # Return ids clause placeholder
        return Statement.IDS_CLAUSE % batch

    # pylint: disable=R0912
    def query(self, query, limit, parameters, indexids):
        """
        Builds and executes a SQL query from parsed query components.

        Args:
            query: dictionary of parsed query components (select, where, groupby, having, orderby,
                   limit, offset, similar)
            limit: default maximum number of results, applied when the query has no limit clause
            parameters: optional bind parameters for the query
            indexids: when True, returns a list of (indexid, score) tuples instead of result dictionaries

        Returns:
            query results
        """

        # Extract query components
        select = query.get("select", self.defaults())
        where = query.get("where")
        groupby, having = query.get("groupby"), query.get("having")
        orderby, qlimit, offset = query.get("orderby"), query.get("limit"), query.get("offset")
        similarity = query.get("similar")

        # Select "indexid, score" when indexids is True
        if indexids:
            select = f"{self.resolve('indexid')}, {self.resolve('score')}"

        # Build query text
        query = Statement.TABLE_CLAUSE % select
        if where is not None:
            query += f" WHERE {where}"
        if groupby is not None:
            query += f" GROUP BY {groupby}"
        if having is not None:
            query += f" HAVING {having}"
        if orderby is not None:
            query += f" ORDER BY {orderby}"

        # Default ORDER BY if not provided and similarity scores are available
        if similarity and orderby is None:
            query += " ORDER BY score DESC"

        # Apply query limit
        if qlimit is not None or limit:
            query += f" LIMIT {qlimit if qlimit else limit}"

        # Apply offset
        if offset is not None:
            query += f" OFFSET {offset}"

        # Clear scores when no similar clauses present
        if not similarity:
            self.scores(None)

        # Runs a user query through execute method, which has common user query handling logic
        args = (query, parameters) if parameters else (query,)
        self.execute(self.cursor.execute, *args)

        # Retrieve column list from query
        columns = [c[0] for c in self.cursor.description]

        # Map results and return
        results = []
        for row in self.rows():
            result = {}

            # Copy columns to result. In cases with duplicate column names, find one with a value
            for x, column in enumerate(columns):
                if column not in result or result[column] is None:
                    # Decode object
                    if self.encoder and column == self.object:
                        result[column] = self.encoder.decode(row[x])
                    else:
                        result[column] = row[x]

            results.append(result)

        # Transform results, if necessary
        return [(x["indexid"], x["score"]) for x in results] if indexids else results

    def initialize(self):
        """
        Creates connection and initial database schema if no connection exists.
        """

        if not self.connection:
            # Create database session. Thread locking must be handled externally.
            self.session()

            # Create initial table schema
            self.createtables()

    def session(self, path=None, connection=None):
        """
        Starts a new database session.

        Args:
            path: path to database file
            connection: existing connection to use
        """

        # Create database connection and cursor
        self.connection = connection if connection else self.connect(path) if path else self.connect()
        self.cursor = self.getcursor()

        # Register custom functions - session scope
        self.addfunctions()

        # Create temporary tables - session scope
        self.createbatch()
        self.createscores()

    def createtables(self):
        """
        Creates the initial table schema.
        """

        self.cursor.execute(Statement.CREATE_DOCUMENTS)
        self.cursor.execute(Statement.CREATE_OBJECTS)
        self.cursor.execute(Statement.CREATE_SECTIONS % "sections")
        self.cursor.execute(Statement.CREATE_SECTIONS_INDEX)

    def finalize(self):
        """
        Post processing logic run after inserting a batch of documents. Default method is no-op.
        """

    def loaddocument(self, uid, document, tags, entry):
        """
        Applies pre-processing logic and inserts a document.

        Args:
            uid: unique id
            document: input document dictionary
            tags: document tags
            entry: generated entry date

        Returns:
            section value
        """

        # Make a copy of document before changing
        document = document.copy()

        # Get and remove object field from document
        obj = document.pop(self.object) if self.object in document else None

        # Insert document as JSON
        if document:
            self.insertdocument(uid, json.dumps(document, allow_nan=False), tags, entry)

        # If text and object are both available, load object as it won't otherwise be used
        if self.text in document and obj:
            self.loadobject(uid, obj, tags, entry)

        # Return value to use for section - use text if available otherwise use object
        return document[self.text] if self.text in document else obj

    def insertdocument(self, uid, data, tags, entry):
        """
        Inserts a document.

        Args:
            uid: unique id
            data: document data
            tags: document tags
            entry: generated entry date
        """

        self.cursor.execute(Statement.INSERT_DOCUMENT, [uid, data, tags, entry])

    def loadobject(self, uid, obj, tags, entry):
        """
        Applies pre-preprocessing logic and inserts an object.

        Args:
            uid: unique id
            obj: input object
            tags: object tags
            entry: generated entry date
        """

        # If object support is enabled, save object
        if self.encoder:
            self.insertobject(uid, self.encoder.encode(obj), tags, entry)

    def insertobject(self, uid, data, tags, entry):
        """
        Inserts an object.

        Args:
            uid: unique id
            data: encoded data
            tags: object tags
            entry: generated entry date
        """

        self.cursor.execute(Statement.INSERT_OBJECT, [uid, data, tags, entry])

    def loadsection(self, index, uid, text, tags, entry):
        """
        Applies pre-processing logic and inserts a section.

        Args:
            index: index id
            uid: unique id
            text: section text
            tags: section tags
            entry: generated entry date
        """

        self.insertsection(index, uid, text, tags, entry)

    def insertsection(self, index, uid, text, tags, entry):
        """
        Inserts a section.

        Args:
            index: index id
            uid: unique id
            text: section text
            tags: section tags
            entry: generated entry date
        """

        # Save text section
        self.cursor.execute(Statement.INSERT_SECTION, [index, uid, text, tags, entry])

    def reindexstart(self):
        """
        Starts a reindex operation.

        Returns:
            temporary working table name
        """

        # Working table name
        name = "rebuild"

        # Create new table to hold reordered sections
        self.cursor.execute(Statement.CREATE_SECTIONS % name)

        return name

    # pylint: disable=W0613
    def reindexend(self, name):
        """
        Ends a reindex operation.

        Args:
            name: working table name
        """

        self.cursor.execute(Statement.CREATE_SECTIONS_INDEX)

    def batch(self, indexids=None, ids=None, batch=None):
        """
        Loads ids to a temporary batch table for efficient query processing.

        Args:
            indexids: list of indexids
            ids: list of ids
            batch: batch index, used when statement has multiple subselects
        """

        # Delete batch when batch id is empty or for batch 0
        if not batch:
            self.cursor.execute(Statement.DELETE_BATCH)

        # Add batch
        self.insertbatch(indexids, ids, batch)

    def createbatch(self):
        """
        Creates temporary batch table.
        """

        # Create or Replace temporary batch table
        self.cursor.execute(Statement.CREATE_BATCH)

    def insertbatch(self, indexids, ids, batch):
        """
        Inserts batch of ids.

        Args:
            indexids: list of indexids to insert
            ids: list of ids to insert
            batch: batch index stored alongside each id
        """

        if indexids:
            self.cursor.executemany(Statement.INSERT_BATCH_INDEXID, [(i, batch) for i in indexids])
        if ids:
            self.cursor.executemany(Statement.INSERT_BATCH_ID, [(str(uid), batch) for uid in ids])

    def scores(self, similarity):
        """
        Loads a batch of similarity scores to a temporary table for efficient query processing.

        Args:
            similarity: similarity results as [(indexid, score)]
        """

        # Delete scores
        self.cursor.execute(Statement.DELETE_SCORES)

        if similarity:
            # Average scores per id, needed for multiple similar() clauses
            scores = {}
            for s in similarity:
                for i, score in s:
                    if i not in scores:
                        scores[i] = []
                    scores[i].append(score)

            # Add scores
            self.insertscores(scores)

    def createscores(self):
        """
        Creates temporary scores table.
        """

        # Create or Replace temporary scores table
        self.cursor.execute(Statement.CREATE_SCORES)

    def insertscores(self, scores):
        """
        Inserts a batch of scores.

        Args:
            scores: scores to add
        """

        # Average scores by id
        if scores:
            self.cursor.executemany(Statement.INSERT_SCORE, [(i, sum(s) / len(s)) for i, s in scores.items()])

    def defaults(self):
        """
        Returns a list of default columns when there is no select clause.

        Returns:
            list of default columns
        """

        return "s.id, text, score"

    def connect(self, path=None):
        """
        Creates a new database connection.

        Args:
            path: path to database file

        Returns:
            connection
        """

        raise NotImplementedError

    def getcursor(self):
        """
        Opens a cursor for current connection.

        Returns:
            cursor
        """

        raise NotImplementedError

    def jsonprefix(self):
        """
        Returns json column prefix to test for.

        Returns:
            dynamic column prefix
        """

        raise NotImplementedError

    def jsoncolumn(self, name):
        """
        Builds a json extract column expression for name.

        Args:
            name: column name

        Returns:
            dynamic column expression
        """

        raise NotImplementedError

    def rows(self):
        """
        Returns current cursor row iterator for last executed query.

        Returns:
            iterable collection of rows
        """

        raise NotImplementedError

    def addfunctions(self):
        """
        Adds custom functions in current connection.
        """

        raise NotImplementedError
|
@@ -0,0 +1,99 @@
|
|
1
|
+
"""
ORM Module
"""

# Conditional import - sqlalchemy is an optional dependency, ORM flag records availability
try:
    from sqlalchemy import Column, DateTime, Float, JSON, Integer, LargeBinary, String, Text
    from sqlalchemy.orm import DeclarativeBase

    ORM = True
except ImportError:
    ORM = False


# Standard database schema using object relational mapping (ORM).
# Only defined when sqlalchemy is installed.
if ORM:

    def idcolumn():
        """
        Creates an id column. This method creates an unbounded text field for platforms that support it.

        Returns:
            id column definition
        """

        return String(512).with_variant(Text(), "sqlite", "postgresql")

    class Base(DeclarativeBase):
        """
        Base mapping.
        """

    class Batch(Base):
        """
        Batch temporary table mapping.
        """

        __tablename__ = "batch"
        __table_args__ = {"prefixes": ["TEMPORARY"]}

        # Surrogate key - batch rows are not unique by indexid/id
        autoid = Column(Integer, primary_key=True, autoincrement=True)
        indexid = Column(Integer)
        id = Column(idcolumn())
        batch = Column(Integer)

    class Score(Base):
        """
        Scores temporary table mapping.
        """

        __tablename__ = "scores"
        __table_args__ = {"prefixes": ["TEMPORARY"]}

        indexid = Column(Integer, primary_key=True, autoincrement=False)
        score = Column(Float)

    class Document(Base):
        """
        Documents table mapping.
        """

        __tablename__ = "documents"

        id = Column(idcolumn(), primary_key=True)
        data = Column(JSON)
        tags = Column(Text)
        entry = Column(DateTime(timezone=True))

    class Object(Base):
        """
        Objects table mapping.
        """

        __tablename__ = "objects"

        id = Column(idcolumn(), primary_key=True)
        object = Column(LargeBinary)
        tags = Column(Text)
        entry = Column(DateTime(timezone=True))

    class SectionBase(Base):
        """
        Generic sections table mapping. Allows multiple section table names for reindexing.
        """

        # Abstract - concrete subclasses supply __tablename__
        __abstract__ = True

        indexid = Column(Integer, primary_key=True, autoincrement=False)
        id = Column(idcolumn(), index=True)
        text = Column(Text)
        tags = Column(Text)
        entry = Column(DateTime(timezone=True))

    class Section(SectionBase):
        """
        Section table mapping.
        """

        __tablename__ = "sections"
|