mseep-txtai 9.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mseep_txtai-9.1.1.dist-info/METADATA +262 -0
- mseep_txtai-9.1.1.dist-info/RECORD +251 -0
- mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
- mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
- mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
- txtai/__init__.py +16 -0
- txtai/agent/__init__.py +12 -0
- txtai/agent/base.py +54 -0
- txtai/agent/factory.py +39 -0
- txtai/agent/model.py +107 -0
- txtai/agent/placeholder.py +16 -0
- txtai/agent/tool/__init__.py +7 -0
- txtai/agent/tool/embeddings.py +69 -0
- txtai/agent/tool/factory.py +130 -0
- txtai/agent/tool/function.py +49 -0
- txtai/ann/__init__.py +7 -0
- txtai/ann/base.py +153 -0
- txtai/ann/dense/__init__.py +11 -0
- txtai/ann/dense/annoy.py +72 -0
- txtai/ann/dense/factory.py +76 -0
- txtai/ann/dense/faiss.py +233 -0
- txtai/ann/dense/hnsw.py +104 -0
- txtai/ann/dense/numpy.py +164 -0
- txtai/ann/dense/pgvector.py +323 -0
- txtai/ann/dense/sqlite.py +303 -0
- txtai/ann/dense/torch.py +38 -0
- txtai/ann/sparse/__init__.py +7 -0
- txtai/ann/sparse/factory.py +61 -0
- txtai/ann/sparse/ivfsparse.py +377 -0
- txtai/ann/sparse/pgsparse.py +56 -0
- txtai/api/__init__.py +18 -0
- txtai/api/application.py +134 -0
- txtai/api/authorization.py +53 -0
- txtai/api/base.py +159 -0
- txtai/api/cluster.py +295 -0
- txtai/api/extension.py +19 -0
- txtai/api/factory.py +40 -0
- txtai/api/responses/__init__.py +7 -0
- txtai/api/responses/factory.py +30 -0
- txtai/api/responses/json.py +56 -0
- txtai/api/responses/messagepack.py +51 -0
- txtai/api/route.py +41 -0
- txtai/api/routers/__init__.py +25 -0
- txtai/api/routers/agent.py +38 -0
- txtai/api/routers/caption.py +42 -0
- txtai/api/routers/embeddings.py +280 -0
- txtai/api/routers/entity.py +42 -0
- txtai/api/routers/extractor.py +28 -0
- txtai/api/routers/labels.py +47 -0
- txtai/api/routers/llm.py +61 -0
- txtai/api/routers/objects.py +42 -0
- txtai/api/routers/openai.py +191 -0
- txtai/api/routers/rag.py +61 -0
- txtai/api/routers/reranker.py +46 -0
- txtai/api/routers/segmentation.py +42 -0
- txtai/api/routers/similarity.py +48 -0
- txtai/api/routers/summary.py +46 -0
- txtai/api/routers/tabular.py +42 -0
- txtai/api/routers/textractor.py +42 -0
- txtai/api/routers/texttospeech.py +33 -0
- txtai/api/routers/transcription.py +42 -0
- txtai/api/routers/translation.py +46 -0
- txtai/api/routers/upload.py +36 -0
- txtai/api/routers/workflow.py +28 -0
- txtai/app/__init__.py +5 -0
- txtai/app/base.py +821 -0
- txtai/archive/__init__.py +9 -0
- txtai/archive/base.py +104 -0
- txtai/archive/compress.py +51 -0
- txtai/archive/factory.py +25 -0
- txtai/archive/tar.py +49 -0
- txtai/archive/zip.py +35 -0
- txtai/cloud/__init__.py +8 -0
- txtai/cloud/base.py +106 -0
- txtai/cloud/factory.py +70 -0
- txtai/cloud/hub.py +101 -0
- txtai/cloud/storage.py +125 -0
- txtai/console/__init__.py +5 -0
- txtai/console/__main__.py +22 -0
- txtai/console/base.py +264 -0
- txtai/data/__init__.py +10 -0
- txtai/data/base.py +138 -0
- txtai/data/labels.py +42 -0
- txtai/data/questions.py +135 -0
- txtai/data/sequences.py +48 -0
- txtai/data/texts.py +68 -0
- txtai/data/tokens.py +28 -0
- txtai/database/__init__.py +14 -0
- txtai/database/base.py +342 -0
- txtai/database/client.py +227 -0
- txtai/database/duckdb.py +150 -0
- txtai/database/embedded.py +76 -0
- txtai/database/encoder/__init__.py +8 -0
- txtai/database/encoder/base.py +37 -0
- txtai/database/encoder/factory.py +56 -0
- txtai/database/encoder/image.py +43 -0
- txtai/database/encoder/serialize.py +28 -0
- txtai/database/factory.py +77 -0
- txtai/database/rdbms.py +569 -0
- txtai/database/schema/__init__.py +6 -0
- txtai/database/schema/orm.py +99 -0
- txtai/database/schema/statement.py +98 -0
- txtai/database/sql/__init__.py +8 -0
- txtai/database/sql/aggregate.py +178 -0
- txtai/database/sql/base.py +189 -0
- txtai/database/sql/expression.py +404 -0
- txtai/database/sql/token.py +342 -0
- txtai/database/sqlite.py +57 -0
- txtai/embeddings/__init__.py +7 -0
- txtai/embeddings/base.py +1107 -0
- txtai/embeddings/index/__init__.py +14 -0
- txtai/embeddings/index/action.py +15 -0
- txtai/embeddings/index/autoid.py +92 -0
- txtai/embeddings/index/configuration.py +71 -0
- txtai/embeddings/index/documents.py +86 -0
- txtai/embeddings/index/functions.py +155 -0
- txtai/embeddings/index/indexes.py +199 -0
- txtai/embeddings/index/indexids.py +60 -0
- txtai/embeddings/index/reducer.py +104 -0
- txtai/embeddings/index/stream.py +67 -0
- txtai/embeddings/index/transform.py +205 -0
- txtai/embeddings/search/__init__.py +11 -0
- txtai/embeddings/search/base.py +344 -0
- txtai/embeddings/search/errors.py +9 -0
- txtai/embeddings/search/explain.py +120 -0
- txtai/embeddings/search/ids.py +61 -0
- txtai/embeddings/search/query.py +69 -0
- txtai/embeddings/search/scan.py +196 -0
- txtai/embeddings/search/terms.py +46 -0
- txtai/graph/__init__.py +10 -0
- txtai/graph/base.py +769 -0
- txtai/graph/factory.py +61 -0
- txtai/graph/networkx.py +275 -0
- txtai/graph/query.py +181 -0
- txtai/graph/rdbms.py +113 -0
- txtai/graph/topics.py +166 -0
- txtai/models/__init__.py +9 -0
- txtai/models/models.py +268 -0
- txtai/models/onnx.py +133 -0
- txtai/models/pooling/__init__.py +9 -0
- txtai/models/pooling/base.py +141 -0
- txtai/models/pooling/cls.py +28 -0
- txtai/models/pooling/factory.py +144 -0
- txtai/models/pooling/late.py +173 -0
- txtai/models/pooling/mean.py +33 -0
- txtai/models/pooling/muvera.py +164 -0
- txtai/models/registry.py +37 -0
- txtai/models/tokendetection.py +122 -0
- txtai/pipeline/__init__.py +17 -0
- txtai/pipeline/audio/__init__.py +11 -0
- txtai/pipeline/audio/audiomixer.py +58 -0
- txtai/pipeline/audio/audiostream.py +94 -0
- txtai/pipeline/audio/microphone.py +244 -0
- txtai/pipeline/audio/signal.py +186 -0
- txtai/pipeline/audio/texttoaudio.py +60 -0
- txtai/pipeline/audio/texttospeech.py +553 -0
- txtai/pipeline/audio/transcription.py +212 -0
- txtai/pipeline/base.py +23 -0
- txtai/pipeline/data/__init__.py +10 -0
- txtai/pipeline/data/filetohtml.py +206 -0
- txtai/pipeline/data/htmltomd.py +414 -0
- txtai/pipeline/data/segmentation.py +178 -0
- txtai/pipeline/data/tabular.py +155 -0
- txtai/pipeline/data/textractor.py +139 -0
- txtai/pipeline/data/tokenizer.py +112 -0
- txtai/pipeline/factory.py +77 -0
- txtai/pipeline/hfmodel.py +111 -0
- txtai/pipeline/hfpipeline.py +96 -0
- txtai/pipeline/image/__init__.py +7 -0
- txtai/pipeline/image/caption.py +55 -0
- txtai/pipeline/image/imagehash.py +90 -0
- txtai/pipeline/image/objects.py +80 -0
- txtai/pipeline/llm/__init__.py +11 -0
- txtai/pipeline/llm/factory.py +86 -0
- txtai/pipeline/llm/generation.py +173 -0
- txtai/pipeline/llm/huggingface.py +218 -0
- txtai/pipeline/llm/litellm.py +90 -0
- txtai/pipeline/llm/llama.py +152 -0
- txtai/pipeline/llm/llm.py +75 -0
- txtai/pipeline/llm/rag.py +477 -0
- txtai/pipeline/nop.py +14 -0
- txtai/pipeline/tensors.py +52 -0
- txtai/pipeline/text/__init__.py +13 -0
- txtai/pipeline/text/crossencoder.py +70 -0
- txtai/pipeline/text/entity.py +140 -0
- txtai/pipeline/text/labels.py +137 -0
- txtai/pipeline/text/lateencoder.py +103 -0
- txtai/pipeline/text/questions.py +48 -0
- txtai/pipeline/text/reranker.py +57 -0
- txtai/pipeline/text/similarity.py +83 -0
- txtai/pipeline/text/summary.py +98 -0
- txtai/pipeline/text/translation.py +298 -0
- txtai/pipeline/train/__init__.py +7 -0
- txtai/pipeline/train/hfonnx.py +196 -0
- txtai/pipeline/train/hftrainer.py +398 -0
- txtai/pipeline/train/mlonnx.py +63 -0
- txtai/scoring/__init__.py +12 -0
- txtai/scoring/base.py +188 -0
- txtai/scoring/bm25.py +29 -0
- txtai/scoring/factory.py +95 -0
- txtai/scoring/pgtext.py +181 -0
- txtai/scoring/sif.py +32 -0
- txtai/scoring/sparse.py +218 -0
- txtai/scoring/terms.py +499 -0
- txtai/scoring/tfidf.py +358 -0
- txtai/serialize/__init__.py +10 -0
- txtai/serialize/base.py +85 -0
- txtai/serialize/errors.py +9 -0
- txtai/serialize/factory.py +29 -0
- txtai/serialize/messagepack.py +42 -0
- txtai/serialize/pickle.py +98 -0
- txtai/serialize/serializer.py +46 -0
- txtai/util/__init__.py +7 -0
- txtai/util/resolver.py +32 -0
- txtai/util/sparsearray.py +62 -0
- txtai/util/template.py +16 -0
- txtai/vectors/__init__.py +8 -0
- txtai/vectors/base.py +476 -0
- txtai/vectors/dense/__init__.py +12 -0
- txtai/vectors/dense/external.py +55 -0
- txtai/vectors/dense/factory.py +121 -0
- txtai/vectors/dense/huggingface.py +44 -0
- txtai/vectors/dense/litellm.py +86 -0
- txtai/vectors/dense/llama.py +84 -0
- txtai/vectors/dense/m2v.py +67 -0
- txtai/vectors/dense/sbert.py +92 -0
- txtai/vectors/dense/words.py +211 -0
- txtai/vectors/recovery.py +57 -0
- txtai/vectors/sparse/__init__.py +7 -0
- txtai/vectors/sparse/base.py +90 -0
- txtai/vectors/sparse/factory.py +55 -0
- txtai/vectors/sparse/sbert.py +34 -0
- txtai/version.py +6 -0
- txtai/workflow/__init__.py +8 -0
- txtai/workflow/base.py +184 -0
- txtai/workflow/execute.py +99 -0
- txtai/workflow/factory.py +42 -0
- txtai/workflow/task/__init__.py +18 -0
- txtai/workflow/task/base.py +490 -0
- txtai/workflow/task/console.py +24 -0
- txtai/workflow/task/export.py +64 -0
- txtai/workflow/task/factory.py +89 -0
- txtai/workflow/task/file.py +28 -0
- txtai/workflow/task/image.py +36 -0
- txtai/workflow/task/retrieve.py +61 -0
- txtai/workflow/task/service.py +102 -0
- txtai/workflow/task/storage.py +110 -0
- txtai/workflow/task/stream.py +33 -0
- txtai/workflow/task/template.py +116 -0
- txtai/workflow/task/url.py +20 -0
- txtai/workflow/task/workflow.py +14 -0
@@ -0,0 +1,342 @@
|
|
1
|
+
"""
|
2
|
+
Token module
|
3
|
+
"""
|
4
|
+
|
5
|
+
|
6
|
+
class Token:
    """
    Static helper methods that classify SQL tokens and apply token formatting rules.

    All is* predicates return a strict bool and treat None or an empty token as False.
    """

    # Similar token replacement
    SIMILAR_TOKEN = "__SIMILAR__"

    # Default distinct token
    DISTINCT = ["distinct"]

    # Default alias token
    ALIAS = ["as"]

    # Default list of comparison operators
    OPERATORS = ["=", "!=", "<>", ">", ">=", "<", "<=", "+", "-", "*", "/", "%", "||", "not", "between", "like", "is", "null"]

    # Default list of logic separators
    LOGIC_SEPARATORS = ["and", "or"]

    # Default list of sort order operators
    SORT_ORDER = ["asc", "desc"]

    @staticmethod
    def get(tokens, x):
        """
        Gets token at position x. This method will validate position is valid within tokens.

        Args:
            tokens: input tokens
            x: position to retrieve

        Returns:
            tokens[x] if x is a valid position, None otherwise
        """

        # Negative positions are rejected on purpose, they must not wrap around to the end of the list
        if 0 <= x < len(tokens):
            return tokens[x]

        return None

    @staticmethod
    def isalias(tokens, x, alias):
        """
        Checks if tokens[x] is an alias keyword.

        Args:
            tokens: input tokens
            x: current position
            alias: if column alias processing is enabled

        Returns:
            True if tokens[x] is an alias token, False otherwise
        """

        prior = Token.get(tokens, x - 1)
        token = tokens[x]

        # True if prior token is not a separator, grouping token or distinct token and current token is either a column token or quoted token
        return bool(
            alias
            and x > 0
            and not Token.isseparator(prior)
            and not Token.isgroupstart(prior)
            and not Token.isdistinct(prior)
            and (Token.iscolumn(token) or Token.isquoted(token))
        )

    @staticmethod
    def isattribute(tokens, x):
        """
        Checks if tokens[x] is an attribute.

        Args:
            tokens: input tokens
            x: current position

        Returns:
            True if tokens[x] is an attribute, False otherwise
        """

        # True if token is a column and next token is not an operator
        return Token.iscolumn(tokens[x]) and not Token.isoperator(Token.get(tokens, x + 1))

    @staticmethod
    def isbracket(token):
        """
        Checks if token is an open bracket.

        Args:
            token: token to test

        Returns:
            True if token is an open bracket, False otherwise
        """

        # Token is a bracket
        return token == "["

    @staticmethod
    def iscolumn(token):
        """
        Checks if token is a column name.

        Args:
            token: token to test

        Returns:
            True if this token is a column name token, False otherwise
        """

        # Columns are non-empty tokens that are not operators, logic separators, literals or sort order tokens
        return bool(token) and not (
            Token.isoperator(token) or Token.islogicseparator(token) or Token.isliteral(token) or Token.issortorder(token)
        )

    @staticmethod
    def iscompound(tokens, x):
        """
        Checks if tokens[x] is a compound expression.

        Args:
            tokens: input tokens
            x: current position

        Returns:
            True if tokens[x] is a compound expression, False otherwise
        """

        # Compound expression is defined as: <column> <operator(s)> <column>
        # A column on either side of the operator is sufficient
        return Token.isoperator(tokens[x]) and (Token.iscolumn(Token.get(tokens, x - 1)) or Token.iscolumn(Token.get(tokens, x + 1)))

    @staticmethod
    def isdistinct(token):
        """
        Checks if token is the distinct keyword.

        Args:
            token: token to test

        Returns:
            True if this token is a distinct keyword, False otherwise
        """

        # Token is the distinct keyword (case-insensitive)
        return bool(token) and token.lower() in Token.DISTINCT

    @staticmethod
    def isfunction(tokens, x):
        """
        Checks if tokens[x] is a function.

        Args:
            tokens: input tokens
            x: current position

        Returns:
            True if tokens[x] is a function, False otherwise
        """

        # True if a column token is followed by an open paren
        return Token.iscolumn(tokens[x]) and Token.get(tokens, x + 1) == "("

    @staticmethod
    def isgroupstart(token):
        """
        Checks if token is a group start token.

        Args:
            token: token to test

        Returns:
            True if token is a group start token, False otherwise
        """

        # Token is a paren
        return token == "("

    @staticmethod
    def isliteral(token):
        """
        Checks if token is a literal.

        Args:
            token: token to test

        Returns:
            True if this token is a literal, False otherwise
        """

        # Literals start with a quote, comma, paren or wildcard or are numeric (digits with at most one decimal point)
        return bool(token) and (token.startswith(("'", '"', ",", "(", ")", "*")) or token.replace(".", "", 1).isdigit())

    @staticmethod
    def islogicseparator(token):
        """
        Checks if token is a logic separator token.

        Args:
            token: token to test

        Returns:
            True if this token is a logic separator, False otherwise
        """

        # Token is a logic separator (case-insensitive)
        return bool(token) and token.lower() in Token.LOGIC_SEPARATORS

    @staticmethod
    def isoperator(token):
        """
        Checks if token is an operator token.

        Args:
            token: token to test

        Returns:
            True if this token is an operator, False otherwise
        """

        # Token is an operator (case-insensitive)
        return bool(token) and token.lower() in Token.OPERATORS

    @staticmethod
    def isquoted(token):
        """
        Checks if token is quoted.

        Args:
            token: token to test

        Returns:
            True if this token is quoted, False otherwise
        """

        # Token starts and ends with a single or double quote. None and empty tokens are not quoted.
        return bool(token) and token.startswith(("'", '"')) and token.endswith(("'", '"'))

    @staticmethod
    def isseparator(token):
        """
        Checks if token is a separator token.

        Args:
            token: token to test

        Returns:
            True if this token is a separator, False otherwise
        """

        # Token is a comma
        return token == ","

    @staticmethod
    def issimilar(tokens, x, similar):
        """
        Checks if tokens[x] is a similar() function.

        Args:
            tokens: input tokens
            x: current position
            similar: list where similar function call parameters are stored, can be None in which case similar processing is skipped

        Returns:
            True if tokens[x] is a similar clause, False otherwise
        """

        # True if a "similar" token is followed by an open paren
        return similar is not None and tokens[x].lower() == "similar" and Token.get(tokens, x + 1) == "("

    @staticmethod
    def issortorder(token):
        """
        Checks if token is a sort order token.

        Args:
            token: token to test

        Returns:
            True if this token is a sort order operator, False otherwise
        """

        # Token is a sort order operator (case-insensitive)
        return bool(token) and token.lower() in Token.SORT_ORDER

    @staticmethod
    def normalize(token):
        """
        Applies a normalization algorithm to the input token as follows:
            - Strip single and double quotes
            - Make lowercase

        Args:
            token: input token

        Returns:
            normalized token
        """

        # Lowercase, remove all quote characters and return
        return token.lower().replace("'", "").replace('"', "")

    @staticmethod
    def wrapspace(text, token):
        """
        Applies whitespace wrapping rules to token.

        Args:
            text: current text buffer
            token: token to add

        Returns:
            token with whitespace rules applied
        """

        # Wildcards have no whitespace. Need special case since * is also multiply which does have whitespace.
        if token == "*" and (not text or text.endswith((" ", "("))):
            return token

        # Operator whitespace: always a trailing space, leading space only when the buffer doesn't already end with one
        if Token.isoperator(token) or Token.islogicseparator(token) or token.lower() == "in":
            return f" {token} " if not text.endswith(" ") else f"{token} "

        # Comma whitespace
        if Token.isseparator(token):
            return f"{token} "

        # No whitespace if any of the following is True
        if not text or text.endswith((" ", "(", "[")) or token in ["(", "[", ")", "]"] or token.startswith("."):
            return token

        # Default is to add leading whitespace
        return f" {token}"
|
txtai/database/sqlite.py
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
"""
|
2
|
+
SQLite module
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sqlite3
|
7
|
+
|
8
|
+
from .embedded import Embedded
|
9
|
+
|
10
|
+
|
11
|
+
class SQLite(Embedded):
    """
    Database instance backed by SQLite.
    """

    def connect(self, path=""):
        """
        Opens a SQLite connection for path.

        Args:
            path: database path, passed to sqlite3.connect as-is

        Returns:
            sqlite3 connection
        """

        # Create connection. check_same_thread=False allows the connection to be used by threads other than the creating thread.
        connection = sqlite3.connect(path, check_same_thread=False)

        # Enable WAL mode, if necessary
        if self.setting("wal"):
            connection.execute("PRAGMA journal_mode=WAL")

        return connection

    def getcursor(self):
        """
        Creates a new cursor from the current connection.

        Returns:
            cursor for self.connection
        """

        return self.connection.cursor()

    def rows(self):
        """
        Gets the object used to read result rows.

        Returns:
            self.cursor - presumably the cursor created by getcursor, confirm against the parent class
        """

        return self.cursor

    def addfunctions(self):
        """
        Registers each (name, argcount, fn) entry in self.functions as a SQL function on the current connection.
        No action is taken when there is no connection or there are no functions.
        """

        if self.connection and self.functions:
            # Enable callback tracebacks to show user-defined function errors
            sqlite3.enable_callback_tracebacks(True)

            for name, argcount, fn in self.functions:
                self.connection.create_function(name, argcount, fn)

    def copy(self, path):
        """
        Copies the current database into a new database at path. An existing file at path is deleted first.

        Args:
            path: path of the database to create

        Returns:
            connection to the new database
        """

        # Delete existing file, if necessary
        if os.path.exists(path):
            os.remove(path)

        # Create database. Thread locking must be handled externally.
        connection = self.connect(path)

        try:
            if self.connection.in_transaction:
                # The backup call will hang if there are uncommitted changes, need to copy over
                # with iterdump (which is much slower)
                for sql in self.connection.iterdump():
                    connection.execute(sql)
            else:
                # Database is up to date, can do a more efficient copy with SQLite C API
                self.connection.backup(connection)
        except Exception:
            # Copy failed, close the new connection so it isn't leaked and re-raise the original error
            connection.close()
            raise

        return connection
|