mseep-txtai 9.1.1 (mseep_txtai-9.1.1-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mseep_txtai-9.1.1.dist-info/METADATA +262 -0
- mseep_txtai-9.1.1.dist-info/RECORD +251 -0
- mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
- mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
- mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
- txtai/__init__.py +16 -0
- txtai/agent/__init__.py +12 -0
- txtai/agent/base.py +54 -0
- txtai/agent/factory.py +39 -0
- txtai/agent/model.py +107 -0
- txtai/agent/placeholder.py +16 -0
- txtai/agent/tool/__init__.py +7 -0
- txtai/agent/tool/embeddings.py +69 -0
- txtai/agent/tool/factory.py +130 -0
- txtai/agent/tool/function.py +49 -0
- txtai/ann/__init__.py +7 -0
- txtai/ann/base.py +153 -0
- txtai/ann/dense/__init__.py +11 -0
- txtai/ann/dense/annoy.py +72 -0
- txtai/ann/dense/factory.py +76 -0
- txtai/ann/dense/faiss.py +233 -0
- txtai/ann/dense/hnsw.py +104 -0
- txtai/ann/dense/numpy.py +164 -0
- txtai/ann/dense/pgvector.py +323 -0
- txtai/ann/dense/sqlite.py +303 -0
- txtai/ann/dense/torch.py +38 -0
- txtai/ann/sparse/__init__.py +7 -0
- txtai/ann/sparse/factory.py +61 -0
- txtai/ann/sparse/ivfsparse.py +377 -0
- txtai/ann/sparse/pgsparse.py +56 -0
- txtai/api/__init__.py +18 -0
- txtai/api/application.py +134 -0
- txtai/api/authorization.py +53 -0
- txtai/api/base.py +159 -0
- txtai/api/cluster.py +295 -0
- txtai/api/extension.py +19 -0
- txtai/api/factory.py +40 -0
- txtai/api/responses/__init__.py +7 -0
- txtai/api/responses/factory.py +30 -0
- txtai/api/responses/json.py +56 -0
- txtai/api/responses/messagepack.py +51 -0
- txtai/api/route.py +41 -0
- txtai/api/routers/__init__.py +25 -0
- txtai/api/routers/agent.py +38 -0
- txtai/api/routers/caption.py +42 -0
- txtai/api/routers/embeddings.py +280 -0
- txtai/api/routers/entity.py +42 -0
- txtai/api/routers/extractor.py +28 -0
- txtai/api/routers/labels.py +47 -0
- txtai/api/routers/llm.py +61 -0
- txtai/api/routers/objects.py +42 -0
- txtai/api/routers/openai.py +191 -0
- txtai/api/routers/rag.py +61 -0
- txtai/api/routers/reranker.py +46 -0
- txtai/api/routers/segmentation.py +42 -0
- txtai/api/routers/similarity.py +48 -0
- txtai/api/routers/summary.py +46 -0
- txtai/api/routers/tabular.py +42 -0
- txtai/api/routers/textractor.py +42 -0
- txtai/api/routers/texttospeech.py +33 -0
- txtai/api/routers/transcription.py +42 -0
- txtai/api/routers/translation.py +46 -0
- txtai/api/routers/upload.py +36 -0
- txtai/api/routers/workflow.py +28 -0
- txtai/app/__init__.py +5 -0
- txtai/app/base.py +821 -0
- txtai/archive/__init__.py +9 -0
- txtai/archive/base.py +104 -0
- txtai/archive/compress.py +51 -0
- txtai/archive/factory.py +25 -0
- txtai/archive/tar.py +49 -0
- txtai/archive/zip.py +35 -0
- txtai/cloud/__init__.py +8 -0
- txtai/cloud/base.py +106 -0
- txtai/cloud/factory.py +70 -0
- txtai/cloud/hub.py +101 -0
- txtai/cloud/storage.py +125 -0
- txtai/console/__init__.py +5 -0
- txtai/console/__main__.py +22 -0
- txtai/console/base.py +264 -0
- txtai/data/__init__.py +10 -0
- txtai/data/base.py +138 -0
- txtai/data/labels.py +42 -0
- txtai/data/questions.py +135 -0
- txtai/data/sequences.py +48 -0
- txtai/data/texts.py +68 -0
- txtai/data/tokens.py +28 -0
- txtai/database/__init__.py +14 -0
- txtai/database/base.py +342 -0
- txtai/database/client.py +227 -0
- txtai/database/duckdb.py +150 -0
- txtai/database/embedded.py +76 -0
- txtai/database/encoder/__init__.py +8 -0
- txtai/database/encoder/base.py +37 -0
- txtai/database/encoder/factory.py +56 -0
- txtai/database/encoder/image.py +43 -0
- txtai/database/encoder/serialize.py +28 -0
- txtai/database/factory.py +77 -0
- txtai/database/rdbms.py +569 -0
- txtai/database/schema/__init__.py +6 -0
- txtai/database/schema/orm.py +99 -0
- txtai/database/schema/statement.py +98 -0
- txtai/database/sql/__init__.py +8 -0
- txtai/database/sql/aggregate.py +178 -0
- txtai/database/sql/base.py +189 -0
- txtai/database/sql/expression.py +404 -0
- txtai/database/sql/token.py +342 -0
- txtai/database/sqlite.py +57 -0
- txtai/embeddings/__init__.py +7 -0
- txtai/embeddings/base.py +1107 -0
- txtai/embeddings/index/__init__.py +14 -0
- txtai/embeddings/index/action.py +15 -0
- txtai/embeddings/index/autoid.py +92 -0
- txtai/embeddings/index/configuration.py +71 -0
- txtai/embeddings/index/documents.py +86 -0
- txtai/embeddings/index/functions.py +155 -0
- txtai/embeddings/index/indexes.py +199 -0
- txtai/embeddings/index/indexids.py +60 -0
- txtai/embeddings/index/reducer.py +104 -0
- txtai/embeddings/index/stream.py +67 -0
- txtai/embeddings/index/transform.py +205 -0
- txtai/embeddings/search/__init__.py +11 -0
- txtai/embeddings/search/base.py +344 -0
- txtai/embeddings/search/errors.py +9 -0
- txtai/embeddings/search/explain.py +120 -0
- txtai/embeddings/search/ids.py +61 -0
- txtai/embeddings/search/query.py +69 -0
- txtai/embeddings/search/scan.py +196 -0
- txtai/embeddings/search/terms.py +46 -0
- txtai/graph/__init__.py +10 -0
- txtai/graph/base.py +769 -0
- txtai/graph/factory.py +61 -0
- txtai/graph/networkx.py +275 -0
- txtai/graph/query.py +181 -0
- txtai/graph/rdbms.py +113 -0
- txtai/graph/topics.py +166 -0
- txtai/models/__init__.py +9 -0
- txtai/models/models.py +268 -0
- txtai/models/onnx.py +133 -0
- txtai/models/pooling/__init__.py +9 -0
- txtai/models/pooling/base.py +141 -0
- txtai/models/pooling/cls.py +28 -0
- txtai/models/pooling/factory.py +144 -0
- txtai/models/pooling/late.py +173 -0
- txtai/models/pooling/mean.py +33 -0
- txtai/models/pooling/muvera.py +164 -0
- txtai/models/registry.py +37 -0
- txtai/models/tokendetection.py +122 -0
- txtai/pipeline/__init__.py +17 -0
- txtai/pipeline/audio/__init__.py +11 -0
- txtai/pipeline/audio/audiomixer.py +58 -0
- txtai/pipeline/audio/audiostream.py +94 -0
- txtai/pipeline/audio/microphone.py +244 -0
- txtai/pipeline/audio/signal.py +186 -0
- txtai/pipeline/audio/texttoaudio.py +60 -0
- txtai/pipeline/audio/texttospeech.py +553 -0
- txtai/pipeline/audio/transcription.py +212 -0
- txtai/pipeline/base.py +23 -0
- txtai/pipeline/data/__init__.py +10 -0
- txtai/pipeline/data/filetohtml.py +206 -0
- txtai/pipeline/data/htmltomd.py +414 -0
- txtai/pipeline/data/segmentation.py +178 -0
- txtai/pipeline/data/tabular.py +155 -0
- txtai/pipeline/data/textractor.py +139 -0
- txtai/pipeline/data/tokenizer.py +112 -0
- txtai/pipeline/factory.py +77 -0
- txtai/pipeline/hfmodel.py +111 -0
- txtai/pipeline/hfpipeline.py +96 -0
- txtai/pipeline/image/__init__.py +7 -0
- txtai/pipeline/image/caption.py +55 -0
- txtai/pipeline/image/imagehash.py +90 -0
- txtai/pipeline/image/objects.py +80 -0
- txtai/pipeline/llm/__init__.py +11 -0
- txtai/pipeline/llm/factory.py +86 -0
- txtai/pipeline/llm/generation.py +173 -0
- txtai/pipeline/llm/huggingface.py +218 -0
- txtai/pipeline/llm/litellm.py +90 -0
- txtai/pipeline/llm/llama.py +152 -0
- txtai/pipeline/llm/llm.py +75 -0
- txtai/pipeline/llm/rag.py +477 -0
- txtai/pipeline/nop.py +14 -0
- txtai/pipeline/tensors.py +52 -0
- txtai/pipeline/text/__init__.py +13 -0
- txtai/pipeline/text/crossencoder.py +70 -0
- txtai/pipeline/text/entity.py +140 -0
- txtai/pipeline/text/labels.py +137 -0
- txtai/pipeline/text/lateencoder.py +103 -0
- txtai/pipeline/text/questions.py +48 -0
- txtai/pipeline/text/reranker.py +57 -0
- txtai/pipeline/text/similarity.py +83 -0
- txtai/pipeline/text/summary.py +98 -0
- txtai/pipeline/text/translation.py +298 -0
- txtai/pipeline/train/__init__.py +7 -0
- txtai/pipeline/train/hfonnx.py +196 -0
- txtai/pipeline/train/hftrainer.py +398 -0
- txtai/pipeline/train/mlonnx.py +63 -0
- txtai/scoring/__init__.py +12 -0
- txtai/scoring/base.py +188 -0
- txtai/scoring/bm25.py +29 -0
- txtai/scoring/factory.py +95 -0
- txtai/scoring/pgtext.py +181 -0
- txtai/scoring/sif.py +32 -0
- txtai/scoring/sparse.py +218 -0
- txtai/scoring/terms.py +499 -0
- txtai/scoring/tfidf.py +358 -0
- txtai/serialize/__init__.py +10 -0
- txtai/serialize/base.py +85 -0
- txtai/serialize/errors.py +9 -0
- txtai/serialize/factory.py +29 -0
- txtai/serialize/messagepack.py +42 -0
- txtai/serialize/pickle.py +98 -0
- txtai/serialize/serializer.py +46 -0
- txtai/util/__init__.py +7 -0
- txtai/util/resolver.py +32 -0
- txtai/util/sparsearray.py +62 -0
- txtai/util/template.py +16 -0
- txtai/vectors/__init__.py +8 -0
- txtai/vectors/base.py +476 -0
- txtai/vectors/dense/__init__.py +12 -0
- txtai/vectors/dense/external.py +55 -0
- txtai/vectors/dense/factory.py +121 -0
- txtai/vectors/dense/huggingface.py +44 -0
- txtai/vectors/dense/litellm.py +86 -0
- txtai/vectors/dense/llama.py +84 -0
- txtai/vectors/dense/m2v.py +67 -0
- txtai/vectors/dense/sbert.py +92 -0
- txtai/vectors/dense/words.py +211 -0
- txtai/vectors/recovery.py +57 -0
- txtai/vectors/sparse/__init__.py +7 -0
- txtai/vectors/sparse/base.py +90 -0
- txtai/vectors/sparse/factory.py +55 -0
- txtai/vectors/sparse/sbert.py +34 -0
- txtai/version.py +6 -0
- txtai/workflow/__init__.py +8 -0
- txtai/workflow/base.py +184 -0
- txtai/workflow/execute.py +99 -0
- txtai/workflow/factory.py +42 -0
- txtai/workflow/task/__init__.py +18 -0
- txtai/workflow/task/base.py +490 -0
- txtai/workflow/task/console.py +24 -0
- txtai/workflow/task/export.py +64 -0
- txtai/workflow/task/factory.py +89 -0
- txtai/workflow/task/file.py +28 -0
- txtai/workflow/task/image.py +36 -0
- txtai/workflow/task/retrieve.py +61 -0
- txtai/workflow/task/service.py +102 -0
- txtai/workflow/task/storage.py +110 -0
- txtai/workflow/task/stream.py +33 -0
- txtai/workflow/task/template.py +116 -0
- txtai/workflow/task/url.py +20 -0
- txtai/workflow/task/workflow.py +14 -0
txtai/embeddings/index/reducer.py

```diff
@@ -0,0 +1,104 @@
+"""
+Reducer module
+"""
+
+from zipfile import BadZipFile
+
+# Conditionally import dimensionality reduction libraries as they aren't installed by default
+try:
+    import skops.io as sio
+
+    from sklearn.decomposition import TruncatedSVD
+
+    REDUCER = True
+except ImportError:
+    REDUCER = False
+
+from ...serialize import SerializeFactory
+
+
+class Reducer:
+    """
+    LSA dimensionality reduction model
+    """
+
+    def __init__(self, embeddings=None, components=None):
+        """
+        Creates a dimensionality reduction model.
+
+        Args:
+            embeddings: input embeddings matrix
+            components: number of model components
+        """
+
+        if not REDUCER:
+            raise ImportError('Dimensionality reduction is not available - install "vectors" extra to enable')
+
+        self.model = self.build(embeddings, components) if embeddings is not None and components else None
+
+    def __call__(self, embeddings):
+        """
+        Applies a dimensionality reduction model to embeddings, removing the top n principal components. The operation
+        is applied directly on the input array.
+
+        Args:
+            embeddings: input embeddings matrix
+        """
+
+        pc = self.model.components_
+        factor = embeddings.dot(pc.transpose())
+
+        # Apply LSA model
+        # Calculation is different if n_components = 1
+        if pc.shape[0] == 1:
+            embeddings -= factor * pc
+        elif len(embeddings.shape) > 1:
+            # Apply model on a row-wise basis to limit memory usage
+            for x in range(embeddings.shape[0]):
+                embeddings[x] -= factor[x].dot(pc)
+        else:
+            # Single embedding
+            embeddings -= factor.dot(pc)
+
+    def build(self, embeddings, components):
+        """
+        Builds an LSA model. This model is used to remove the principal component within embeddings. This helps to
+        smooth out noisy embeddings (common words with less value).
+
+        Args:
+            embeddings: input embeddings matrix
+            components: number of model components
+
+        Returns:
+            LSA model
+        """
+
+        model = TruncatedSVD(n_components=components, random_state=0)
+        model.fit(embeddings)
+
+        return model
+
+    def load(self, path):
+        """
+        Loads a Reducer object from path.
+
+        Args:
+            path: directory path to load model
+        """
+
+        # Dimensionality reduction
+        try:
+            self.model = sio.load(path)
+        except (BadZipFile, KeyError):
+            # Backwards compatible support for pickled models
+            self.model = SerializeFactory.create("pickle").load(path)
+
+    def save(self, path):
+        """
+        Saves a Reducer object to path.
+
+        Args:
+            path: directory path to save model
+        """
+
+        sio.dump(self.model, path)
```
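Reducer applies the standard LSA/SIF trick: fit a truncated SVD on the embeddings matrix, then subtract each row's projection onto the top principal components. Below is a minimal standalone sketch of the same computation, assuming NumPy and scikit-learn are installed; the random matrix and its dimensions are illustrative only.

```python
import numpy as np
from sklearn.decomposition import TruncatedSVD

# Illustrative data: 100 embeddings with 64 dimensions
rng = np.random.default_rng(0)
x = rng.normal(size=(100, 64)).astype(np.float32)

# Fit an LSA model, mirroring Reducer.build
svd = TruncatedSVD(n_components=1, random_state=0)
svd.fit(x)

# Remove the top principal component in place, mirroring the
# n_components == 1 branch of Reducer.__call__
pc = svd.components_    # shape (1, 64)
x -= x.dot(pc.T) * pc   # projection onto pc, broadcast back to (100, 64)
```

With components > 1, the subtraction would instead be applied row by row as factor[x].dot(pc), which is what the module does to limit memory usage on large arrays.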
txtai/embeddings/index/stream.py

```diff
@@ -0,0 +1,67 @@
+"""
+Stream module
+"""
+
+from .autoid import AutoId
+from .transform import Action
+
+
+class Stream:
+    """
+    Yields input documents as standard (id, data, tags) tuples.
+    """
+
+    def __init__(self, embeddings, action=None):
+        """
+        Creates a new stream.
+
+        Args:
+            embeddings: embeddings instance
+            action: optional index action
+        """
+
+        self.embeddings = embeddings
+        self.action = action
+
+        # Alias embeddings attributes
+        self.config = embeddings.config
+
+        # Get config parameters
+        self.offset = self.config.get("offset", 0) if action == Action.UPSERT else 0
+        autoid = self.config.get("autoid", self.offset)
+
+        # Create autoid generator, reset int sequence if this isn't an UPSERT
+        autoid = 0 if isinstance(autoid, int) and action != Action.UPSERT else autoid
+        self.autoid = AutoId(autoid)
+
+    def __call__(self, documents):
+        """
+        Yields (id, data, tags) tuples from a stream of documents.
+
+        Args:
+            documents: input documents
+        """
+
+        # Iterate over documents and yield standard (id, data, tags) tuples
+        for document in documents:
+            if isinstance(document, dict):
+                # Create (id, data, tags) tuple from dictionary
+                document = document.get("id"), document, document.get("tags")
+            elif isinstance(document, tuple):
+                # Create (id, data, tags) tuple
+                document = document if len(document) >= 3 else (document[0], document[1], None)
+            else:
+                # Create (id, data, tags) tuple with empty fields
+                document = None, document, None
+
+            # Set autoid if the action is set
+            if self.action and document[0] is None:
+                document = (self.autoid(document[1]), document[1], document[2])
+
+            # Yield (id, data, tags) tuple
+            yield document
+
+        # Save autoid sequence if used
+        current = self.autoid.current()
+        if self.action and current:
+            self.config["autoid"] = current
```
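Stream's job is normalizing heterogeneous inputs (raw values, dicts, short tuples) into uniform (id, data, tags) tuples, generating ids only when an index action is active. A quick sketch of that behavior, assuming Stream is exported from txtai.embeddings.index per the package layout above; the SimpleNamespace stand-in for an Embeddings instance is a test double, since only its config attribute is read on this path.

```python
from types import SimpleNamespace

from txtai.embeddings.index import Stream

# Test double for an Embeddings instance; Stream only reads .config here
embeddings = SimpleNamespace(config={})
stream = Stream(embeddings)

documents = [
    "plain text",                      # -> (None, "plain text", None)
    {"id": "a", "text": "dict text"},  # -> ("a", {"id": "a", ...}, None)
    ("b", "tuple text"),               # -> ("b", "tuple text", None)
]

for uid, data, tags in stream(documents):
    print(uid, data, tags)
```

With an action set (e.g. Action.UPSERT), the None id in the first tuple would instead be filled by the AutoId generator and the sequence saved back to config.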
txtai/embeddings/index/transform.py

```diff
@@ -0,0 +1,205 @@
+"""
+Transform module
+"""
+
+import numpy as np
+
+from .action import Action
+
+
+class Transform:
+    """
+    Executes a transform. Processes a stream of documents, loads batches into enabled data stores and vectorizes documents.
+    """
+
+    def __init__(self, embeddings, action, checkpoint=None):
+        """
+        Creates a new transform.
+
+        Args:
+            embeddings: embeddings instance
+            action: index action
+            checkpoint: optional checkpoint directory, enables indexing restart
+        """
+
+        self.embeddings = embeddings
+        self.action = action
+        self.checkpoint = checkpoint
+
+        # Alias embeddings attributes
+        self.config = embeddings.config
+        self.delete = embeddings.delete
+        self.model = embeddings.model
+        self.database = embeddings.database
+        self.graph = embeddings.graph
+        self.indexes = embeddings.indexes
+        self.scoring = embeddings.scoring if embeddings.issparse() else None
+
+        # Get config parameters
+        self.offset = embeddings.config.get("offset", 0) if action == Action.UPSERT else 0
+        self.batch = embeddings.config.get("batch", 1024)
+
+        # Scalar quantization
+        quantize = embeddings.config.get("quantize")
+        self.qbits = quantize if isinstance(quantize, int) and not isinstance(quantize, bool) else None
+
+        # Transform columns
+        columns = embeddings.config.get("columns", {})
+        self.text = columns.get("text", "text")
+        self.object = columns.get("object", "object")
+
+        # Check if top-level indexing is enabled for this embeddings
+        self.indexing = embeddings.model or embeddings.scoring
+
+        # List of deleted ids with this action
+        self.deletes = set()
+
+    def __call__(self, documents, buffer):
+        """
+        Processes an iterable collection of documents, handles any iterable including generators.
+
+        This method loads a stream of documents into enabled data stores and vectorizes documents into an embeddings array.
+
+        Args:
+            documents: iterable of (id, data, tags)
+            buffer: file path used for memmap buffer
+
+        Returns:
+            (document ids, dimensions, embeddings)
+        """
+
+        # Return parameters
+        ids, dimensions, embeddings = None, None, None
+
+        if self.model:
+            ids, dimensions, embeddings = self.vectors(documents, buffer)
+        else:
+            ids = self.ids(documents)
+
+        return (ids, dimensions, embeddings)
+
+    def vectors(self, documents, buffer):
+        """
+        Runs a vectors transform operation when dense indexing is enabled.
+
+        Args:
+            documents: iterable of (id, data, tags)
+            buffer: file path used for memmap buffer
+
+        Returns:
+            (document ids, dimensions, embeddings)
+        """
+
+        # Determine dtype
+        dtype = np.uint8 if self.qbits else np.float32
+
+        # Transform documents into vectors
+        return self.model.vectors(self.stream(documents), self.batch, self.checkpoint, buffer, dtype)
+
+    def ids(self, documents):
+        """
+        Runs an ids transform operation when dense indexing is disabled.
+
+        Args:
+            documents: iterable of (id, data, tags)
+
+        Returns:
+            document ids
+        """
+
+        # Consume stream and extract ids
+        ids = []
+        for uid, _, _ in self.stream(documents):
+            ids.append(uid)
+
+        # Save offset when dense indexing is disabled
+        self.config["offset"] = self.offset
+
+        return ids
+
+    def stream(self, documents):
+        """
+        This method does two things:
+
+        1. Filter and yield data to vectorize
+        2. Batch and load original documents into enabled data stores (database, graph, scoring)
+
+        Documents are yielded for vectorization if one of the following is True:
+            - dict with a text or object field
+            - not a dict
+
+        Otherwise, documents are only batched and inserted into data stores.
+
+        Args:
+            documents: iterable collection of (id, data, tags)
+        """
+
+        # Batch and index offset. Index offset increments by count of documents streamed for vectorization
+        batch, offset = [], 0
+
+        # Iterate and process documents stream
+        for document in documents:
+            if isinstance(document[1], dict):
+                # Set text field to uid when top-level indexing is disabled and text is empty
+                if not self.indexing and not document[1].get(self.text):
+                    document[1][self.text] = str(document[0])
+
+                if self.text in document[1]:
+                    yield (document[0], document[1][self.text], document[2])
+                    offset += 1
+                elif self.object in document[1]:
+                    yield (document[0], document[1][self.object], document[2])
+                    offset += 1
+            else:
+                yield document
+                offset += 1
+
+            # Batch document
+            batch.append(document)
+            if len(batch) == self.batch:
+                self.load(batch, offset)
+                batch, offset = [], 0
+
+        # Final batch
+        if batch:
+            self.load(batch, offset)
+
+    def load(self, batch, offset):
+        """
+        Loads a document batch. This method deletes existing ids from an embeddings index and
+        loads into enabled data stores (database, graph, scoring).
+
+        Args:
+            batch: list of (id, data, tags)
+            offset: index offset for batch
+        """
+
+        # Delete from embeddings index first (which deletes from underlying indexes and data stores) if this is an upsert
+        if self.action == Action.UPSERT:
+            # Get list of ids not yet seen and deleted
+            deletes = [uid for uid, _, _ in batch if uid not in self.deletes]
+            if deletes:
+                # Execute delete
+                self.delete(deletes)
+
+                # Save deleted ids as a delete must only occur once per action
+                self.deletes.update(deletes)
+
+        # Load batch into database except if this is a reindex
+        if self.database and self.action != Action.REINDEX:
+            self.database.insert(batch, self.offset)
+
+        # Load batch into scoring
+        if self.scoring:
+            self.scoring.insert(batch, self.offset, self.checkpoint)
+
+        # Load batch into subindex documents stream
+        if self.indexes:
+            self.indexes.insert(batch, self.offset, self.checkpoint)
+
+        # Load batch into graph
+        if self.graph:
+            self.graph.insert(batch, self.offset)
+
+        # Increment offset
+        self.offset += offset
```
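One subtle line in __init__ is the scalar quantization check. Because bool is a subclass of int in Python, a plain isinstance(quantize, int) test would also match True and False, so the extra check ensures only genuine integer bit counts enable scalar quantization (and in turn select np.uint8 instead of np.float32 in vectors). A small stdlib-only demonstration of the check in isolation:

```python
# bool is a subclass of int in Python, so booleans must be excluded explicitly
for quantize in (8, 4, 1, True, False, None):
    qbits = quantize if isinstance(quantize, int) and not isinstance(quantize, bool) else None
    print(repr(quantize), "->", repr(qbits))

# 8 -> 8, 4 -> 4, 1 -> 1, True -> None, False -> None, None -> None
```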