mseep-txtai 9.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mseep_txtai-9.1.1.dist-info/METADATA +262 -0
- mseep_txtai-9.1.1.dist-info/RECORD +251 -0
- mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
- mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
- mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
- txtai/__init__.py +16 -0
- txtai/agent/__init__.py +12 -0
- txtai/agent/base.py +54 -0
- txtai/agent/factory.py +39 -0
- txtai/agent/model.py +107 -0
- txtai/agent/placeholder.py +16 -0
- txtai/agent/tool/__init__.py +7 -0
- txtai/agent/tool/embeddings.py +69 -0
- txtai/agent/tool/factory.py +130 -0
- txtai/agent/tool/function.py +49 -0
- txtai/ann/__init__.py +7 -0
- txtai/ann/base.py +153 -0
- txtai/ann/dense/__init__.py +11 -0
- txtai/ann/dense/annoy.py +72 -0
- txtai/ann/dense/factory.py +76 -0
- txtai/ann/dense/faiss.py +233 -0
- txtai/ann/dense/hnsw.py +104 -0
- txtai/ann/dense/numpy.py +164 -0
- txtai/ann/dense/pgvector.py +323 -0
- txtai/ann/dense/sqlite.py +303 -0
- txtai/ann/dense/torch.py +38 -0
- txtai/ann/sparse/__init__.py +7 -0
- txtai/ann/sparse/factory.py +61 -0
- txtai/ann/sparse/ivfsparse.py +377 -0
- txtai/ann/sparse/pgsparse.py +56 -0
- txtai/api/__init__.py +18 -0
- txtai/api/application.py +134 -0
- txtai/api/authorization.py +53 -0
- txtai/api/base.py +159 -0
- txtai/api/cluster.py +295 -0
- txtai/api/extension.py +19 -0
- txtai/api/factory.py +40 -0
- txtai/api/responses/__init__.py +7 -0
- txtai/api/responses/factory.py +30 -0
- txtai/api/responses/json.py +56 -0
- txtai/api/responses/messagepack.py +51 -0
- txtai/api/route.py +41 -0
- txtai/api/routers/__init__.py +25 -0
- txtai/api/routers/agent.py +38 -0
- txtai/api/routers/caption.py +42 -0
- txtai/api/routers/embeddings.py +280 -0
- txtai/api/routers/entity.py +42 -0
- txtai/api/routers/extractor.py +28 -0
- txtai/api/routers/labels.py +47 -0
- txtai/api/routers/llm.py +61 -0
- txtai/api/routers/objects.py +42 -0
- txtai/api/routers/openai.py +191 -0
- txtai/api/routers/rag.py +61 -0
- txtai/api/routers/reranker.py +46 -0
- txtai/api/routers/segmentation.py +42 -0
- txtai/api/routers/similarity.py +48 -0
- txtai/api/routers/summary.py +46 -0
- txtai/api/routers/tabular.py +42 -0
- txtai/api/routers/textractor.py +42 -0
- txtai/api/routers/texttospeech.py +33 -0
- txtai/api/routers/transcription.py +42 -0
- txtai/api/routers/translation.py +46 -0
- txtai/api/routers/upload.py +36 -0
- txtai/api/routers/workflow.py +28 -0
- txtai/app/__init__.py +5 -0
- txtai/app/base.py +821 -0
- txtai/archive/__init__.py +9 -0
- txtai/archive/base.py +104 -0
- txtai/archive/compress.py +51 -0
- txtai/archive/factory.py +25 -0
- txtai/archive/tar.py +49 -0
- txtai/archive/zip.py +35 -0
- txtai/cloud/__init__.py +8 -0
- txtai/cloud/base.py +106 -0
- txtai/cloud/factory.py +70 -0
- txtai/cloud/hub.py +101 -0
- txtai/cloud/storage.py +125 -0
- txtai/console/__init__.py +5 -0
- txtai/console/__main__.py +22 -0
- txtai/console/base.py +264 -0
- txtai/data/__init__.py +10 -0
- txtai/data/base.py +138 -0
- txtai/data/labels.py +42 -0
- txtai/data/questions.py +135 -0
- txtai/data/sequences.py +48 -0
- txtai/data/texts.py +68 -0
- txtai/data/tokens.py +28 -0
- txtai/database/__init__.py +14 -0
- txtai/database/base.py +342 -0
- txtai/database/client.py +227 -0
- txtai/database/duckdb.py +150 -0
- txtai/database/embedded.py +76 -0
- txtai/database/encoder/__init__.py +8 -0
- txtai/database/encoder/base.py +37 -0
- txtai/database/encoder/factory.py +56 -0
- txtai/database/encoder/image.py +43 -0
- txtai/database/encoder/serialize.py +28 -0
- txtai/database/factory.py +77 -0
- txtai/database/rdbms.py +569 -0
- txtai/database/schema/__init__.py +6 -0
- txtai/database/schema/orm.py +99 -0
- txtai/database/schema/statement.py +98 -0
- txtai/database/sql/__init__.py +8 -0
- txtai/database/sql/aggregate.py +178 -0
- txtai/database/sql/base.py +189 -0
- txtai/database/sql/expression.py +404 -0
- txtai/database/sql/token.py +342 -0
- txtai/database/sqlite.py +57 -0
- txtai/embeddings/__init__.py +7 -0
- txtai/embeddings/base.py +1107 -0
- txtai/embeddings/index/__init__.py +14 -0
- txtai/embeddings/index/action.py +15 -0
- txtai/embeddings/index/autoid.py +92 -0
- txtai/embeddings/index/configuration.py +71 -0
- txtai/embeddings/index/documents.py +86 -0
- txtai/embeddings/index/functions.py +155 -0
- txtai/embeddings/index/indexes.py +199 -0
- txtai/embeddings/index/indexids.py +60 -0
- txtai/embeddings/index/reducer.py +104 -0
- txtai/embeddings/index/stream.py +67 -0
- txtai/embeddings/index/transform.py +205 -0
- txtai/embeddings/search/__init__.py +11 -0
- txtai/embeddings/search/base.py +344 -0
- txtai/embeddings/search/errors.py +9 -0
- txtai/embeddings/search/explain.py +120 -0
- txtai/embeddings/search/ids.py +61 -0
- txtai/embeddings/search/query.py +69 -0
- txtai/embeddings/search/scan.py +196 -0
- txtai/embeddings/search/terms.py +46 -0
- txtai/graph/__init__.py +10 -0
- txtai/graph/base.py +769 -0
- txtai/graph/factory.py +61 -0
- txtai/graph/networkx.py +275 -0
- txtai/graph/query.py +181 -0
- txtai/graph/rdbms.py +113 -0
- txtai/graph/topics.py +166 -0
- txtai/models/__init__.py +9 -0
- txtai/models/models.py +268 -0
- txtai/models/onnx.py +133 -0
- txtai/models/pooling/__init__.py +9 -0
- txtai/models/pooling/base.py +141 -0
- txtai/models/pooling/cls.py +28 -0
- txtai/models/pooling/factory.py +144 -0
- txtai/models/pooling/late.py +173 -0
- txtai/models/pooling/mean.py +33 -0
- txtai/models/pooling/muvera.py +164 -0
- txtai/models/registry.py +37 -0
- txtai/models/tokendetection.py +122 -0
- txtai/pipeline/__init__.py +17 -0
- txtai/pipeline/audio/__init__.py +11 -0
- txtai/pipeline/audio/audiomixer.py +58 -0
- txtai/pipeline/audio/audiostream.py +94 -0
- txtai/pipeline/audio/microphone.py +244 -0
- txtai/pipeline/audio/signal.py +186 -0
- txtai/pipeline/audio/texttoaudio.py +60 -0
- txtai/pipeline/audio/texttospeech.py +553 -0
- txtai/pipeline/audio/transcription.py +212 -0
- txtai/pipeline/base.py +23 -0
- txtai/pipeline/data/__init__.py +10 -0
- txtai/pipeline/data/filetohtml.py +206 -0
- txtai/pipeline/data/htmltomd.py +414 -0
- txtai/pipeline/data/segmentation.py +178 -0
- txtai/pipeline/data/tabular.py +155 -0
- txtai/pipeline/data/textractor.py +139 -0
- txtai/pipeline/data/tokenizer.py +112 -0
- txtai/pipeline/factory.py +77 -0
- txtai/pipeline/hfmodel.py +111 -0
- txtai/pipeline/hfpipeline.py +96 -0
- txtai/pipeline/image/__init__.py +7 -0
- txtai/pipeline/image/caption.py +55 -0
- txtai/pipeline/image/imagehash.py +90 -0
- txtai/pipeline/image/objects.py +80 -0
- txtai/pipeline/llm/__init__.py +11 -0
- txtai/pipeline/llm/factory.py +86 -0
- txtai/pipeline/llm/generation.py +173 -0
- txtai/pipeline/llm/huggingface.py +218 -0
- txtai/pipeline/llm/litellm.py +90 -0
- txtai/pipeline/llm/llama.py +152 -0
- txtai/pipeline/llm/llm.py +75 -0
- txtai/pipeline/llm/rag.py +477 -0
- txtai/pipeline/nop.py +14 -0
- txtai/pipeline/tensors.py +52 -0
- txtai/pipeline/text/__init__.py +13 -0
- txtai/pipeline/text/crossencoder.py +70 -0
- txtai/pipeline/text/entity.py +140 -0
- txtai/pipeline/text/labels.py +137 -0
- txtai/pipeline/text/lateencoder.py +103 -0
- txtai/pipeline/text/questions.py +48 -0
- txtai/pipeline/text/reranker.py +57 -0
- txtai/pipeline/text/similarity.py +83 -0
- txtai/pipeline/text/summary.py +98 -0
- txtai/pipeline/text/translation.py +298 -0
- txtai/pipeline/train/__init__.py +7 -0
- txtai/pipeline/train/hfonnx.py +196 -0
- txtai/pipeline/train/hftrainer.py +398 -0
- txtai/pipeline/train/mlonnx.py +63 -0
- txtai/scoring/__init__.py +12 -0
- txtai/scoring/base.py +188 -0
- txtai/scoring/bm25.py +29 -0
- txtai/scoring/factory.py +95 -0
- txtai/scoring/pgtext.py +181 -0
- txtai/scoring/sif.py +32 -0
- txtai/scoring/sparse.py +218 -0
- txtai/scoring/terms.py +499 -0
- txtai/scoring/tfidf.py +358 -0
- txtai/serialize/__init__.py +10 -0
- txtai/serialize/base.py +85 -0
- txtai/serialize/errors.py +9 -0
- txtai/serialize/factory.py +29 -0
- txtai/serialize/messagepack.py +42 -0
- txtai/serialize/pickle.py +98 -0
- txtai/serialize/serializer.py +46 -0
- txtai/util/__init__.py +7 -0
- txtai/util/resolver.py +32 -0
- txtai/util/sparsearray.py +62 -0
- txtai/util/template.py +16 -0
- txtai/vectors/__init__.py +8 -0
- txtai/vectors/base.py +476 -0
- txtai/vectors/dense/__init__.py +12 -0
- txtai/vectors/dense/external.py +55 -0
- txtai/vectors/dense/factory.py +121 -0
- txtai/vectors/dense/huggingface.py +44 -0
- txtai/vectors/dense/litellm.py +86 -0
- txtai/vectors/dense/llama.py +84 -0
- txtai/vectors/dense/m2v.py +67 -0
- txtai/vectors/dense/sbert.py +92 -0
- txtai/vectors/dense/words.py +211 -0
- txtai/vectors/recovery.py +57 -0
- txtai/vectors/sparse/__init__.py +7 -0
- txtai/vectors/sparse/base.py +90 -0
- txtai/vectors/sparse/factory.py +55 -0
- txtai/vectors/sparse/sbert.py +34 -0
- txtai/version.py +6 -0
- txtai/workflow/__init__.py +8 -0
- txtai/workflow/base.py +184 -0
- txtai/workflow/execute.py +99 -0
- txtai/workflow/factory.py +42 -0
- txtai/workflow/task/__init__.py +18 -0
- txtai/workflow/task/base.py +490 -0
- txtai/workflow/task/console.py +24 -0
- txtai/workflow/task/export.py +64 -0
- txtai/workflow/task/factory.py +89 -0
- txtai/workflow/task/file.py +28 -0
- txtai/workflow/task/image.py +36 -0
- txtai/workflow/task/retrieve.py +61 -0
- txtai/workflow/task/service.py +102 -0
- txtai/workflow/task/storage.py +110 -0
- txtai/workflow/task/stream.py +33 -0
- txtai/workflow/task/template.py +116 -0
- txtai/workflow/task/url.py +20 -0
- txtai/workflow/task/workflow.py +14 -0
@@ -0,0 +1,14 @@
|
|
1
|
+
"""
|
2
|
+
Index imports
|
3
|
+
"""
|
4
|
+
|
5
|
+
from .action import Action
|
6
|
+
from .autoid import AutoId
|
7
|
+
from .configuration import Configuration
|
8
|
+
from .documents import Documents
|
9
|
+
from .functions import Functions
|
10
|
+
from .indexes import Indexes
|
11
|
+
from .indexids import IndexIds
|
12
|
+
from .reducer import Reducer
|
13
|
+
from .stream import Stream
|
14
|
+
from .transform import Transform
|
@@ -0,0 +1,92 @@
|
|
1
|
+
"""
|
2
|
+
AutoId module
|
3
|
+
"""
|
4
|
+
|
5
|
+
import inspect
|
6
|
+
import uuid
|
7
|
+
|
8
|
+
|
9
|
+
class AutoId:
    """
    Unique id generator. Ids come from either an incrementing integer sequence
    or a function in the standard library uuid module.
    """

    def __init__(self, method=None):
        """
        Builds an id generator.

        Args:
            method: empty or an int selects the integer sequence (the int is the starting value),
                    any other value is treated as the name of a uuid module function (i.e. "uuid4", "uuid5")
        """

        # Defaults, overridden below based on the selected method
        self.method = None
        self.function = None
        self.value = None

        # An int (or nothing at all) selects the sequence generator
        sequential = isinstance(method, int) or not method
        if sequential:
            # Sequence starts at the provided value, 0 when not set
            self.value = method or 0
            self.method = self.sequence
        else:
            # Look up the generation function by name in the uuid module
            self.function = getattr(uuid, method)
            self.method = self.uuid

        # Functions that accept a namespace argument derive the id from input data (deterministic)
        names = inspect.getfullargspec(self.function).args if self.function else []
        self.deterministic = "namespace" in names

    def __call__(self, data=None):
        """
        Produces the next id.

        Args:
            data: optional input, only read by deterministic uuid functions (i.e. uuid3, uuid5)

        Returns:
            unique id
        """

        generator = self.method
        return generator(data)

    # pylint: disable=W0613
    def sequence(self, data):
        """
        Returns the current sequence value and advances the sequence by one.

        Args:
            data: ignored

        Returns:
            sequence value before the increment
        """

        current = self.value
        self.value = current + 1

        return current

    def uuid(self, data):
        """
        Runs the configured uuid function and returns the result as a string.

        Args:
            data: input for deterministic functions (uuid3, uuid5), converted with str()

        Returns:
            UUID string
        """

        if self.deterministic:
            # Deterministic functions hash the data within the DNS namespace
            uid = self.function(uuid.NAMESPACE_DNS, str(data))
        else:
            uid = self.function()

        return str(uid)

    def current(self):
        """
        Reads the next sequence value without advancing it. This is None when a uuid function is configured.

        Returns:
            current sequence value
        """

        return self.value
|
@@ -0,0 +1,71 @@
|
|
1
|
+
"""
|
2
|
+
Configuration module
|
3
|
+
"""
|
4
|
+
|
5
|
+
import json
|
6
|
+
import os
|
7
|
+
|
8
|
+
from ...serialize import SerializeFactory
|
9
|
+
|
10
|
+
|
11
|
+
class Configuration:
    """
    Reads and writes index configuration files.
    """

    def load(self, path):
        """
        Reads index configuration from a directory. A config.json file is used when it exists,
        otherwise the legacy pickle file named config is read.

        Args:
            path: path to directory

        Returns:
            dict with an added "format" key set to "json" or "pickle"
        """

        if os.path.exists(f"{path}/config.json"):
            # JSON configuration
            with open(f"{path}/config.json", "r", encoding="utf-8") as handle:
                config = json.load(handle)

            config["format"] = "json"
        else:
            # Backwards-compatible pickle configuration
            with open(f"{path}/config", "rb") as handle:
                config = SerializeFactory.create("pickle").loadstream(handle)

            config["format"] = "pickle"

        return config

    def save(self, config, path):
        """
        Writes index configuration to a directory. JSON is written unless the configuration
        has a "format" value other than "json", in which case the legacy pickle file is written.

        Args:
            config: configuration to save
            path: path to directory
        """

        if config.get("format", "json") == "json":
            # Values that are not JSON serializable are written as strings
            with open(f"{path}/config.json", "w", encoding="utf-8") as handle:
                json.dump(config, handle, default=str, indent=2)
        else:
            # Backwards-compatible pickle configuration
            with open(f"{path}/config", "wb") as handle:
                SerializeFactory.create("pickle").savestream(config, handle)
|
@@ -0,0 +1,86 @@
|
|
1
|
+
"""
|
2
|
+
Documents module
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import tempfile
|
7
|
+
|
8
|
+
from ...serialize import SerializeFactory
|
9
|
+
|
10
|
+
|
11
|
+
class Documents:
    """
    Streams documents to temporary storage. Allows queuing large volumes of content for later indexing.
    """

    def __init__(self):
        """
        Creates a new documents stream.
        """

        # Temporary file handle, created on the first call to add
        self.documents = None

        # Number of batches written and total number of documents across all batches
        self.batch = 0
        self.size = 0

        # Pickle serialization - local temporary data
        self.serializer = SerializeFactory.create("pickle", allowpickle=True)

    def __len__(self):
        """
        Returns total number of queued documents.
        """

        return self.size

    def __iter__(self):
        """
        Streams all queued documents. Yields nothing when no batch has been added.
        """

        # Nothing was queued, there is no stream file to read
        if self.documents is None:
            return

        # Close streaming file, this flushes pending writes before the file is read back
        self.documents.close()

        # Open stream file
        with open(self.documents.name, "rb") as queue:
            # Read each batch, one serialized object was written per call to add
            for _ in range(self.batch):
                documents = self.serializer.loadstream(queue)

                # Yield each document
                yield from documents

    def add(self, documents):
        """
        Adds a batch of documents for indexing.

        Args:
            documents: list of (id, data, tag) tuples

        Returns:
            documents
        """

        # Create documents file if not already open
        # pylint: disable=R1732
        if self.documents is None:
            # delete=False keeps the file on disk after the handle is closed, close() removes it
            self.documents = tempfile.NamedTemporaryFile(mode="wb", suffix=".docs", delete=False)

        # Add batch
        self.serializer.savestream(documents, self.documents)
        self.batch += 1
        self.size += len(documents)

        return documents

    def close(self):
        """
        Closes and resets this instance. New sets of documents can be added with additional calls to add.
        Safe to call when no documents were queued.
        """

        if self.documents is not None:
            # Release the file handle first, it is still open when the stream was never iterated
            self.documents.close()

            # Cleanup stream file
            os.remove(self.documents.name)

        # Reset document parameters
        self.documents = None
        self.batch = 0
        self.size = 0
|
@@ -0,0 +1,155 @@
|
|
1
|
+
"""
|
2
|
+
Functions module
|
3
|
+
"""
|
4
|
+
|
5
|
+
from types import FunctionType, MethodType
|
6
|
+
|
7
|
+
|
8
|
+
class Functions:
    """
    Turns function configuration into callable function references.
    """

    def __init__(self, embeddings):
        """
        Builds a function resolver.

        Args:
            embeddings: embeddings instance
        """

        self.embeddings = embeddings

        # Every Reference created while resolving, kept so they can be reset later
        self.references = None

    def __call__(self, config):
        """
        Resolves each entry in config["functions"] to a function reference.

        Args:
            config: configuration

        Returns:
            list of function references
        """

        # Start a new list of stored references
        self.references = []

        resolved = []
        for entry in config["functions"]:
            if not isinstance(entry, dict):
                resolved.append(self.function(entry))
                continue

            # Resolve on a copy so the input configuration is left as-is
            clone = entry.copy()
            clone["function"] = self.function(clone["function"])
            resolved.append(clone)

        return resolved

    def reset(self):
        """
        Resets all stored references.
        """

        for reference in self.references or []:
            reference.reset()

    def function(self, function):
        """
        Resolves a single function configuration. A string is split on '.', with the first part
        looked up on the embeddings instance when it has that attribute, otherwise the string
        up to the last '.' is imported as a module. Remaining parts are wrapped in chained references,
        which are resolved when the reference is first invoked. Any non-string input is returned as-is.

        Args:
            function: function configuration

        Returns:
            function reference
        """

        # Only strings need to be resolved
        if not isinstance(function, str):
            return function

        head, *tail = function.split(".")

        if hasattr(self.embeddings, head):
            # First part is an attribute of the embeddings instance
            target = Reference(self.embeddings, head)
            self.references.append(target)
        else:
            # Import everything before the last '.', __import__ returns the top-level module
            target = __import__(function.rpartition(".")[0])

        # Chain a reference for each remaining part
        for part in tail:
            target = Reference(target, part)
            self.references.append(target)

        return target
|
91
|
+
|
92
|
+
|
93
|
+
class Reference:
    """
    Lazy handle to an attribute of an object. The lookup runs when the reference is called,
    which keeps functions independent of the initialization order of an embeddings instance.
    """

    def __init__(self, obj, attribute):
        """
        Builds a reference.

        Args:
            obj: object handle
            attribute: attribute name
        """

        # Original inputs, restored by reset
        self.inputs = (obj, attribute)

        # Working object handle and attribute, replaced with resolved values on first call
        self.obj, self.attribute = obj, attribute

        # Set once nested references have been resolved
        self.resolved = False

        # Decided on first call: truthy when the attribute should be invoked
        self.function = None

    def __call__(self, *args):
        """
        Looks up the attribute on the object. The attribute is invoked with args when it is
        treated as a function, otherwise the attribute value is returned.

        Args:
            args: list of function arguments to the object attribute, when attribute is a function

        Returns:
            object attribute function result or object attribute value
        """

        if not self.resolved:
            # Object and/or attribute may themselves be references, resolve those first
            if isinstance(self.obj, Reference):
                self.obj = self.obj()

            if isinstance(self.attribute, Reference):
                self.attribute = self.attribute()

            self.resolved = True

        target = getattr(self.obj, self.attribute)

        # Decision is made once and reused on later calls
        # Other callables are only invoked when arguments were passed on the first call
        if self.function is None:
            self.function = isinstance(target, (FunctionType, MethodType)) or (hasattr(target, "__call__") and args)

        if self.function:
            return target(*args)

        return target

    def reset(self):
        """
        Restores the original object and attribute inputs and marks this reference as unresolved.
        """

        self.resolved = False
        self.obj, self.attribute = self.inputs
|
@@ -0,0 +1,199 @@
|
|
1
|
+
"""
|
2
|
+
Indexes module
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
|
7
|
+
from .documents import Documents
|
8
|
+
|
9
|
+
|
10
|
+
class Indexes:
    """
    Manages a collection of subindexes for an embeddings instance.
    """

    def __init__(self, embeddings, indexes):
        """
        Creates a new indexes instance.

        Args:
            embeddings: embeddings instance
            indexes: dict of subindexes to add
        """

        self.embeddings = embeddings
        self.indexes = indexes

        # Queue of documents waiting to be indexed and the checkpoint directory for the current run
        self.documents = None
        self.checkpoint = None

        # Transform columns
        columns = embeddings.config.get("columns", {})
        self.text = columns.get("text", "text")
        self.object = columns.get("object", "object")

        # Check if top-level indexing is enabled for this embeddings instance
        self.indexing = embeddings.model or embeddings.scoring

    def __contains__(self, name):
        """
        Returns True if name is in this instance, False otherwise.

        Returns:
            True if name is in this instance, False otherwise
        """

        return name in self.indexes

    def __getitem__(self, name):
        """
        Looks up an index by name.

        Args:
            name: index name

        Returns:
            index
        """

        return self.indexes[name]

    def __getattr__(self, name):
        """
        Looks up an index by attribute name. Only called when normal attribute lookup fails.

        Args:
            name: index name

        Returns:
            index

        Raises:
            AttributeError: if there is no index with this name
        """

        try:
            # Read through __dict__, self.indexes would call this method again when indexes is not set yet
            return self.__dict__["indexes"][name]
        except Exception as e:
            raise AttributeError(e) from e

    def default(self):
        """
        Gets the default/first index.

        Returns:
            default index
        """

        return list(self.indexes.keys())[0]

    def findmodel(self, index=None):
        """
        Finds a vector model. If index is empty, the first vector model is returned.

        Args:
            index: index name to match

        Returns:
            Vectors
        """

        # Named index, return the model as-is (can be None)
        if index:
            return self.indexes[index].findmodel()

        # Return the first subindex model found, each subindex is queried once
        for subindex in self.indexes.values():
            model = subindex.findmodel()
            if model:
                return model

        return None

    def insert(self, documents, index=None, checkpoint=None):
        """
        Inserts a batch of documents into each subindex.

        Args:
            documents: list of (id, data, tags)
            index: indexid offset, must be an int when at least one document is queued
            checkpoint: optional checkpoint directory, enables indexing restart
        """

        # Documents defines __len__, an empty queue is falsy. Check for None so a queue that is
        # still empty isn't replaced, which would leave its temporary file behind.
        if self.documents is None:
            self.documents = Documents()
            self.checkpoint = checkpoint

        # Create batch containing documents added to parent index
        batch = []
        for _, document, _ in documents:
            # Add to documents collection if text or object field is set
            parent = document
            if isinstance(parent, dict):
                parent = parent.get(self.text, document.get(self.object))

            # Add if field is available or top-level indexing is disabled
            if parent is not None or not self.indexing:
                batch.append((index, document, None))
                index += 1

        # Add filtered documents batch
        self.documents.add(batch)

    def delete(self, ids):
        """
        Deletes ids from each subindex.

        Args:
            ids: list of ids to delete
        """

        for index in self.indexes.values():
            index.delete(ids)

    def index(self):
        """
        Builds each subindex.
        """

        for name, index in self.indexes.items():
            index.index(self.documents, checkpoint=f"{self.checkpoint}/{name}" if self.checkpoint else None)

        # Reset document stream, there is nothing to close when no documents were inserted
        if self.documents is not None:
            self.documents.close()

        self.documents = None
        self.checkpoint = None

    def upsert(self):
        """
        Runs upsert for each subindex.
        """

        for index in self.indexes.values():
            index.upsert(self.documents)

        # Reset document stream, there is nothing to close when no documents were inserted
        if self.documents is not None:
            self.documents.close()

        self.documents = None

    def load(self, path):
        """
        Loads each subindex from path.

        Args:
            path: directory path to load subindexes
        """

        for name, index in self.indexes.items():
            # Load subindex if it exists, subindexes aren't required to have data
            directory = os.path.join(path, name)
            if index.exists(directory):
                index.load(directory)

    def save(self, path):
        """
        Saves each subindex to path.

        Args:
            path: directory path to save subindexes
        """

        for name, index in self.indexes.items():
            index.save(os.path.join(path, name))

    def close(self):
        """
        Close and free resources used by this instance.
        """

        for index in self.indexes.values():
            index.close()
|
@@ -0,0 +1,60 @@
|
|
1
|
+
"""
|
2
|
+
IndexIds module
|
3
|
+
"""
|
4
|
+
|
5
|
+
from ...serialize import Serializer
|
6
|
+
|
7
|
+
|
8
|
+
class IndexIds:
    """
    Holds the list of index ids. Used when content storage is disabled.
    """

    def __init__(self, embeddings, ids=None):
        """
        Builds an IndexIds instance.

        Args:
            embeddings: embeddings instance
            ids: ids to store
        """

        self.ids = ids

        # Shared reference to the embeddings configuration, read by load for legacy ids
        self.config = embeddings.config

    def __iter__(self):
        """
        Iterates over the stored ids.
        """

        for uid in self.ids:
            yield uid

    def __getitem__(self, index):
        """
        Gets the id stored at index.
        """

        ids = self.ids
        return ids[index]

    def __setitem__(self, index, value):
        """
        Sets the id stored at index.
        """

        ids = self.ids
        ids[index] = value

    def __add__(self, ids):
        """
        Concatenates the stored ids with ids. Returns the combined ids, not an IndexIds instance.
        """

        combined = self.ids + ids
        return combined

    def load(self, path):
        """
        Reads ids into this instance.

        Args:
            path: path to load
        """

        if "ids" not in self.config:
            # Standard ids format
            self.ids = Serializer.load(path)
        else:
            # Legacy ids format, ids are moved out of the configuration
            self.ids = self.config.pop("ids")

    def save(self, path):
        """
        Writes the stored ids to path.

        Args:
            path: path to save
        """

        Serializer.save(self.ids, path)
|