mseep-txtai 9.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251) hide show
  1. mseep_txtai-9.1.1.dist-info/METADATA +262 -0
  2. mseep_txtai-9.1.1.dist-info/RECORD +251 -0
  3. mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
  4. mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
  5. mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
  6. txtai/__init__.py +16 -0
  7. txtai/agent/__init__.py +12 -0
  8. txtai/agent/base.py +54 -0
  9. txtai/agent/factory.py +39 -0
  10. txtai/agent/model.py +107 -0
  11. txtai/agent/placeholder.py +16 -0
  12. txtai/agent/tool/__init__.py +7 -0
  13. txtai/agent/tool/embeddings.py +69 -0
  14. txtai/agent/tool/factory.py +130 -0
  15. txtai/agent/tool/function.py +49 -0
  16. txtai/ann/__init__.py +7 -0
  17. txtai/ann/base.py +153 -0
  18. txtai/ann/dense/__init__.py +11 -0
  19. txtai/ann/dense/annoy.py +72 -0
  20. txtai/ann/dense/factory.py +76 -0
  21. txtai/ann/dense/faiss.py +233 -0
  22. txtai/ann/dense/hnsw.py +104 -0
  23. txtai/ann/dense/numpy.py +164 -0
  24. txtai/ann/dense/pgvector.py +323 -0
  25. txtai/ann/dense/sqlite.py +303 -0
  26. txtai/ann/dense/torch.py +38 -0
  27. txtai/ann/sparse/__init__.py +7 -0
  28. txtai/ann/sparse/factory.py +61 -0
  29. txtai/ann/sparse/ivfsparse.py +377 -0
  30. txtai/ann/sparse/pgsparse.py +56 -0
  31. txtai/api/__init__.py +18 -0
  32. txtai/api/application.py +134 -0
  33. txtai/api/authorization.py +53 -0
  34. txtai/api/base.py +159 -0
  35. txtai/api/cluster.py +295 -0
  36. txtai/api/extension.py +19 -0
  37. txtai/api/factory.py +40 -0
  38. txtai/api/responses/__init__.py +7 -0
  39. txtai/api/responses/factory.py +30 -0
  40. txtai/api/responses/json.py +56 -0
  41. txtai/api/responses/messagepack.py +51 -0
  42. txtai/api/route.py +41 -0
  43. txtai/api/routers/__init__.py +25 -0
  44. txtai/api/routers/agent.py +38 -0
  45. txtai/api/routers/caption.py +42 -0
  46. txtai/api/routers/embeddings.py +280 -0
  47. txtai/api/routers/entity.py +42 -0
  48. txtai/api/routers/extractor.py +28 -0
  49. txtai/api/routers/labels.py +47 -0
  50. txtai/api/routers/llm.py +61 -0
  51. txtai/api/routers/objects.py +42 -0
  52. txtai/api/routers/openai.py +191 -0
  53. txtai/api/routers/rag.py +61 -0
  54. txtai/api/routers/reranker.py +46 -0
  55. txtai/api/routers/segmentation.py +42 -0
  56. txtai/api/routers/similarity.py +48 -0
  57. txtai/api/routers/summary.py +46 -0
  58. txtai/api/routers/tabular.py +42 -0
  59. txtai/api/routers/textractor.py +42 -0
  60. txtai/api/routers/texttospeech.py +33 -0
  61. txtai/api/routers/transcription.py +42 -0
  62. txtai/api/routers/translation.py +46 -0
  63. txtai/api/routers/upload.py +36 -0
  64. txtai/api/routers/workflow.py +28 -0
  65. txtai/app/__init__.py +5 -0
  66. txtai/app/base.py +821 -0
  67. txtai/archive/__init__.py +9 -0
  68. txtai/archive/base.py +104 -0
  69. txtai/archive/compress.py +51 -0
  70. txtai/archive/factory.py +25 -0
  71. txtai/archive/tar.py +49 -0
  72. txtai/archive/zip.py +35 -0
  73. txtai/cloud/__init__.py +8 -0
  74. txtai/cloud/base.py +106 -0
  75. txtai/cloud/factory.py +70 -0
  76. txtai/cloud/hub.py +101 -0
  77. txtai/cloud/storage.py +125 -0
  78. txtai/console/__init__.py +5 -0
  79. txtai/console/__main__.py +22 -0
  80. txtai/console/base.py +264 -0
  81. txtai/data/__init__.py +10 -0
  82. txtai/data/base.py +138 -0
  83. txtai/data/labels.py +42 -0
  84. txtai/data/questions.py +135 -0
  85. txtai/data/sequences.py +48 -0
  86. txtai/data/texts.py +68 -0
  87. txtai/data/tokens.py +28 -0
  88. txtai/database/__init__.py +14 -0
  89. txtai/database/base.py +342 -0
  90. txtai/database/client.py +227 -0
  91. txtai/database/duckdb.py +150 -0
  92. txtai/database/embedded.py +76 -0
  93. txtai/database/encoder/__init__.py +8 -0
  94. txtai/database/encoder/base.py +37 -0
  95. txtai/database/encoder/factory.py +56 -0
  96. txtai/database/encoder/image.py +43 -0
  97. txtai/database/encoder/serialize.py +28 -0
  98. txtai/database/factory.py +77 -0
  99. txtai/database/rdbms.py +569 -0
  100. txtai/database/schema/__init__.py +6 -0
  101. txtai/database/schema/orm.py +99 -0
  102. txtai/database/schema/statement.py +98 -0
  103. txtai/database/sql/__init__.py +8 -0
  104. txtai/database/sql/aggregate.py +178 -0
  105. txtai/database/sql/base.py +189 -0
  106. txtai/database/sql/expression.py +404 -0
  107. txtai/database/sql/token.py +342 -0
  108. txtai/database/sqlite.py +57 -0
  109. txtai/embeddings/__init__.py +7 -0
  110. txtai/embeddings/base.py +1107 -0
  111. txtai/embeddings/index/__init__.py +14 -0
  112. txtai/embeddings/index/action.py +15 -0
  113. txtai/embeddings/index/autoid.py +92 -0
  114. txtai/embeddings/index/configuration.py +71 -0
  115. txtai/embeddings/index/documents.py +86 -0
  116. txtai/embeddings/index/functions.py +155 -0
  117. txtai/embeddings/index/indexes.py +199 -0
  118. txtai/embeddings/index/indexids.py +60 -0
  119. txtai/embeddings/index/reducer.py +104 -0
  120. txtai/embeddings/index/stream.py +67 -0
  121. txtai/embeddings/index/transform.py +205 -0
  122. txtai/embeddings/search/__init__.py +11 -0
  123. txtai/embeddings/search/base.py +344 -0
  124. txtai/embeddings/search/errors.py +9 -0
  125. txtai/embeddings/search/explain.py +120 -0
  126. txtai/embeddings/search/ids.py +61 -0
  127. txtai/embeddings/search/query.py +69 -0
  128. txtai/embeddings/search/scan.py +196 -0
  129. txtai/embeddings/search/terms.py +46 -0
  130. txtai/graph/__init__.py +10 -0
  131. txtai/graph/base.py +769 -0
  132. txtai/graph/factory.py +61 -0
  133. txtai/graph/networkx.py +275 -0
  134. txtai/graph/query.py +181 -0
  135. txtai/graph/rdbms.py +113 -0
  136. txtai/graph/topics.py +166 -0
  137. txtai/models/__init__.py +9 -0
  138. txtai/models/models.py +268 -0
  139. txtai/models/onnx.py +133 -0
  140. txtai/models/pooling/__init__.py +9 -0
  141. txtai/models/pooling/base.py +141 -0
  142. txtai/models/pooling/cls.py +28 -0
  143. txtai/models/pooling/factory.py +144 -0
  144. txtai/models/pooling/late.py +173 -0
  145. txtai/models/pooling/mean.py +33 -0
  146. txtai/models/pooling/muvera.py +164 -0
  147. txtai/models/registry.py +37 -0
  148. txtai/models/tokendetection.py +122 -0
  149. txtai/pipeline/__init__.py +17 -0
  150. txtai/pipeline/audio/__init__.py +11 -0
  151. txtai/pipeline/audio/audiomixer.py +58 -0
  152. txtai/pipeline/audio/audiostream.py +94 -0
  153. txtai/pipeline/audio/microphone.py +244 -0
  154. txtai/pipeline/audio/signal.py +186 -0
  155. txtai/pipeline/audio/texttoaudio.py +60 -0
  156. txtai/pipeline/audio/texttospeech.py +553 -0
  157. txtai/pipeline/audio/transcription.py +212 -0
  158. txtai/pipeline/base.py +23 -0
  159. txtai/pipeline/data/__init__.py +10 -0
  160. txtai/pipeline/data/filetohtml.py +206 -0
  161. txtai/pipeline/data/htmltomd.py +414 -0
  162. txtai/pipeline/data/segmentation.py +178 -0
  163. txtai/pipeline/data/tabular.py +155 -0
  164. txtai/pipeline/data/textractor.py +139 -0
  165. txtai/pipeline/data/tokenizer.py +112 -0
  166. txtai/pipeline/factory.py +77 -0
  167. txtai/pipeline/hfmodel.py +111 -0
  168. txtai/pipeline/hfpipeline.py +96 -0
  169. txtai/pipeline/image/__init__.py +7 -0
  170. txtai/pipeline/image/caption.py +55 -0
  171. txtai/pipeline/image/imagehash.py +90 -0
  172. txtai/pipeline/image/objects.py +80 -0
  173. txtai/pipeline/llm/__init__.py +11 -0
  174. txtai/pipeline/llm/factory.py +86 -0
  175. txtai/pipeline/llm/generation.py +173 -0
  176. txtai/pipeline/llm/huggingface.py +218 -0
  177. txtai/pipeline/llm/litellm.py +90 -0
  178. txtai/pipeline/llm/llama.py +152 -0
  179. txtai/pipeline/llm/llm.py +75 -0
  180. txtai/pipeline/llm/rag.py +477 -0
  181. txtai/pipeline/nop.py +14 -0
  182. txtai/pipeline/tensors.py +52 -0
  183. txtai/pipeline/text/__init__.py +13 -0
  184. txtai/pipeline/text/crossencoder.py +70 -0
  185. txtai/pipeline/text/entity.py +140 -0
  186. txtai/pipeline/text/labels.py +137 -0
  187. txtai/pipeline/text/lateencoder.py +103 -0
  188. txtai/pipeline/text/questions.py +48 -0
  189. txtai/pipeline/text/reranker.py +57 -0
  190. txtai/pipeline/text/similarity.py +83 -0
  191. txtai/pipeline/text/summary.py +98 -0
  192. txtai/pipeline/text/translation.py +298 -0
  193. txtai/pipeline/train/__init__.py +7 -0
  194. txtai/pipeline/train/hfonnx.py +196 -0
  195. txtai/pipeline/train/hftrainer.py +398 -0
  196. txtai/pipeline/train/mlonnx.py +63 -0
  197. txtai/scoring/__init__.py +12 -0
  198. txtai/scoring/base.py +188 -0
  199. txtai/scoring/bm25.py +29 -0
  200. txtai/scoring/factory.py +95 -0
  201. txtai/scoring/pgtext.py +181 -0
  202. txtai/scoring/sif.py +32 -0
  203. txtai/scoring/sparse.py +218 -0
  204. txtai/scoring/terms.py +499 -0
  205. txtai/scoring/tfidf.py +358 -0
  206. txtai/serialize/__init__.py +10 -0
  207. txtai/serialize/base.py +85 -0
  208. txtai/serialize/errors.py +9 -0
  209. txtai/serialize/factory.py +29 -0
  210. txtai/serialize/messagepack.py +42 -0
  211. txtai/serialize/pickle.py +98 -0
  212. txtai/serialize/serializer.py +46 -0
  213. txtai/util/__init__.py +7 -0
  214. txtai/util/resolver.py +32 -0
  215. txtai/util/sparsearray.py +62 -0
  216. txtai/util/template.py +16 -0
  217. txtai/vectors/__init__.py +8 -0
  218. txtai/vectors/base.py +476 -0
  219. txtai/vectors/dense/__init__.py +12 -0
  220. txtai/vectors/dense/external.py +55 -0
  221. txtai/vectors/dense/factory.py +121 -0
  222. txtai/vectors/dense/huggingface.py +44 -0
  223. txtai/vectors/dense/litellm.py +86 -0
  224. txtai/vectors/dense/llama.py +84 -0
  225. txtai/vectors/dense/m2v.py +67 -0
  226. txtai/vectors/dense/sbert.py +92 -0
  227. txtai/vectors/dense/words.py +211 -0
  228. txtai/vectors/recovery.py +57 -0
  229. txtai/vectors/sparse/__init__.py +7 -0
  230. txtai/vectors/sparse/base.py +90 -0
  231. txtai/vectors/sparse/factory.py +55 -0
  232. txtai/vectors/sparse/sbert.py +34 -0
  233. txtai/version.py +6 -0
  234. txtai/workflow/__init__.py +8 -0
  235. txtai/workflow/base.py +184 -0
  236. txtai/workflow/execute.py +99 -0
  237. txtai/workflow/factory.py +42 -0
  238. txtai/workflow/task/__init__.py +18 -0
  239. txtai/workflow/task/base.py +490 -0
  240. txtai/workflow/task/console.py +24 -0
  241. txtai/workflow/task/export.py +64 -0
  242. txtai/workflow/task/factory.py +89 -0
  243. txtai/workflow/task/file.py +28 -0
  244. txtai/workflow/task/image.py +36 -0
  245. txtai/workflow/task/retrieve.py +61 -0
  246. txtai/workflow/task/service.py +102 -0
  247. txtai/workflow/task/storage.py +110 -0
  248. txtai/workflow/task/stream.py +33 -0
  249. txtai/workflow/task/template.py +116 -0
  250. txtai/workflow/task/url.py +20 -0
  251. txtai/workflow/task/workflow.py +14 -0
@@ -0,0 +1,14 @@
1
+ """
2
+ Index imports
3
+ """
4
+
5
+ from .action import Action
6
+ from .autoid import AutoId
7
+ from .configuration import Configuration
8
+ from .documents import Documents
9
+ from .functions import Functions
10
+ from .indexes import Indexes
11
+ from .indexids import IndexIds
12
+ from .reducer import Reducer
13
+ from .stream import Stream
14
+ from .transform import Transform
@@ -0,0 +1,15 @@
1
+ """
2
+ Action module
3
+ """
4
+
5
+ from enum import Enum
6
+
7
+
8
class Action(Enum):
    """
    Enumeration of the index action types.
    """

    # Members carry sequential integer values starting at 1
    INDEX = 1
    UPSERT = 2
    REINDEX = 3
@@ -0,0 +1,92 @@
1
+ """
2
+ AutoId module
3
+ """
4
+
5
+ import inspect
6
+ import uuid
7
+
8
+
9
class AutoId:
    """
    Unique id generator. Ids come from either an incrementing integer sequence
    or a function looked up by name in the standard uuid module.
    """

    def __init__(self, method=None):
        """
        Builds a new id generator.

        Args:
            method: id generation method. Empty or an int selects an incrementing sequence
                    (the int is the starting value, 0 when empty). Any other value is treated
                    as the name of a uuid module function (for example "uuid4" or "uuid5").
        """

        # Defaults, overridden below depending on the selected method
        self.function, self.value = None, None

        if method and not isinstance(method, int):
            # Named uuid module function
            self.function = getattr(uuid, method)
            self.method = self.uuid
        else:
            # Incrementing sequence (default)
            self.value = method or 0
            self.method = self.sequence

        # A uuid function that accepts a namespace argument is deterministic (derived from input data)
        self.deterministic = bool(self.function) and "namespace" in inspect.getfullargspec(self.function).args

    def __call__(self, data=None):
        """
        Produces the next unique id.

        Args:
            data: optional data, only used by deterministic algorithms (i.e. uuid3, uuid5)

        Returns:
            unique id
        """

        generate = self.method
        return generate(data)

    # pylint: disable=W0613
    def sequence(self, data):
        """
        Returns the current sequence value and advances the sequence by one.

        Args:
            data: not used

        Returns:
            sequence value before the increment
        """

        current = self.value
        self.value = current + 1

        return current

    def uuid(self, data):
        """
        Builds a UUID with the configured uuid function and returns it as a string.

        Args:
            data: used with deterministic algorithms (uuid3, uuid5), ignored otherwise

        Returns:
            UUID string
        """

        if self.deterministic:
            # Deterministic algorithms hash the string form of data within the DNS namespace
            return str(self.function(uuid.NAMESPACE_DNS, str(data)))

        return str(self.function())

    def current(self):
        """
        Reads the current sequence value without advancing it. This is None when a UUID method is configured.

        Returns:
            current sequence value
        """

        return self.value
@@ -0,0 +1,71 @@
1
+ """
2
+ Configuration module
3
+ """
4
+
5
+ import json
6
+ import os
7
+
8
+ from ...serialize import SerializeFactory
9
+
10
+
11
class Configuration:
    """
    Reads and writes index configuration stored in a directory.
    """

    def load(self, path):
        """
        Reads index configuration from a directory. A config.json file is used when present,
        otherwise the legacy pickle file named config is read.

        Args:
            path: path to directory

        Returns:
            dict with an added "format" key set to "json" or "pickle"
        """

        if os.path.exists(f"{path}/config.json"):
            # JSON configuration
            with open(f"{path}/config.json", "r", encoding="utf-8") as handle:
                config = json.load(handle)

            config["format"] = "json"
        else:
            # Backwards-compatible pickle configuration
            with open(f"{path}/config", "rb") as handle:
                config = SerializeFactory.create("pickle").loadstream(handle)

            config["format"] = "pickle"

        return config

    def save(self, config, path):
        """
        Writes index configuration to a directory. JSON is written unless config has a
        "format" value other than "json", in which case the legacy pickle file is written.

        Args:
            config: configuration to save
            path: path to directory
        """

        if config.get("format", "json") == "json":
            # JSON configuration, values json can't encode are written via str()
            with open(f"{path}/config.json", "w", encoding="utf-8") as handle:
                json.dump(config, handle, default=str, indent=2)
        else:
            # Backwards-compatible pickle configuration
            with open(f"{path}/config", "wb") as handle:
                SerializeFactory.create("pickle").savestream(config, handle)
@@ -0,0 +1,86 @@
1
+ """
2
+ Documents module
3
+ """
4
+
5
+ import os
6
+ import tempfile
7
+
8
+ from ...serialize import SerializeFactory
9
+
10
+
11
class Documents:
    """
    Streams documents to temporary storage. Allows queuing large volumes of content for later indexing.
    """

    def __init__(self):
        """
        Creates a new documents stream.
        """

        # Temporary file handle, created on the first call to add
        self.documents = None

        # Number of batches written and total number of queued documents
        self.batch = 0
        self.size = 0

        # Pickle serialization - local temporary data
        self.serializer = SerializeFactory.create("pickle", allowpickle=True)

    def __len__(self):
        """
        Returns total number of queued documents.
        """

        return self.size

    def __iter__(self):
        """
        Streams all queued documents. Yields nothing when no documents have been added.
        """

        # Nothing was queued, there is no stream file to read
        if self.documents is None:
            return

        # Close streaming file, this flushes pending writes before the file is read back
        self.documents.close()

        # Open stream file
        with open(self.documents.name, "rb") as queue:
            # Read each batch
            for _ in range(self.batch):
                # Yield each document
                yield from self.serializer.loadstream(queue)

    def add(self, documents):
        """
        Adds a batch of documents for indexing.

        Args:
            documents: list of (id, data, tag) tuples

        Returns:
            documents
        """

        # Create documents file if not already open
        # pylint: disable=R1732
        if not self.documents:
            self.documents = tempfile.NamedTemporaryFile(mode="wb", suffix=".docs", delete=False)

        # Add batch
        self.serializer.savestream(documents, self.documents)
        self.batch += 1
        self.size += len(documents)

        return documents

    def close(self):
        """
        Closes and resets this instance. New sets of documents can be added with additional calls to add.
        This is a no-op when no documents have been added.
        """

        if self.documents is not None:
            # Release the file handle before deleting, closing an already closed file is a no-op
            self.documents.close()

            # Cleanup stream file
            os.remove(self.documents.name)

        # Reset document parameters
        self.documents = None
        self.batch = 0
        self.size = 0
@@ -0,0 +1,155 @@
1
+ """
2
+ Functions module
3
+ """
4
+
5
+ from types import FunctionType, MethodType
6
+
7
+
8
class Functions:
    """
    Turns function configuration into function references.
    """

    def __init__(self, embeddings):
        """
        Builds a new function resolver.

        Args:
            embeddings: embeddings instance
        """

        self.embeddings = embeddings

        # Every Reference created while resolving, populated by __call__
        self.references = None

    def __call__(self, config):
        """
        Resolves each entry in config["functions"] to a function reference.

        Args:
            config: configuration

        Returns:
            list of function references, dict entries are copied with their "function" value resolved
        """

        # Start with a fresh list of stored references
        self.references = []

        resolved = []
        for entry in config["functions"]:
            if not isinstance(entry, dict):
                resolved.append(self.function(entry))
                continue

            # Work on a copy so the caller's configuration isn't modified
            entry = entry.copy()
            entry["function"] = self.function(entry["function"])
            resolved.append(entry)

        return resolved

    def reset(self):
        """
        Resets every reference created by this resolver.
        """

        for reference in self.references or []:
            reference.reset()

    def function(self, function):
        """
        Resolves a single function configuration. Non-string inputs are returned as is.

        A string is split on '.'. When the first part names an attribute of the embeddings instance,
        resolution starts there. Otherwise, all parts except the last are imported as a module
        (__import__ returns the top-level package). Each remaining part is wrapped in a Reference,
        which resolves the attribute when it is invoked.

        Args:
            function: function configuration

        Returns:
            function reference
        """

        if not isinstance(function, str):
            return function

        names = function.split(".")
        if hasattr(self.embeddings, names[0]):
            # Attribute of the embeddings instance
            target = Reference(self.embeddings, names[0])
            self.references.append(target)
        else:
            # Module import
            target = __import__(".".join(names[:-1]))

        # Chain a reference for each remaining part
        for name in names[1:]:
            target = Reference(target, name)
            self.references.append(target)

        return target
91
+
92
+
93
class Reference:
    """
    Stores a reference to an object attribute. This attribute is resolved by invoking the __call__ method.
    This allows for functions to be independent of the initialization order of an embeddings instance.
    """

    def __init__(self, obj, attribute):
        """
        Create a new reference.

        Args:
            obj: object handle, can itself be a Reference
            attribute: attribute name, can itself be a Reference
        """

        # Object handle and attribute
        self.obj = obj
        self.attribute = attribute

        # Keep a handle to the original inputs, restored by reset
        self.inputs = (obj, attribute)

        # True if the object and attribute have been resolved
        self.resolved = False

        # True if the attribute is a function, None until the first invocation decides
        self.function = None

    def __call__(self, *args):
        """
        Resolves an object attribute reference. If the attribute is a function, the function is executed.
        Otherwise, the object attribute value is returned.

        Args:
            args: list of function arguments to the object attribute, when attribute is a function

        Returns:
            object attribute function result or object attribute value
        """

        # Resolve nested references, if necessary
        if not self.resolved:
            if isinstance(self.obj, Reference):
                self.obj = self.obj()

            if isinstance(self.attribute, Reference):
                self.attribute = self.attribute()

            self.resolved = True

        # Lookup attribute
        attribute = getattr(self.obj, self.attribute)

        # Determine if attribute is a function. This is decided once, on the first invocation.
        # Python functions/methods always count, other callables only when arguments were passed.
        # Store a bool so the first call's arguments aren't kept alive by this flag.
        if self.function is None:
            self.function = isinstance(attribute, (FunctionType, MethodType)) or (hasattr(attribute, "__call__") and bool(args))

        # If attribute is a function, execute and return, otherwise return attribute
        return attribute(*args) if self.function else attribute

    def reset(self):
        """
        Clears resolved references by restoring the original object and attribute inputs.
        """

        self.obj, self.attribute = self.inputs
        self.resolved = False
@@ -0,0 +1,199 @@
1
+ """
2
+ Indexes module
3
+ """
4
+
5
+ import os
6
+
7
+ from .documents import Documents
8
+
9
+
10
class Indexes:
    """
    Manages a collection of subindexes for an embeddings instance.
    """

    def __init__(self, embeddings, indexes):
        """
        Creates a new indexes instance.

        Args:
            embeddings: embeddings instance
            indexes: dict of subindexes to add
        """

        self.embeddings = embeddings
        self.indexes = indexes

        # Queued documents stream and checkpoint directory, both set by insert
        self.documents = None
        self.checkpoint = None

        # Transform columns
        columns = embeddings.config.get("columns", {})
        self.text = columns.get("text", "text")
        self.object = columns.get("object", "object")

        # Check if top-level indexing is enabled for this embeddings instance
        self.indexing = embeddings.model or embeddings.scoring

    def __contains__(self, name):
        """
        Returns True if name is in this instance, False otherwise.

        Returns:
            True if name is in this instance, False otherwise
        """

        return name in self.indexes

    def __getitem__(self, name):
        """
        Looks up an index by name.

        Args:
            name: index name

        Returns:
            index
        """

        return self.indexes[name]

    def __getattr__(self, name):
        """
        Looks up an index by attribute name. Only invoked when normal attribute lookup fails.

        Args:
            name: index name

        Returns:
            index

        Raises:
            AttributeError: if name is not an index
        """

        # Read through __dict__, going through self.indexes would re-enter this method
        # and recurse when the attribute isn't set yet (i.e. instance created without __init__)
        indexes = self.__dict__.get("indexes")
        if indexes is None:
            raise AttributeError(name)

        try:
            return indexes[name]
        except Exception as e:
            raise AttributeError(e) from e

    def default(self):
        """
        Gets the default/first index.

        Returns:
            name of the default index
        """

        return list(self.indexes.keys())[0]

    def findmodel(self, index=None):
        """
        Finds a vector model. If index is empty, the first vector model is returned.

        Args:
            index: index name to match

        Returns:
            Vectors
        """

        # Use the named subindex when provided
        if index:
            return self.indexes[index].findmodel()

        # Otherwise return the first model found, each subindex is only queried once
        for subindex in self.indexes.values():
            model = subindex.findmodel()
            if model:
                return model

        return None

    def insert(self, documents, index=None, checkpoint=None):
        """
        Inserts a batch of documents into each subindex.

        Args:
            documents: list of (id, data, tags)
            index: indexid offset
            checkpoint: optional checkpoint directory, enables indexing restart
        """

        # Compare with None, a documents stream with no queued documents is falsy
        # and must not be replaced (its temporary file would never be cleaned up)
        if self.documents is None:
            self.documents = Documents()
            self.checkpoint = checkpoint

        # Create batch containing documents added to parent index
        batch = []
        for _, document, _ in documents:
            # Add to documents collection if text or object field is set
            parent = document
            if isinstance(parent, dict):
                parent = parent.get(self.text, document.get(self.object))

            # Add if field is available or top-level indexing is disabled
            if parent is not None or not self.indexing:
                batch.append((index, document, None))
                index += 1

        # Add filtered documents batch
        self.documents.add(batch)

    def delete(self, ids):
        """
        Deletes ids from each subindex.

        Args:
            ids: list of ids to delete
        """

        for index in self.indexes.values():
            index.delete(ids)

    def index(self):
        """
        Builds each subindex.
        """

        for name, index in self.indexes.items():
            index.index(self.documents, checkpoint=f"{self.checkpoint}/{name}" if self.checkpoint else None)

        # Reset document stream
        self.documents.close()
        self.documents = None
        self.checkpoint = None

    def upsert(self):
        """
        Runs upsert for each subindex.
        """

        for index in self.indexes.values():
            index.upsert(self.documents)

        # Reset document stream
        self.documents.close()
        self.documents = None

    def load(self, path):
        """
        Loads each subindex from path.

        Args:
            path: directory path to load subindexes
        """

        for name, index in self.indexes.items():
            # Load subindex if it exists, subindexes aren't required to have data
            directory = os.path.join(path, name)
            if index.exists(directory):
                index.load(directory)

    def save(self, path):
        """
        Saves each subindex to path.

        Args:
            path: directory path to save subindexes
        """

        for name, index in self.indexes.items():
            index.save(os.path.join(path, name))

    def close(self):
        """
        Close and free resources used by this instance.
        """

        for index in self.indexes.values():
            index.close()
@@ -0,0 +1,60 @@
1
+ """
2
+ IndexIds module
3
+ """
4
+
5
+ from ...serialize import Serializer
6
+
7
+
8
class IndexIds:
    """
    Holds index ids when content storage is disabled.
    """

    def __init__(self, embeddings, ids=None):
        """
        Builds an IndexIds instance.

        Args:
            embeddings: embeddings instance, its config is kept for loading legacy ids
            ids: ids to store
        """

        self.ids = ids
        self.config = embeddings.config

    def __iter__(self):
        """
        Iterates over the stored ids.
        """

        for uid in self.ids:
            yield uid

    def __getitem__(self, index):
        """
        Gets the id stored at index.
        """

        return self.ids[index]

    def __setitem__(self, index, value):
        """
        Replaces the id stored at index with value.
        """

        self.ids[index] = value

    def __add__(self, ids):
        """
        Returns the stored ids combined with ids via the + operator. The stored ids are not replaced.
        """

        return self.ids + ids

    def load(self, path):
        """
        Loads IndexIds. Ids found under the "ids" key of the configuration (legacy format) are
        moved out of the configuration, otherwise ids are read from path.

        Args:
            path: path to load
        """

        if "ids" not in self.config:
            # Standard ids format
            self.ids = Serializer.load(path)
        else:
            # Legacy ids format
            self.ids = self.config.pop("ids")

    def save(self, path):
        """
        Writes the stored ids to path.

        Args:
            path: path to save
        """

        Serializer.save(self.ids, path)