mseep-txtai 9.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251) hide show
  1. mseep_txtai-9.1.1.dist-info/METADATA +262 -0
  2. mseep_txtai-9.1.1.dist-info/RECORD +251 -0
  3. mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
  4. mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
  5. mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
  6. txtai/__init__.py +16 -0
  7. txtai/agent/__init__.py +12 -0
  8. txtai/agent/base.py +54 -0
  9. txtai/agent/factory.py +39 -0
  10. txtai/agent/model.py +107 -0
  11. txtai/agent/placeholder.py +16 -0
  12. txtai/agent/tool/__init__.py +7 -0
  13. txtai/agent/tool/embeddings.py +69 -0
  14. txtai/agent/tool/factory.py +130 -0
  15. txtai/agent/tool/function.py +49 -0
  16. txtai/ann/__init__.py +7 -0
  17. txtai/ann/base.py +153 -0
  18. txtai/ann/dense/__init__.py +11 -0
  19. txtai/ann/dense/annoy.py +72 -0
  20. txtai/ann/dense/factory.py +76 -0
  21. txtai/ann/dense/faiss.py +233 -0
  22. txtai/ann/dense/hnsw.py +104 -0
  23. txtai/ann/dense/numpy.py +164 -0
  24. txtai/ann/dense/pgvector.py +323 -0
  25. txtai/ann/dense/sqlite.py +303 -0
  26. txtai/ann/dense/torch.py +38 -0
  27. txtai/ann/sparse/__init__.py +7 -0
  28. txtai/ann/sparse/factory.py +61 -0
  29. txtai/ann/sparse/ivfsparse.py +377 -0
  30. txtai/ann/sparse/pgsparse.py +56 -0
  31. txtai/api/__init__.py +18 -0
  32. txtai/api/application.py +134 -0
  33. txtai/api/authorization.py +53 -0
  34. txtai/api/base.py +159 -0
  35. txtai/api/cluster.py +295 -0
  36. txtai/api/extension.py +19 -0
  37. txtai/api/factory.py +40 -0
  38. txtai/api/responses/__init__.py +7 -0
  39. txtai/api/responses/factory.py +30 -0
  40. txtai/api/responses/json.py +56 -0
  41. txtai/api/responses/messagepack.py +51 -0
  42. txtai/api/route.py +41 -0
  43. txtai/api/routers/__init__.py +25 -0
  44. txtai/api/routers/agent.py +38 -0
  45. txtai/api/routers/caption.py +42 -0
  46. txtai/api/routers/embeddings.py +280 -0
  47. txtai/api/routers/entity.py +42 -0
  48. txtai/api/routers/extractor.py +28 -0
  49. txtai/api/routers/labels.py +47 -0
  50. txtai/api/routers/llm.py +61 -0
  51. txtai/api/routers/objects.py +42 -0
  52. txtai/api/routers/openai.py +191 -0
  53. txtai/api/routers/rag.py +61 -0
  54. txtai/api/routers/reranker.py +46 -0
  55. txtai/api/routers/segmentation.py +42 -0
  56. txtai/api/routers/similarity.py +48 -0
  57. txtai/api/routers/summary.py +46 -0
  58. txtai/api/routers/tabular.py +42 -0
  59. txtai/api/routers/textractor.py +42 -0
  60. txtai/api/routers/texttospeech.py +33 -0
  61. txtai/api/routers/transcription.py +42 -0
  62. txtai/api/routers/translation.py +46 -0
  63. txtai/api/routers/upload.py +36 -0
  64. txtai/api/routers/workflow.py +28 -0
  65. txtai/app/__init__.py +5 -0
  66. txtai/app/base.py +821 -0
  67. txtai/archive/__init__.py +9 -0
  68. txtai/archive/base.py +104 -0
  69. txtai/archive/compress.py +51 -0
  70. txtai/archive/factory.py +25 -0
  71. txtai/archive/tar.py +49 -0
  72. txtai/archive/zip.py +35 -0
  73. txtai/cloud/__init__.py +8 -0
  74. txtai/cloud/base.py +106 -0
  75. txtai/cloud/factory.py +70 -0
  76. txtai/cloud/hub.py +101 -0
  77. txtai/cloud/storage.py +125 -0
  78. txtai/console/__init__.py +5 -0
  79. txtai/console/__main__.py +22 -0
  80. txtai/console/base.py +264 -0
  81. txtai/data/__init__.py +10 -0
  82. txtai/data/base.py +138 -0
  83. txtai/data/labels.py +42 -0
  84. txtai/data/questions.py +135 -0
  85. txtai/data/sequences.py +48 -0
  86. txtai/data/texts.py +68 -0
  87. txtai/data/tokens.py +28 -0
  88. txtai/database/__init__.py +14 -0
  89. txtai/database/base.py +342 -0
  90. txtai/database/client.py +227 -0
  91. txtai/database/duckdb.py +150 -0
  92. txtai/database/embedded.py +76 -0
  93. txtai/database/encoder/__init__.py +8 -0
  94. txtai/database/encoder/base.py +37 -0
  95. txtai/database/encoder/factory.py +56 -0
  96. txtai/database/encoder/image.py +43 -0
  97. txtai/database/encoder/serialize.py +28 -0
  98. txtai/database/factory.py +77 -0
  99. txtai/database/rdbms.py +569 -0
  100. txtai/database/schema/__init__.py +6 -0
  101. txtai/database/schema/orm.py +99 -0
  102. txtai/database/schema/statement.py +98 -0
  103. txtai/database/sql/__init__.py +8 -0
  104. txtai/database/sql/aggregate.py +178 -0
  105. txtai/database/sql/base.py +189 -0
  106. txtai/database/sql/expression.py +404 -0
  107. txtai/database/sql/token.py +342 -0
  108. txtai/database/sqlite.py +57 -0
  109. txtai/embeddings/__init__.py +7 -0
  110. txtai/embeddings/base.py +1107 -0
  111. txtai/embeddings/index/__init__.py +14 -0
  112. txtai/embeddings/index/action.py +15 -0
  113. txtai/embeddings/index/autoid.py +92 -0
  114. txtai/embeddings/index/configuration.py +71 -0
  115. txtai/embeddings/index/documents.py +86 -0
  116. txtai/embeddings/index/functions.py +155 -0
  117. txtai/embeddings/index/indexes.py +199 -0
  118. txtai/embeddings/index/indexids.py +60 -0
  119. txtai/embeddings/index/reducer.py +104 -0
  120. txtai/embeddings/index/stream.py +67 -0
  121. txtai/embeddings/index/transform.py +205 -0
  122. txtai/embeddings/search/__init__.py +11 -0
  123. txtai/embeddings/search/base.py +344 -0
  124. txtai/embeddings/search/errors.py +9 -0
  125. txtai/embeddings/search/explain.py +120 -0
  126. txtai/embeddings/search/ids.py +61 -0
  127. txtai/embeddings/search/query.py +69 -0
  128. txtai/embeddings/search/scan.py +196 -0
  129. txtai/embeddings/search/terms.py +46 -0
  130. txtai/graph/__init__.py +10 -0
  131. txtai/graph/base.py +769 -0
  132. txtai/graph/factory.py +61 -0
  133. txtai/graph/networkx.py +275 -0
  134. txtai/graph/query.py +181 -0
  135. txtai/graph/rdbms.py +113 -0
  136. txtai/graph/topics.py +166 -0
  137. txtai/models/__init__.py +9 -0
  138. txtai/models/models.py +268 -0
  139. txtai/models/onnx.py +133 -0
  140. txtai/models/pooling/__init__.py +9 -0
  141. txtai/models/pooling/base.py +141 -0
  142. txtai/models/pooling/cls.py +28 -0
  143. txtai/models/pooling/factory.py +144 -0
  144. txtai/models/pooling/late.py +173 -0
  145. txtai/models/pooling/mean.py +33 -0
  146. txtai/models/pooling/muvera.py +164 -0
  147. txtai/models/registry.py +37 -0
  148. txtai/models/tokendetection.py +122 -0
  149. txtai/pipeline/__init__.py +17 -0
  150. txtai/pipeline/audio/__init__.py +11 -0
  151. txtai/pipeline/audio/audiomixer.py +58 -0
  152. txtai/pipeline/audio/audiostream.py +94 -0
  153. txtai/pipeline/audio/microphone.py +244 -0
  154. txtai/pipeline/audio/signal.py +186 -0
  155. txtai/pipeline/audio/texttoaudio.py +60 -0
  156. txtai/pipeline/audio/texttospeech.py +553 -0
  157. txtai/pipeline/audio/transcription.py +212 -0
  158. txtai/pipeline/base.py +23 -0
  159. txtai/pipeline/data/__init__.py +10 -0
  160. txtai/pipeline/data/filetohtml.py +206 -0
  161. txtai/pipeline/data/htmltomd.py +414 -0
  162. txtai/pipeline/data/segmentation.py +178 -0
  163. txtai/pipeline/data/tabular.py +155 -0
  164. txtai/pipeline/data/textractor.py +139 -0
  165. txtai/pipeline/data/tokenizer.py +112 -0
  166. txtai/pipeline/factory.py +77 -0
  167. txtai/pipeline/hfmodel.py +111 -0
  168. txtai/pipeline/hfpipeline.py +96 -0
  169. txtai/pipeline/image/__init__.py +7 -0
  170. txtai/pipeline/image/caption.py +55 -0
  171. txtai/pipeline/image/imagehash.py +90 -0
  172. txtai/pipeline/image/objects.py +80 -0
  173. txtai/pipeline/llm/__init__.py +11 -0
  174. txtai/pipeline/llm/factory.py +86 -0
  175. txtai/pipeline/llm/generation.py +173 -0
  176. txtai/pipeline/llm/huggingface.py +218 -0
  177. txtai/pipeline/llm/litellm.py +90 -0
  178. txtai/pipeline/llm/llama.py +152 -0
  179. txtai/pipeline/llm/llm.py +75 -0
  180. txtai/pipeline/llm/rag.py +477 -0
  181. txtai/pipeline/nop.py +14 -0
  182. txtai/pipeline/tensors.py +52 -0
  183. txtai/pipeline/text/__init__.py +13 -0
  184. txtai/pipeline/text/crossencoder.py +70 -0
  185. txtai/pipeline/text/entity.py +140 -0
  186. txtai/pipeline/text/labels.py +137 -0
  187. txtai/pipeline/text/lateencoder.py +103 -0
  188. txtai/pipeline/text/questions.py +48 -0
  189. txtai/pipeline/text/reranker.py +57 -0
  190. txtai/pipeline/text/similarity.py +83 -0
  191. txtai/pipeline/text/summary.py +98 -0
  192. txtai/pipeline/text/translation.py +298 -0
  193. txtai/pipeline/train/__init__.py +7 -0
  194. txtai/pipeline/train/hfonnx.py +196 -0
  195. txtai/pipeline/train/hftrainer.py +398 -0
  196. txtai/pipeline/train/mlonnx.py +63 -0
  197. txtai/scoring/__init__.py +12 -0
  198. txtai/scoring/base.py +188 -0
  199. txtai/scoring/bm25.py +29 -0
  200. txtai/scoring/factory.py +95 -0
  201. txtai/scoring/pgtext.py +181 -0
  202. txtai/scoring/sif.py +32 -0
  203. txtai/scoring/sparse.py +218 -0
  204. txtai/scoring/terms.py +499 -0
  205. txtai/scoring/tfidf.py +358 -0
  206. txtai/serialize/__init__.py +10 -0
  207. txtai/serialize/base.py +85 -0
  208. txtai/serialize/errors.py +9 -0
  209. txtai/serialize/factory.py +29 -0
  210. txtai/serialize/messagepack.py +42 -0
  211. txtai/serialize/pickle.py +98 -0
  212. txtai/serialize/serializer.py +46 -0
  213. txtai/util/__init__.py +7 -0
  214. txtai/util/resolver.py +32 -0
  215. txtai/util/sparsearray.py +62 -0
  216. txtai/util/template.py +16 -0
  217. txtai/vectors/__init__.py +8 -0
  218. txtai/vectors/base.py +476 -0
  219. txtai/vectors/dense/__init__.py +12 -0
  220. txtai/vectors/dense/external.py +55 -0
  221. txtai/vectors/dense/factory.py +121 -0
  222. txtai/vectors/dense/huggingface.py +44 -0
  223. txtai/vectors/dense/litellm.py +86 -0
  224. txtai/vectors/dense/llama.py +84 -0
  225. txtai/vectors/dense/m2v.py +67 -0
  226. txtai/vectors/dense/sbert.py +92 -0
  227. txtai/vectors/dense/words.py +211 -0
  228. txtai/vectors/recovery.py +57 -0
  229. txtai/vectors/sparse/__init__.py +7 -0
  230. txtai/vectors/sparse/base.py +90 -0
  231. txtai/vectors/sparse/factory.py +55 -0
  232. txtai/vectors/sparse/sbert.py +34 -0
  233. txtai/version.py +6 -0
  234. txtai/workflow/__init__.py +8 -0
  235. txtai/workflow/base.py +184 -0
  236. txtai/workflow/execute.py +99 -0
  237. txtai/workflow/factory.py +42 -0
  238. txtai/workflow/task/__init__.py +18 -0
  239. txtai/workflow/task/base.py +490 -0
  240. txtai/workflow/task/console.py +24 -0
  241. txtai/workflow/task/export.py +64 -0
  242. txtai/workflow/task/factory.py +89 -0
  243. txtai/workflow/task/file.py +28 -0
  244. txtai/workflow/task/image.py +36 -0
  245. txtai/workflow/task/retrieve.py +61 -0
  246. txtai/workflow/task/service.py +102 -0
  247. txtai/workflow/task/storage.py +110 -0
  248. txtai/workflow/task/stream.py +33 -0
  249. txtai/workflow/task/template.py +116 -0
  250. txtai/workflow/task/url.py +20 -0
  251. txtai/workflow/task/workflow.py +14 -0
@@ -0,0 +1,36 @@
1
+ """
2
+ ImageTask module
3
+ """
4
+
5
+ import re
6
+
7
+ # Conditional import
8
+ try:
9
+ from PIL import Image
10
+
11
+ PIL = True
12
+ except ImportError:
13
+ PIL = False
14
+
15
+ from .file import FileTask
16
+
17
+
18
class ImageTask(FileTask):
    """
    File task specialized for image inputs. Accepted elements are opened with PIL.
    """

    def register(self):
        """
        Validates that the optional imaging dependency (PIL) was imported.

        Raises:
            ImportError: if PIL is not installed
        """

        if PIL:
            return

        raise ImportError('ImageTask is not available - install "workflow" extra to enable')

    def accept(self, element):
        # Defer to the parent check first, short-circuiting with its (falsy) result
        accepted = super().accept(element)
        if not accepted:
            return accepted

        # Then require a known image file extension (case-insensitive)
        return re.search(r"\.(gif|bmp|jpg|jpeg|png|webp)$", element.lower())

    def prepare(self, element):
        # Let the parent resolve the element, then open the result as an image
        resolved = super().prepare(element)
        return Image.open(resolved)
@@ -0,0 +1,61 @@
1
+ """
2
+ RetrieveTask module
3
+ """
4
+
5
+ import os
6
+ import tempfile
7
+
8
+ from urllib.request import urlretrieve
9
+ from urllib.parse import urlparse
10
+
11
+ from .url import UrlTask
12
+
13
+
14
class RetrieveTask(UrlTask):
    """
    Task that retrieves urls (local or remote) to a local directory.
    """

    def register(self, directory=None, flatten=True):
        """
        Adds retrieve parameters to task.

        Args:
            directory: local directory used to store retrieved files, a temporary directory is created when not set
            flatten: flatten input directory structure, defaults to True
        """

        # pylint: disable=W0201
        # Create default temporary directory if not specified
        if not directory:
            # Save tempdir to prevent content from being deleted until this task is out of scope
            # pylint: disable=R1732
            self.tempdir = tempfile.TemporaryDirectory()
            directory = self.tempdir.name

        # Create output directory if necessary
        os.makedirs(directory, exist_ok=True)

        self.directory = directory
        self.flatten = flatten

    def prepare(self, element):
        """
        Downloads a url to the output directory.

        Args:
            element: url to retrieve

        Returns:
            local file path the url content was written to

        Raises:
            ValueError: if flatten is False and the url path resolves outside of the output directory
        """

        # Extract file path from URL
        path = urlparse(element).path

        if self.flatten:
            # Flatten directory structure (default)
            path = os.path.join(self.directory, os.path.basename(path))
        else:
            # Derive output path
            path = os.path.join(self.directory, os.path.normpath(path.lstrip("/")))

            # ".." segments (or an absolute/drive path) survive normpath and would allow writing
            # outside of the output directory. Reject those urls before touching the file system.
            root = os.path.abspath(self.directory)
            if os.path.commonpath([root, os.path.abspath(path)]) != root:
                raise ValueError(f"URL path resolves outside of output directory: {element}")

            directory = os.path.dirname(path)

            # Create local directory, if necessary
            os.makedirs(directory, exist_ok=True)

        # Retrieve URL
        urlretrieve(element, path)

        # Return new file path
        return path
@@ -0,0 +1,102 @@
1
+ """
2
+ ServiceTask module
3
+ """
4
+
5
+ # Conditional import
6
+ try:
7
+ import requests
8
+ import xmltodict
9
+
10
+ XML_TO_DICT = True
11
+ except ImportError:
12
+ XML_TO_DICT = False
13
+
14
+ from .base import Task
15
+
16
+
17
class ServiceTask(Task):
    """
    Task that runs requests against remote service urls.
    """

    def register(self, url=None, method=None, params=None, batch=True, extract=None):
        """
        Adds service parameters to task. Checks if required dependencies are installed.

        Args:
            url: url to connect to
            method: http method, GET or POST
            params: default query parameters
            batch: if True, all elements are passed in a single batch request, otherwise a service call is executed per element
            extract: list of sections to extract from response

        Raises:
            ImportError: if the optional service dependencies (requests, xmltodict) are not installed
        """

        if not XML_TO_DICT:
            raise ImportError('ServiceTask is not available - install "workflow" extra to enable')

        # pylint: disable=W0201
        # Save URL, method and parameter defaults
        self.url = url
        self.method = method
        self.params = params

        # If True, all elements are passed in a single batch request, otherwise a service call is executed per element
        self.batch = batch

        # Save sections to extract. Supports both a single string and a hierarchical list of sections.
        self.extract = extract
        if self.extract:
            self.extract = [self.extract] if isinstance(self.extract, str) else self.extract

    def execute(self, elements, executor=None):
        """
        Runs the service request(s) for elements, then passes the responses to the parent execute method.

        Args:
            elements: input elements
            executor: passed through to the parent execute method

        Returns:
            result of the parent execute method
        """

        if self.batch:
            elements = self.request(elements)
        else:
            elements = [self.request(element) for element in elements]

        return super().execute(elements, executor)

    def request(self, data):
        """
        Execute service request using the url, method and default parameters set in register.

        Args:
            data: dynamic data for this specific request

        Returns:
            parsed response (XML is converted to a dict, everything else is parsed as JSON),
            narrowed down to the configured extract sections when set
        """

        if not self.params:
            params = data
        else:
            # Create copy of parameters
            params = self.params.copy()

            # Add data to parameters, every empty parameter is filled with the request data
            for key in params:
                if not params[key]:
                    params[key] = data

        # Run request
        if self.method and self.method.lower() == "get":
            response = requests.get(self.url, params=params)
        else:
            response = requests.post(self.url, json=params)

        # Parse data based on content-type. The header is optional in HTTP, a missing header
        # falls back to JSON parsing instead of failing with a KeyError.
        mimetype = response.headers.get("Content-Type", "").split(";")[0].strip()
        if mimetype.lower().endswith("xml"):
            data = xmltodict.parse(response.text)
        else:
            data = response.json()

        # Extract content from response, if necessary
        if self.extract:
            for tag in self.extract:
                data = data[tag]

        return data
@@ -0,0 +1,110 @@
1
+ """
2
+ StorageTask module
3
+ """
4
+
5
+ import os
6
+ import re
7
+
8
+ # Conditional import
9
+ try:
10
+ from libcloud.storage.providers import get_driver
11
+
12
+ LIBCLOUD = True
13
+ except ImportError:
14
+ LIBCLOUD = False
15
+
16
+ from .base import Task
17
+
18
+
19
class StorageTask(Task):
    """
    Task that expands object storage urls into object listings. Works with the local and cloud
    providers available through Apache libcloud.
    """

    # Patterns that capture the provider (url scheme) and the remainder of a storage url
    PREFIX = r"(\w+):\/\/.*"
    PATH = r"\w+:\/\/(.*)"

    def register(self, key=None, secret=None, host=None, port=None, token=None, region=None):
        """
        Stores cloud storage connection parameters after checking that libcloud is installed.

        Args:
            key: provider-specific access key
            secret: provider-specific access secret
            host: server host name
            port: server port
            token: temporary session token
            region: storage region
        """

        if not LIBCLOUD:
            raise ImportError('StorageTask is not available - install "workflow" extra to enable')

        # pylint: disable=W0201
        self.key, self.secret = key, secret
        self.host, self.port = host, port
        self.token, self.region = token, region

    def __call__(self, elements, executor=None):
        # Storage elements are replaced by the processed listing of their objects,
        # all other elements pass through untouched
        results = []
        for element in elements:
            if not self.matches(element):
                results.append(element)
                continue

            listing = self.list(element)
            results.extend(super().__call__(listing, executor))

        return results

    def matches(self, element):
        """
        Tests whether an element looks like a storage url.

        Args:
            element: input storage element

        Returns:
            regular expression match when element is a storage url, None otherwise
        """

        # Compare against the lower-cased, unpacked element
        value = self.upack(element, True).lower()
        return re.match(StorageTask.PREFIX, value)

    def list(self, element):
        """
        Lists the urls of all objects in an object container.

        Args:
            element: object container

        Returns:
            list of urls
        """

        # Split url into provider name and path
        provider = re.sub(StorageTask.PREFIX, r"\1", element.lower())
        path = re.sub(StorageTask.PATH, r"\1", element)

        # Credentials set on the task take priority over environment variables
        key = self.key
        if key is None:
            key = os.environ.get("ACCESS_KEY")

        secret = self.secret
        if secret is None:
            secret = os.environ.get("ACCESS_SECRET")

        # Without a key, the directory part of the path is used as key and the last part as container
        if key is None:
            key, container = os.path.dirname(path), os.path.basename(path)
        else:
            container = path

        # Everything after the first "/" in the container is an optional object prefix
        name, separator, remainder = container.partition("/")
        prefix = None
        if separator:
            container, prefix = name, remainder

        # Only pass connection options that have a value
        options = {}
        for field in ["host", "port", "region", "token"]:
            if getattr(self, field):
                options[field] = getattr(self, field)

        # Create provider client connection
        client = get_driver(provider)(key, secret, **options)

        # Read container and build a url for each object
        container = client.get_container(container_name=container)
        objects = client.list_container_objects(container=container, prefix=prefix)
        return [client.get_object_cdn_url(obj) for obj in objects]
@@ -0,0 +1,33 @@
1
+ """
2
+ StreamTask module
3
+ """
4
+
5
+ from .base import Task
6
+
7
+
8
class StreamTask(Task):
    """
    Task that runs task actions and streams (yields) the action results.
    """

    def register(self, batch=False):
        """
        Stores stream parameters on the task.

        Args:
            batch: when True each action receives all elements in a single call, when False (default)
                   each action is called once per element
        """

        # pylint: disable=W0201
        self.batch = batch

    def __call__(self, elements, executor=None):
        for action in self.action:
            # Batch mode makes one call with the full input, otherwise one call per element
            inputs = (elements,) if self.batch else elements
            for item in inputs:
                yield from action(item)
@@ -0,0 +1,116 @@
1
+ """
2
+ Template module
3
+ """
4
+
5
+ from string import Formatter
6
+
7
+ from ...util import TemplateFormatter
8
+ from .file import Task
9
+
10
+
11
class TemplateTask(Task):
    """
    Task that generates text from a template and task inputs. Templates can be used to prepare data for a number of tasks
    including generating large language model (LLM) prompts.
    """

    def register(self, template=None, rules=None, strict=True):
        """
        Read template parameters.

        Args:
            template: prompt template
            rules: parameter rules
            strict: requires all task inputs to be consumed by template, defaults to True
        """

        # pylint: disable=W0201
        # Template text
        self.template = template if template else self.defaulttemplate()

        # Template processing rules
        self.rules = rules if rules else self.defaultrules()

        # Create formatter
        self.formatter = TemplateFormatter() if strict else Formatter()

    def prepare(self, element):
        """
        Builds the task input for an element by applying processing rules and the template.

        Args:
            element: input element, either a dict (named parameters), a tuple (arg0 - argN parameters)
                     or any other value (used as the {text} parameter)

        Returns:
            matching rule value if a rule matched with a truthy value, formatted template when
            a template is set, otherwise the original element
        """

        # Check if element matches any processing rules
        match = self.match(element)
        if match:
            return match

        # Apply template processing, if applicable
        if self.template:
            # Pass dictionary as named prompt template parameters
            if isinstance(element, dict):
                return self.formatter.format(self.template, **element)

            # Pass tuple as prompt template parameters (arg0 - argN)
            if isinstance(element, tuple):
                return self.formatter.format(self.template, **{f"arg{i}": x for i, x in enumerate(element)})

            # Default behavior is to use input as {text} parameter in prompt template
            return self.formatter.format(self.template, text=element)

        # Return original inputs when no prompt provided
        return element

    def defaulttemplate(self):
        """
        Generates a default template for this task. Base method returns None.

        Returns:
            default template
        """

        return None

    def defaultrules(self):
        """
        Generates default rules for this task. Base method returns an empty dictionary.

        Returns:
            default rules
        """

        return {}

    def match(self, element):
        """
        Check if element matches any processing rules. Rules only apply to dict elements.

        Args:
            element: input element

        Returns:
            matching value if found, None otherwise
        """

        if self.rules and isinstance(element, dict):
            # Check if any rules are matched. A rule for a key the element does not have
            # is not a match, it must not fail the lookup with a KeyError.
            for key, value in self.rules.items():
                if key in element and element[key] == value:
                    return element[key]

        return None
97
+
98
+
99
class RagTask(TemplateTask):
    """
    Template task that builds the input of a rag pipeline.
    """

    def prepare(self, element):
        # Plain inputs serve as both the query and the (templated) question
        if not isinstance(element, dict):
            return {"query": element, "question": super().prepare(element)}

        # Template parameters are all fields except "query", with "question" passed as "text"
        fields = {name: value for name, value in element.items() if name != "query"}
        fields["text"] = fields.pop("question")

        # Templated output replaces the question on the input element
        element["question"] = super().prepare(fields)
        return element
@@ -0,0 +1,20 @@
1
+ """
2
+ UrlTask module
3
+ """
4
+
5
+ import re
6
+
7
+ from .base import Task
8
+
9
+
10
class UrlTask(Task):
    """
    Task that only processes url elements.
    """

    # Pattern matching a url scheme prefix
    PREFIX = r"\w+:\/\/"

    def accept(self, element):
        # Defer to the parent check first, short-circuiting with its (falsy) result
        accepted = super().accept(element)
        if not accepted:
            return accepted

        # Then require the element to start with a url prefix
        return re.match(UrlTask.PREFIX, element.lower())
@@ -0,0 +1,14 @@
1
+ """
2
+ WorkflowTask module
3
+ """
4
+
5
+ from .base import Task
6
+
7
+
8
class WorkflowTask(Task):
    """
    Task that runs a separate Workflow as its action.
    """

    def process(self, action, inputs):
        # Materialize the parent's output into a list
        return [*super().process(action, inputs)]
+ return list(super().process(action, inputs))