mseep-txtai 9.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mseep_txtai-9.1.1.dist-info/METADATA +262 -0
- mseep_txtai-9.1.1.dist-info/RECORD +251 -0
- mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
- mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
- mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
- txtai/__init__.py +16 -0
- txtai/agent/__init__.py +12 -0
- txtai/agent/base.py +54 -0
- txtai/agent/factory.py +39 -0
- txtai/agent/model.py +107 -0
- txtai/agent/placeholder.py +16 -0
- txtai/agent/tool/__init__.py +7 -0
- txtai/agent/tool/embeddings.py +69 -0
- txtai/agent/tool/factory.py +130 -0
- txtai/agent/tool/function.py +49 -0
- txtai/ann/__init__.py +7 -0
- txtai/ann/base.py +153 -0
- txtai/ann/dense/__init__.py +11 -0
- txtai/ann/dense/annoy.py +72 -0
- txtai/ann/dense/factory.py +76 -0
- txtai/ann/dense/faiss.py +233 -0
- txtai/ann/dense/hnsw.py +104 -0
- txtai/ann/dense/numpy.py +164 -0
- txtai/ann/dense/pgvector.py +323 -0
- txtai/ann/dense/sqlite.py +303 -0
- txtai/ann/dense/torch.py +38 -0
- txtai/ann/sparse/__init__.py +7 -0
- txtai/ann/sparse/factory.py +61 -0
- txtai/ann/sparse/ivfsparse.py +377 -0
- txtai/ann/sparse/pgsparse.py +56 -0
- txtai/api/__init__.py +18 -0
- txtai/api/application.py +134 -0
- txtai/api/authorization.py +53 -0
- txtai/api/base.py +159 -0
- txtai/api/cluster.py +295 -0
- txtai/api/extension.py +19 -0
- txtai/api/factory.py +40 -0
- txtai/api/responses/__init__.py +7 -0
- txtai/api/responses/factory.py +30 -0
- txtai/api/responses/json.py +56 -0
- txtai/api/responses/messagepack.py +51 -0
- txtai/api/route.py +41 -0
- txtai/api/routers/__init__.py +25 -0
- txtai/api/routers/agent.py +38 -0
- txtai/api/routers/caption.py +42 -0
- txtai/api/routers/embeddings.py +280 -0
- txtai/api/routers/entity.py +42 -0
- txtai/api/routers/extractor.py +28 -0
- txtai/api/routers/labels.py +47 -0
- txtai/api/routers/llm.py +61 -0
- txtai/api/routers/objects.py +42 -0
- txtai/api/routers/openai.py +191 -0
- txtai/api/routers/rag.py +61 -0
- txtai/api/routers/reranker.py +46 -0
- txtai/api/routers/segmentation.py +42 -0
- txtai/api/routers/similarity.py +48 -0
- txtai/api/routers/summary.py +46 -0
- txtai/api/routers/tabular.py +42 -0
- txtai/api/routers/textractor.py +42 -0
- txtai/api/routers/texttospeech.py +33 -0
- txtai/api/routers/transcription.py +42 -0
- txtai/api/routers/translation.py +46 -0
- txtai/api/routers/upload.py +36 -0
- txtai/api/routers/workflow.py +28 -0
- txtai/app/__init__.py +5 -0
- txtai/app/base.py +821 -0
- txtai/archive/__init__.py +9 -0
- txtai/archive/base.py +104 -0
- txtai/archive/compress.py +51 -0
- txtai/archive/factory.py +25 -0
- txtai/archive/tar.py +49 -0
- txtai/archive/zip.py +35 -0
- txtai/cloud/__init__.py +8 -0
- txtai/cloud/base.py +106 -0
- txtai/cloud/factory.py +70 -0
- txtai/cloud/hub.py +101 -0
- txtai/cloud/storage.py +125 -0
- txtai/console/__init__.py +5 -0
- txtai/console/__main__.py +22 -0
- txtai/console/base.py +264 -0
- txtai/data/__init__.py +10 -0
- txtai/data/base.py +138 -0
- txtai/data/labels.py +42 -0
- txtai/data/questions.py +135 -0
- txtai/data/sequences.py +48 -0
- txtai/data/texts.py +68 -0
- txtai/data/tokens.py +28 -0
- txtai/database/__init__.py +14 -0
- txtai/database/base.py +342 -0
- txtai/database/client.py +227 -0
- txtai/database/duckdb.py +150 -0
- txtai/database/embedded.py +76 -0
- txtai/database/encoder/__init__.py +8 -0
- txtai/database/encoder/base.py +37 -0
- txtai/database/encoder/factory.py +56 -0
- txtai/database/encoder/image.py +43 -0
- txtai/database/encoder/serialize.py +28 -0
- txtai/database/factory.py +77 -0
- txtai/database/rdbms.py +569 -0
- txtai/database/schema/__init__.py +6 -0
- txtai/database/schema/orm.py +99 -0
- txtai/database/schema/statement.py +98 -0
- txtai/database/sql/__init__.py +8 -0
- txtai/database/sql/aggregate.py +178 -0
- txtai/database/sql/base.py +189 -0
- txtai/database/sql/expression.py +404 -0
- txtai/database/sql/token.py +342 -0
- txtai/database/sqlite.py +57 -0
- txtai/embeddings/__init__.py +7 -0
- txtai/embeddings/base.py +1107 -0
- txtai/embeddings/index/__init__.py +14 -0
- txtai/embeddings/index/action.py +15 -0
- txtai/embeddings/index/autoid.py +92 -0
- txtai/embeddings/index/configuration.py +71 -0
- txtai/embeddings/index/documents.py +86 -0
- txtai/embeddings/index/functions.py +155 -0
- txtai/embeddings/index/indexes.py +199 -0
- txtai/embeddings/index/indexids.py +60 -0
- txtai/embeddings/index/reducer.py +104 -0
- txtai/embeddings/index/stream.py +67 -0
- txtai/embeddings/index/transform.py +205 -0
- txtai/embeddings/search/__init__.py +11 -0
- txtai/embeddings/search/base.py +344 -0
- txtai/embeddings/search/errors.py +9 -0
- txtai/embeddings/search/explain.py +120 -0
- txtai/embeddings/search/ids.py +61 -0
- txtai/embeddings/search/query.py +69 -0
- txtai/embeddings/search/scan.py +196 -0
- txtai/embeddings/search/terms.py +46 -0
- txtai/graph/__init__.py +10 -0
- txtai/graph/base.py +769 -0
- txtai/graph/factory.py +61 -0
- txtai/graph/networkx.py +275 -0
- txtai/graph/query.py +181 -0
- txtai/graph/rdbms.py +113 -0
- txtai/graph/topics.py +166 -0
- txtai/models/__init__.py +9 -0
- txtai/models/models.py +268 -0
- txtai/models/onnx.py +133 -0
- txtai/models/pooling/__init__.py +9 -0
- txtai/models/pooling/base.py +141 -0
- txtai/models/pooling/cls.py +28 -0
- txtai/models/pooling/factory.py +144 -0
- txtai/models/pooling/late.py +173 -0
- txtai/models/pooling/mean.py +33 -0
- txtai/models/pooling/muvera.py +164 -0
- txtai/models/registry.py +37 -0
- txtai/models/tokendetection.py +122 -0
- txtai/pipeline/__init__.py +17 -0
- txtai/pipeline/audio/__init__.py +11 -0
- txtai/pipeline/audio/audiomixer.py +58 -0
- txtai/pipeline/audio/audiostream.py +94 -0
- txtai/pipeline/audio/microphone.py +244 -0
- txtai/pipeline/audio/signal.py +186 -0
- txtai/pipeline/audio/texttoaudio.py +60 -0
- txtai/pipeline/audio/texttospeech.py +553 -0
- txtai/pipeline/audio/transcription.py +212 -0
- txtai/pipeline/base.py +23 -0
- txtai/pipeline/data/__init__.py +10 -0
- txtai/pipeline/data/filetohtml.py +206 -0
- txtai/pipeline/data/htmltomd.py +414 -0
- txtai/pipeline/data/segmentation.py +178 -0
- txtai/pipeline/data/tabular.py +155 -0
- txtai/pipeline/data/textractor.py +139 -0
- txtai/pipeline/data/tokenizer.py +112 -0
- txtai/pipeline/factory.py +77 -0
- txtai/pipeline/hfmodel.py +111 -0
- txtai/pipeline/hfpipeline.py +96 -0
- txtai/pipeline/image/__init__.py +7 -0
- txtai/pipeline/image/caption.py +55 -0
- txtai/pipeline/image/imagehash.py +90 -0
- txtai/pipeline/image/objects.py +80 -0
- txtai/pipeline/llm/__init__.py +11 -0
- txtai/pipeline/llm/factory.py +86 -0
- txtai/pipeline/llm/generation.py +173 -0
- txtai/pipeline/llm/huggingface.py +218 -0
- txtai/pipeline/llm/litellm.py +90 -0
- txtai/pipeline/llm/llama.py +152 -0
- txtai/pipeline/llm/llm.py +75 -0
- txtai/pipeline/llm/rag.py +477 -0
- txtai/pipeline/nop.py +14 -0
- txtai/pipeline/tensors.py +52 -0
- txtai/pipeline/text/__init__.py +13 -0
- txtai/pipeline/text/crossencoder.py +70 -0
- txtai/pipeline/text/entity.py +140 -0
- txtai/pipeline/text/labels.py +137 -0
- txtai/pipeline/text/lateencoder.py +103 -0
- txtai/pipeline/text/questions.py +48 -0
- txtai/pipeline/text/reranker.py +57 -0
- txtai/pipeline/text/similarity.py +83 -0
- txtai/pipeline/text/summary.py +98 -0
- txtai/pipeline/text/translation.py +298 -0
- txtai/pipeline/train/__init__.py +7 -0
- txtai/pipeline/train/hfonnx.py +196 -0
- txtai/pipeline/train/hftrainer.py +398 -0
- txtai/pipeline/train/mlonnx.py +63 -0
- txtai/scoring/__init__.py +12 -0
- txtai/scoring/base.py +188 -0
- txtai/scoring/bm25.py +29 -0
- txtai/scoring/factory.py +95 -0
- txtai/scoring/pgtext.py +181 -0
- txtai/scoring/sif.py +32 -0
- txtai/scoring/sparse.py +218 -0
- txtai/scoring/terms.py +499 -0
- txtai/scoring/tfidf.py +358 -0
- txtai/serialize/__init__.py +10 -0
- txtai/serialize/base.py +85 -0
- txtai/serialize/errors.py +9 -0
- txtai/serialize/factory.py +29 -0
- txtai/serialize/messagepack.py +42 -0
- txtai/serialize/pickle.py +98 -0
- txtai/serialize/serializer.py +46 -0
- txtai/util/__init__.py +7 -0
- txtai/util/resolver.py +32 -0
- txtai/util/sparsearray.py +62 -0
- txtai/util/template.py +16 -0
- txtai/vectors/__init__.py +8 -0
- txtai/vectors/base.py +476 -0
- txtai/vectors/dense/__init__.py +12 -0
- txtai/vectors/dense/external.py +55 -0
- txtai/vectors/dense/factory.py +121 -0
- txtai/vectors/dense/huggingface.py +44 -0
- txtai/vectors/dense/litellm.py +86 -0
- txtai/vectors/dense/llama.py +84 -0
- txtai/vectors/dense/m2v.py +67 -0
- txtai/vectors/dense/sbert.py +92 -0
- txtai/vectors/dense/words.py +211 -0
- txtai/vectors/recovery.py +57 -0
- txtai/vectors/sparse/__init__.py +7 -0
- txtai/vectors/sparse/base.py +90 -0
- txtai/vectors/sparse/factory.py +55 -0
- txtai/vectors/sparse/sbert.py +34 -0
- txtai/version.py +6 -0
- txtai/workflow/__init__.py +8 -0
- txtai/workflow/base.py +184 -0
- txtai/workflow/execute.py +99 -0
- txtai/workflow/factory.py +42 -0
- txtai/workflow/task/__init__.py +18 -0
- txtai/workflow/task/base.py +490 -0
- txtai/workflow/task/console.py +24 -0
- txtai/workflow/task/export.py +64 -0
- txtai/workflow/task/factory.py +89 -0
- txtai/workflow/task/file.py +28 -0
- txtai/workflow/task/image.py +36 -0
- txtai/workflow/task/retrieve.py +61 -0
- txtai/workflow/task/service.py +102 -0
- txtai/workflow/task/storage.py +110 -0
- txtai/workflow/task/stream.py +33 -0
- txtai/workflow/task/template.py +116 -0
- txtai/workflow/task/url.py +20 -0
- txtai/workflow/task/workflow.py +14 -0
@@ -0,0 +1,36 @@
|
|
1
|
+
"""
|
2
|
+
ImageTask module
|
3
|
+
"""
|
4
|
+
|
5
|
+
import re
|
6
|
+
|
7
|
+
# Conditional import
|
8
|
+
try:
|
9
|
+
from PIL import Image
|
10
|
+
|
11
|
+
PIL = True
|
12
|
+
except ImportError:
|
13
|
+
PIL = False
|
14
|
+
|
15
|
+
from .file import FileTask
|
16
|
+
|
17
|
+
|
18
|
+
class ImageTask(FileTask):
    """
    File task specialization that only handles image files and opens them with PIL.
    """

    def register(self):
        """
        Validates that the optional PIL dependency was imported.

        Raises:
            ImportError: if PIL is not installed
        """

        if PIL:
            return

        raise ImportError('ImageTask is not available - install "workflow" extra to enable')

    def accept(self, element):
        # Defer to the parent check first, a falsy parent result is returned unchanged
        accepted = super().accept(element)
        if not accepted:
            return accepted

        # Parent accepted the element, now require a known image file extension
        return re.search(r"\.(gif|bmp|jpg|jpeg|png|webp)$", element.lower())

    def prepare(self, element):
        # Run parent preparation, then open the result as an image
        prepared = super().prepare(element)
        return Image.open(prepared)
|
@@ -0,0 +1,61 @@
|
|
1
|
+
"""
|
2
|
+
RetrieveTask module
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import tempfile
|
7
|
+
|
8
|
+
from urllib.request import urlretrieve
|
9
|
+
from urllib.parse import urlparse
|
10
|
+
|
11
|
+
from .url import UrlTask
|
12
|
+
|
13
|
+
|
14
|
+
class RetrieveTask(UrlTask):
    """
    Task that retrieves urls (local or remote) to a local directory.
    """

    def register(self, directory=None, flatten=True):
        """
        Adds retrieve parameters to task.

        Args:
            directory: local directory used to store retrieved files, a temporary directory is created when not set
            flatten: flatten input directory structure, defaults to True
        """

        # pylint: disable=W0201
        # Create default temporary directory if not specified
        if not directory:
            # Save tempdir to prevent content from being deleted until this task is out of scope
            # pylint: disable=R1732
            self.tempdir = tempfile.TemporaryDirectory()
            directory = self.tempdir.name

        # Create output directory if necessary
        os.makedirs(directory, exist_ok=True)

        self.directory = directory
        self.flatten = flatten

    def prepare(self, element):
        """
        Downloads a url into the output directory.

        Args:
            element: input url

        Returns:
            local file path the url content was written to

        Raises:
            ValueError: if the url path resolves to a location outside of the output directory
        """

        # Extract file path from URL
        path = urlparse(element).path

        if self.flatten:
            # Flatten directory structure (default)
            path = os.path.join(self.directory, os.path.basename(path))
        else:
            # Derive output path
            path = os.path.join(self.directory, os.path.normpath(path.lstrip("/")))

        # normpath keeps leading ".." segments, reject paths that escape the output directory
        root = os.path.abspath(self.directory)
        resolved = os.path.abspath(path)
        if resolved != root and not resolved.startswith(os.path.join(root, "")):
            raise ValueError(f"Invalid url, output path is outside of the output directory: {element}")

        if not self.flatten:
            # Create local directory, if necessary
            os.makedirs(os.path.dirname(path), exist_ok=True)

        # Retrieve URL
        urlretrieve(element, path)

        # Return new file path
        return path
|
@@ -0,0 +1,102 @@
|
|
1
|
+
"""
|
2
|
+
ServiceTask module
|
3
|
+
"""
|
4
|
+
|
5
|
+
# Conditional import
|
6
|
+
try:
|
7
|
+
import requests
|
8
|
+
import xmltodict
|
9
|
+
|
10
|
+
XML_TO_DICT = True
|
11
|
+
except ImportError:
|
12
|
+
XML_TO_DICT = False
|
13
|
+
|
14
|
+
from .base import Task
|
15
|
+
|
16
|
+
|
17
|
+
class ServiceTask(Task):
    """
    Task that runs requests against remote service urls.
    """

    def register(self, url=None, method=None, params=None, batch=True, extract=None):
        """
        Adds service parameters to task. Checks if required dependencies are installed.

        Args:
            url: url to connect to
            method: http method, GET or POST
            params: default query parameters
            batch: if True, all elements are passed in a single batch request, otherwise a service call is executed per element
            extract: list of sections to extract from response

        Raises:
            ImportError: if the requests or xmltodict dependencies are not installed
        """

        if not XML_TO_DICT:
            raise ImportError('ServiceTask is not available - install "workflow" extra to enable')

        # pylint: disable=W0201
        # Save URL, method and parameter defaults
        self.url = url
        self.method = method
        self.params = params

        # If True, all elements are passed in a single batch request, otherwise a service call is executed per element
        self.batch = batch

        # Save sections to extract. Supports both a single string and a hierarchical list of sections.
        self.extract = extract
        if self.extract:
            self.extract = [self.extract] if isinstance(self.extract, str) else self.extract

    def execute(self, elements, executor=None):
        # Replace inputs with service responses before running the standard task execution
        if self.batch:
            elements = self.request(elements)
        else:
            elements = [self.request(element) for element in elements]

        return super().execute(elements, executor)

    def request(self, data):
        """
        Execute service request using the url, method and default parameters saved by register.

        Args:
            data: dynamic data for this specific request

        Returns:
            parsed response content, reduced to the configured extract sections when set
        """

        if not self.params:
            params = data
        else:
            # Create copy of parameters
            params = self.params.copy()

            # Add data to parameters, every parameter without a value is filled with the request data
            for key in params:
                if not params[key]:
                    params[key] = data

        # Run request, any method other than GET is sent as a POST with a JSON body
        if self.method and self.method.lower() == "get":
            response = requests.get(self.url, params=params)
        else:
            response = requests.post(self.url, json=params)

        # Parse data based on content-type, a missing header falls through to JSON parsing
        mimetype = response.headers.get("Content-Type", "").split(";")[0].strip()
        if mimetype.lower().endswith("xml"):
            data = xmltodict.parse(response.text)
        else:
            data = response.json()

        # Extract content from response, if necessary
        if self.extract:
            for tag in self.extract:
                data = data[tag]

        return data
|
@@ -0,0 +1,110 @@
|
|
1
|
+
"""
|
2
|
+
StorageTask module
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import re
|
7
|
+
|
8
|
+
# Conditional import
|
9
|
+
try:
|
10
|
+
from libcloud.storage.providers import get_driver
|
11
|
+
|
12
|
+
LIBCLOUD = True
|
13
|
+
except ImportError:
|
14
|
+
LIBCLOUD = False
|
15
|
+
|
16
|
+
from .base import Task
|
17
|
+
|
18
|
+
|
19
|
+
class StorageTask(Task):
    """
    Task that expands object storage container urls into object urls. Supports local and cloud providers in Apache libcloud.
    """

    # URL prefix
    PREFIX = r"(\w+):\/\/.*"
    PATH = r"\w+:\/\/(.*)"

    def register(self, key=None, secret=None, host=None, port=None, token=None, region=None):
        """
        Validates that libcloud is installed and stores the cloud storage connection parameters.

        Args:
            key: provider-specific access key
            secret: provider-specific access secret
            host: server host name
            port: server port
            token: temporary session token
            region: storage region

        Raises:
            ImportError: if libcloud is not installed
        """

        if not LIBCLOUD:
            raise ImportError('StorageTask is not available - install "workflow" extra to enable')

        # pylint: disable=W0201
        self.key, self.secret = key, secret
        self.host, self.port = host, port
        self.token, self.region = token, region

    def __call__(self, elements, executor=None):
        # Build a combined output list, storage elements are replaced with the results of their listings
        outputs = []
        for element in elements:
            if not self.matches(element):
                # Pass through elements that are not storage urls
                outputs.append(element)
                continue

            # Expand storage element into object urls and run task actions on them
            listing = self.list(element)
            outputs.extend(super().__call__(listing, executor))

        return outputs

    def matches(self, element):
        """
        Tests if an element starts with a storage url prefix.

        Args:
            element: input storage element

        Returns:
            regular expression match if this is a storage element, None otherwise
        """

        # Only accept file URLs
        value = self.upack(element, True)
        return re.match(StorageTask.PREFIX, value.lower())

    def list(self, element):
        """
        Lists the object urls stored in an object container.

        Args:
            element: object container

        Returns:
            list of urls
        """

        # Split url into lowercase provider name and path
        provider = re.sub(StorageTask.PREFIX, r"\1", element.lower())
        path = re.sub(StorageTask.PATH, r"\1", element)

        # Configured credentials take precedence over environment variables
        key = self.key
        if key is None:
            key = os.environ.get("ACCESS_KEY")

        secret = self.secret
        if secret is None:
            secret = os.environ.get("ACCESS_SECRET")

        # Without a key, the path directory is used as the key and the path basename as the container
        if key is None:
            key, container = os.path.dirname(path), os.path.basename(path)
        else:
            container = path

        # Text after the first "/" in the container is an optional object prefix
        container, separator, remainder = container.partition("/")
        prefix = remainder if separator else None

        # Get driver for provider
        driver = get_driver(provider)

        # Only pass connection options that are set
        options = {}
        for field in ("host", "port", "region", "token"):
            value = getattr(self, field)
            if value:
                options[field] = value

        # Get client connection
        client = driver(key, secret, **options)

        container = client.get_container(container_name=container)
        objects = client.list_container_objects(container=container, prefix=prefix)
        return [client.get_object_cdn_url(obj) for obj in objects]
|
@@ -0,0 +1,33 @@
|
|
1
|
+
"""
|
2
|
+
StreamTask module
|
3
|
+
"""
|
4
|
+
|
5
|
+
from .base import Task
|
6
|
+
|
7
|
+
|
8
|
+
class StreamTask(Task):
    """
    Task that runs each task action and yields the action results.
    """

    def register(self, batch=False):
        """
        Stores stream parameters on this task.

        Args:
            batch: when True, each action is called once with all elements, otherwise each action is called once
                   per element, defaults to False
        """

        # pylint: disable=W0201
        # Controls if actions receive all elements at once or a single element per call
        self.batch = batch

    def __call__(self, elements, executor=None):
        # The executor argument is not used by this method
        for action in self.action:
            # Batch mode wraps all elements as a single input, otherwise each element is an input
            inputs = [elements] if self.batch else elements
            for x in inputs:
                yield from action(x)
|
@@ -0,0 +1,116 @@
|
|
1
|
+
"""
|
2
|
+
Template module
|
3
|
+
"""
|
4
|
+
|
5
|
+
from string import Formatter
|
6
|
+
|
7
|
+
from ...util import TemplateFormatter
|
8
|
+
from .file import Task
|
9
|
+
|
10
|
+
|
11
|
+
class TemplateTask(Task):
    """
    Task that generates text from a template and task inputs. Templates can be used to prepare data for a number of tasks
    including generating large language model (LLM) prompts.
    """

    def register(self, template=None, rules=None, strict=True):
        """
        Read template parameters.

        Args:
            template: prompt template
            rules: parameter rules
            strict: requires all task inputs to be consumed by template, defaults to True
        """

        # pylint: disable=W0201
        # Template text
        self.template = template if template else self.defaulttemplate()

        # Template processing rules
        self.rules = rules if rules else self.defaultrules()

        # Create formatter
        self.formatter = TemplateFormatter() if strict else Formatter()

    def prepare(self, element):
        """
        Formats an element with the template.

        Args:
            element: input element - dict, tuple or any other value

        Returns:
            matching rule value if a rule matched, formatted template text if a template is set, otherwise the original element
        """

        # Check if element matches any processing rules
        # Note: the matched value is tested for truthiness, a rule with a falsy value never short-circuits
        match = self.match(element)
        if match:
            return match

        # Apply template processing, if applicable
        if self.template:
            # Pass dictionary as named prompt template parameters
            if isinstance(element, dict):
                return self.formatter.format(self.template, **element)

            # Pass tuple as prompt template parameters (arg0 - argN)
            if isinstance(element, tuple):
                return self.formatter.format(self.template, **{f"arg{i}": x for i, x in enumerate(element)})

            # Default behavior is to use input as {text} parameter in prompt template
            return self.formatter.format(self.template, text=element)

        # Return original inputs when no prompt provided
        return element

    def defaulttemplate(self):
        """
        Generates a default template for this task. Base method returns None.

        Returns:
            default template
        """

        return None

    def defaultrules(self):
        """
        Generates default rules for this task. Base method returns an empty dictionary.

        Returns:
            default rules
        """

        return {}

    def match(self, element):
        """
        Check if element matches any processing rules. Rules are only applied to dictionary elements.

        Args:
            element: input element

        Returns:
            matching value if found, None otherwise
        """

        if self.rules and isinstance(element, dict):
            # Check if any rules are matched
            for key, value in self.rules.items():
                # A rule key missing from the element is not a match
                if key in element and element[key] == value:
                    return element[key]

        return None
|
97
|
+
|
98
|
+
|
99
|
+
class RagTask(TemplateTask):
    """
    Template task that builds the query and question inputs for a rag pipeline.
    """

    def prepare(self, element):
        # Non-dictionary input is used as the query and its templated form as the question
        if not isinstance(element, dict):
            return {"query": element, "question": super().prepare(element)}

        # Copy all variables except "query" and pass the question as the template text parameter
        params = {key: value for key, value in element.items() if key != "query"}
        params["text"] = params.pop("question")

        # Replace the question on the input element with the templated output
        element["question"] = super().prepare(params)
        return element
|
@@ -0,0 +1,20 @@
|
|
1
|
+
"""
|
2
|
+
UrlTask module
|
3
|
+
"""
|
4
|
+
|
5
|
+
import re
|
6
|
+
|
7
|
+
from .base import Task
|
8
|
+
|
9
|
+
|
10
|
+
class UrlTask(Task):
    """
    Task that only accepts elements starting with a url prefix.
    """

    # URL prefix
    PREFIX = r"\w+:\/\/"

    def accept(self, element):
        # Defer to the parent check first, a falsy parent result is returned unchanged
        accepted = super().accept(element)
        if not accepted:
            return accepted

        # Parent accepted the element, now require a leading url prefix
        return re.match(UrlTask.PREFIX, element.lower())
|