tensorlake 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tensorlake-0.1.0/PKG-INFO +18 -0
- tensorlake-0.1.0/pyproject.toml +25 -0
- tensorlake-0.1.0/tensorlake/__init__.py +31 -0
- tensorlake-0.1.0/tensorlake/builder/__init__.py +0 -0
- tensorlake-0.1.0/tensorlake/builder/client.py +73 -0
- tensorlake-0.1.0/tensorlake/cli.py +212 -0
- tensorlake-0.1.0/tensorlake/data_loaders/__init__.py +58 -0
- tensorlake-0.1.0/tensorlake/data_loaders/local_directory_loader.py +37 -0
- tensorlake-0.1.0/tensorlake/data_loaders/url_loader.py +52 -0
- tensorlake-0.1.0/tensorlake/error.py +8 -0
- tensorlake-0.1.0/tensorlake/functions_sdk/data_objects.py +27 -0
- tensorlake-0.1.0/tensorlake/functions_sdk/graph.py +364 -0
- tensorlake-0.1.0/tensorlake/functions_sdk/graph_definition.py +63 -0
- tensorlake-0.1.0/tensorlake/functions_sdk/graph_validation.py +70 -0
- tensorlake-0.1.0/tensorlake/functions_sdk/image.py +227 -0
- tensorlake-0.1.0/tensorlake/functions_sdk/indexify_functions.py +344 -0
- tensorlake-0.1.0/tensorlake/functions_sdk/invocation_state/invocation_state.py +22 -0
- tensorlake-0.1.0/tensorlake/functions_sdk/invocation_state/local_invocation_state.py +30 -0
- tensorlake-0.1.0/tensorlake/functions_sdk/object_serializer.py +68 -0
- tensorlake-0.1.0/tensorlake/functions_sdk/pipeline.py +30 -0
- tensorlake-0.1.0/tensorlake/http_client.py +379 -0
- tensorlake-0.1.0/tensorlake/remote_graph.py +138 -0
- tensorlake-0.1.0/tensorlake/remote_pipeline.py +25 -0
- tensorlake-0.1.0/tensorlake/settings.py +1 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: tensorlake
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Petabyte scale data framework for unstructured data of any modality
|
|
5
|
+
Home-page: https://github.com/tensorlakeai/tensorlake
|
|
6
|
+
Author: Tensorlake Inc.
|
|
7
|
+
Author-email: support@tensorlake.ai
|
|
8
|
+
Requires-Python: >=3.10,<4.0
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Requires-Dist: cloudpickle (>=3.1.0,<4.0.0)
|
|
15
|
+
Requires-Dist: docker (>=7.1.0,<8.0.0)
|
|
16
|
+
Requires-Dist: httpx[http2] (>=0.28.1,<0.29.0)
|
|
17
|
+
Requires-Dist: pydantic (==2.10.4)
|
|
18
|
+
Project-URL: Repository, https://github.com/tensorlakeai/tensorlake
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "tensorlake"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Petabyte scale data framework for unstructured data of any modality"
|
|
5
|
+
authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
|
|
6
|
+
homepage = "https://github.com/tensorlakeai/tensorlake"
|
|
7
|
+
repository = "https://github.com/tensorlakeai/tensorlake"
|
|
8
|
+
|
|
9
|
+
[tool.poetry.dependencies]
|
|
10
|
+
python = "^3.10"
|
|
11
|
+
httpx = { version = "^0.28.1", extras = ["http2"] }
|
|
12
|
+
pydantic = "2.10.4"
|
|
13
|
+
cloudpickle = "^3.1.0"
|
|
14
|
+
docker = "^7.1.0"
|
|
15
|
+
|
|
16
|
+
[tool.poetry.scripts]
|
|
17
|
+
tensorlake = "tensorlake.cli:tensorlake"
|
|
18
|
+
|
|
19
|
+
[build-system]
|
|
20
|
+
requires = ["poetry-core>=1.0.0"]
|
|
21
|
+
build-backend = "poetry.core.masonry.api"
|
|
22
|
+
|
|
23
|
+
[tool.poetry.group.dev.dependencies]
|
|
24
|
+
black = "^24.10.0"
|
|
25
|
+
pylint = "^3.3.0"
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from . import data_loaders
|
|
2
|
+
from .functions_sdk.graph import Graph
|
|
3
|
+
from .functions_sdk.image import Image
|
|
4
|
+
from .functions_sdk.indexify_functions import (
|
|
5
|
+
IndexifyFunction,
|
|
6
|
+
IndexifyRouter,
|
|
7
|
+
get_ctx,
|
|
8
|
+
indexify_router,
|
|
9
|
+
tensorlake_function,
|
|
10
|
+
)
|
|
11
|
+
from .functions_sdk.pipeline import Pipeline
|
|
12
|
+
from .http_client import TensorlakeClient
|
|
13
|
+
from .remote_graph import RemoteGraph
|
|
14
|
+
from .remote_pipeline import RemotePipeline
|
|
15
|
+
from .settings import DEFAULT_SERVICE_URL
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"data_loaders",
|
|
19
|
+
"Graph",
|
|
20
|
+
"RemoteGraph",
|
|
21
|
+
"Pipeline",
|
|
22
|
+
"RemotePipeline",
|
|
23
|
+
"Image",
|
|
24
|
+
"tensorlake_function",
|
|
25
|
+
"get_ctx",
|
|
26
|
+
"IndexifyFunction",
|
|
27
|
+
"IndexifyRouter",
|
|
28
|
+
"indexify_router",
|
|
29
|
+
"DEFAULT_SERVICE_URL",
|
|
30
|
+
"TensorlakeClient",
|
|
31
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Dict, Optional
|
|
3
|
+
|
|
4
|
+
import httpx
|
|
5
|
+
|
|
6
|
+
from tensorlake.functions_sdk.image import Build
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ImageBuilderClient:
    """Small HTTP client for the Tensorlake image build service.

    All requests carry a Bearer token header when an API key is provided.
    """

    def __init__(self, build_service: str, api_key):
        # The httpx module is used directly (module-level get/post helpers);
        # each request therefore opens its own connection.
        self.client = httpx
        self.build_service = build_service
        self.headers = {}
        if api_key:
            self.headers["Authorization"] = f"Bearer {api_key}"

    @classmethod
    def from_env(cls):
        """Construct a client from TENSORLAKE_API_KEY / INDEXIFY_URL /
        TENSORLAKE_BUILD_SERVICE environment variables."""
        api_key = os.getenv("TENSORLAKE_API_KEY")
        indexify_url = os.getenv("INDEXIFY_URL", "https://api.tensorlake.ai")
        build_url = os.getenv(
            "TENSORLAKE_BUILD_SERVICE", f"{indexify_url}/images"
        )  # Mainly used for debugging/local testing
        return cls(build_url, api_key)

    def get(self, endpoint: str, params: Optional[Dict] = None):
        """GET ``endpoint`` (relative to the build service) and return decoded JSON.

        Raises:
            httpx.HTTPStatusError: on any non-2xx response.
        """
        res = self.client.get(
            f"{self.build_service}{endpoint}", params=params, headers=self.headers
        )
        res.raise_for_status()
        return res.json()

    def get_build(self, build_id: int):
        """Fetch a single build record by id."""
        return Build.model_validate(self.get(f"/v1/builds/{build_id}"))

    def get_build_logs(self, build_id: int):
        """Fetch the raw build log for ``build_id`` as UTF-8 text."""
        # Fix: pass the auth headers like every other request in this client;
        # without them this call fails against an authenticated service.
        # NOTE(review): other log fetches in this package use
        # /v1/builds/{id}/log — confirm whether this endpoint should also
        # carry the /v1 prefix.
        log_response = self.client.get(
            f"{self.build_service}/builds/{build_id}/log", headers=self.headers
        )
        log_response.raise_for_status()
        return log_response.content.decode("utf-8")

    def post(self, endpoint: str, data: Dict, files: Dict):
        """POST form ``data`` and ``files`` to ``endpoint``; return decoded JSON."""
        res = self.client.post(
            f"{self.build_service}{endpoint}",
            data=data,
            files=files,
            headers=self.headers,
            timeout=60,
        )
        res.raise_for_status()
        return res.json()

    def build_exists(self, image_name: str, image_hash: str):
        """Return True when at least one build matches the name/hash pair."""
        return self.find_build(image_name, image_hash) != []

    def find_build(self, image_name: str, image_hash: str):
        """Return all builds matching ``image_name`` and ``image_hash``."""
        params = {"image_name": image_name, "image_hash": image_hash}
        # Route through get() so auth headers and error handling stay in one place.
        return [Build.model_validate(b) for b in self.get("/v1/builds", params=params)]

    def get_latest_build(self, image_name: str) -> Build:
        """Return the most recently created build for ``image_name``.

        Raises:
            ValueError: if the service knows no builds for this image.
        """
        builds = [
            Build.model_validate(b)
            for b in self.get("/v1/builds", params={"image_name": image_name})
        ]
        if not builds:
            # Fix: the original indexed builds[0] unconditionally and raised
            # an opaque IndexError when the service returned an empty list.
            raise ValueError(f"no builds found for image '{image_name}'")
        return max(builds, key=lambda b: b.created_at)
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import time
|
|
4
|
+
from typing import Dict, List
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from tensorlake import Graph, Image, RemoteGraph, TensorlakeClient
|
|
9
|
+
from tensorlake.builder.client import ImageBuilderClient
|
|
10
|
+
from tensorlake.functions_sdk.image import Build
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Root CLI group; subcommands are registered via tensorlake.add_command()
# at the bottom of this module.  Intentionally has no docstring: click
# would surface one as the group's help text.
@click.group()
def tensorlake():
    pass
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@click.command()
@click.argument("workflow_file", type=click.File("r"))
def deploy(workflow_file: click.File):
    """Deploy a workflow to tensorlake."""

    click.echo(f"Preparing deployment for {workflow_file.name}")
    builder = ImageBuilderClient.from_env()
    seen_images: Dict[Image, str] = {}
    deployed_graphs: List[Graph] = []

    # Execute the workflow file to discover Graph objects and the images
    # their functions run on.  NOTE: exec() runs arbitrary user code; that
    # is intentional for a CLI whose job is deploying user workflows.
    workflow_globals = {}
    exec(workflow_file.read(), workflow_globals)

    for name, obj in workflow_globals.items():
        if isinstance(obj, Graph):
            deployed_graphs.append(obj)
            for node_name, node_obj in obj.nodes.items():
                image = node_obj.image
                if image in seen_images:
                    continue
                seen_images[image] = image.hash()

    # Build (or wait on) every image; aborts the command if any image
    # cannot be built.
    _prepare_images(builder, seen_images)

    # If we are still here then our images should all have URIs

    # TODO: Fold calls to the platform API into a client class.
    indexify_addr = os.getenv("INDEXIFY_URL", "https://api.tensorlake.ai")
    introspect_response = builder.client.post(
        f"{indexify_addr}/platform/v1/keys/introspect", headers=builder.headers
    )
    introspect_response.raise_for_status()
    project_id = introspect_response.json()["projectId"]

    # Fix: deploy to the configured service instead of the hard-coded
    # http://localhost:8900 debug address the original left in.
    client = TensorlakeClient(namespace=project_id, service_url=indexify_addr)
    click.secho("Everything looks good, deploying now", fg="green")
    for graph in deployed_graphs:
        # TODO: Every time we post we get a new version, is that expected or the client should do the checks?
        RemoteGraph.deploy(graph, client=client)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _wait_for_build(builder: ImageBuilderClient, build: Build):
    """Poll the build service until ``build`` reaches the "completed" status.

    Returns the final Build record.
    """
    click.echo(f"Waiting for {build.image_name} to build")
    current = build
    while True:
        if current.status == "completed":
            break
        time.sleep(1)  # 1-second poll interval
        current = builder.get_build(current.id)

    if current.push_completed_at:
        # NOTE(review): computes build_completed_at - push_completed_at;
        # confirm the intended ordering of these two timestamps.
        elapsed = current.build_completed_at - current.push_completed_at
        click.echo(f"Building completed in {current.image_name} {elapsed.seconds}")
    return current
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _build_image(
    builder: ImageBuilderClient, image: Image, image_hash: str = ""
) -> Build:
    """Write ``image``'s build context to a temp file, upload it to the build
    service to start a build, and block until that build finishes.

    Returns the final Build record.
    """
    click.echo(f"Building {image._image_name}")
    fd, context_file = tempfile.mkstemp()
    os.close(fd)  # Fix: mkstemp returns an open fd the original leaked.
    try:
        image.build_context(context_file)

        click.echo(
            f"{image._image_name}: Posting {os.path.getsize(context_file)} bytes of context to build service...."
        )
        data = {"name": image._image_name, "hash": image_hash}
        # Fix: close the uploaded file handle after the request (the
        # original never closed it) and route through the shared post()
        # helper instead of duplicating its request logic.
        with open(context_file, "rb") as context:
            response_json = builder.post(
                "/v1/builds", data=data, files={"context": context}
            )
        build = Build.model_validate(response_json)
    finally:
        # Fix: remove the temp file in all cases; the original left it behind.
        os.remove(context_file)

    return _wait_for_build(builder, build)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _show_failed_summary(builder: ImageBuilderClient, build: Build):
    """Print a red failure banner for ``build``, followed by its build log
    when the service has one (404 means no log was stored)."""
    click.secho(
        f"Building {build.image_name} failed with error message: {build.error_message}",
        fg="red",
    )

    response = builder.client.get(
        f"{builder.build_service}/v1/builds/{build.id}/log", headers=builder.headers
    )
    status = response.status_code
    if status == 200:
        click.echo(response.content.decode("utf-8"))
    elif status == 404:
        click.echo("Logs not found")
    else:
        # Any other status is unexpected — surface it as an HTTP error.
        response.raise_for_status()
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _prepare_images(builder: ImageBuilderClient, images: Dict[Image, str]):
    """Ensure every image in ``images`` (image -> content hash) has a
    completed, successful build, then stamp each image with its build URI.

    Raises:
        click.Abort: if any image could not be built.
    """
    ready_builds: Dict[Image, Build] = {}
    # Go through the images and build anything that hasn't been built
    for image, image_hash in images.items():
        builds = builder.find_build(image._image_name, image_hash)

        if builds:
            build = builds[0]
            if build.status == "completed":
                if build.result == "failed":
                    _show_failed_summary(builder, build)
                else:
                    click.secho(f"Image '{build.image_name}' is built", fg="green")
                    ready_builds[image] = build

            elif build.status in ("ready", "building"):
                build = _wait_for_build(builder, build)
                if build.result != "failed":
                    ready_builds[image] = build
                else:
                    _show_failed_summary(builder, build)

        else:
            build = _build_image(builder, image, image_hash=image_hash)
            # Fix: the original dropped this result on the floor, so even
            # successful fresh builds were later reported as blockers and
            # aborted the deployment.
            if build.result != "failed":
                ready_builds[image] = build
            else:
                _show_failed_summary(builder, build)

    # Find any blockers and report them to the users
    blockers = []
    for image in images:
        if image not in ready_builds:
            blockers.append(image)
            click.secho(
                f"Image {image._image_name} could not be built, this is blocking deployment",
                fg="red",
            )
        else:
            build = ready_builds[image]
            image.uri = build.uri
    if blockers:
        raise click.Abort
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@click.command()
@click.argument("workflow_file", type=click.File("r"))
def prepare(workflow_file: click.File):
    """Prepare a workflow and it's artifacts for deployment."""

    click.echo(f"Preparing deployment for {workflow_file.name}")
    client = ImageBuilderClient.from_env()
    seen_images: Dict[Image, str] = {}

    # Read the graph file and build the images
    workflow_globals = {}
    with open(workflow_file.name, "r") as source:
        exec(source.read(), workflow_globals)

    # Collect every Graph defined by the workflow, then inventory the
    # distinct images its functions run on.
    graphs = [
        (name, obj) for name, obj in workflow_globals.items() if isinstance(obj, Graph)
    ]
    for name, graph in graphs:
        click.echo(f"Found graph {name}")
        for node_name, node in graph.nodes.items():
            image = node.image
            click.echo(
                f"graph function {node_name} uses image '{image._image_name}'"
            )
            if image not in seen_images:
                seen_images[image] = image.hash()

    click.echo(f"Found {len(seen_images)} images in this workflow")
    _prepare_images(client, seen_images)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
@click.command(help="Extract and display logs from tensorlake")
@click.option("--image", "-i")
def show_logs(image: str):
    # --image accepts either "name" (latest build) or "name:hash"
    # (a specific build).
    if image:
        builder = ImageBuilderClient.from_env()
        if ":" in image:
            # Fix: split only on the first ':' so values containing more
            # than one colon no longer raise ValueError on unpacking.
            image, image_hash = image.split(":", 1)
            builds = builder.find_build(image, image_hash)
            if not builds:
                # Fix: the original indexed [0] unconditionally and crashed
                # with IndexError when no build matched.
                click.secho(f"No build found for {image}:{image_hash}", fg="red")
                raise click.Abort
            build = builds[0]
        else:
            build = builder.get_latest_build(image)

        log_response = builder.client.get(
            f"{builder.build_service}/v1/builds/{build.id}/log", headers=builder.headers
        )
        if log_response.status_code == 200:
            log = log_response.content.decode("utf-8")
            print(log)
        else:
            # Fix: the original printed nothing at all on any non-200
            # response, silently hiding errors.
            click.secho(
                f"Could not fetch logs (HTTP {log_response.status_code})", fg="red"
            )
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
# Register the subcommands on the root CLI group.
for _command in (deploy, prepare, show_logs):
    tensorlake.add_command(_command)

if __name__ == "__main__":
    tensorlake()
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import mimetypes
|
|
3
|
+
import os
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from typing import List
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FileMetadata(BaseModel):
    """Metadata describing a single file (local path or URL)."""

    path: str  # Location of the file: a filesystem path or a URL.
    file_size: int  # Size in bytes.
    mime_type: str  # Guessed MIME type; empty string when unknown.
    md5_hash: str  # Hex MD5 digest of the contents ("" when not computed).
    created_at: int  # Creation time as Unix epoch seconds.
    updated_at: int  # Last-modified time as Unix epoch seconds.

    @classmethod
    def from_path(cls, path: str):
        """Build a FileMetadata by stat-ing and hashing the file at ``path``."""
        file_size = os.path.getsize(path)
        # Fix: guess_type() returns None for unknown types; the original
        # str() call stored the literal string "None" in that case.
        mime_type = mimetypes.guess_type(path)[0] or ""

        # Compute MD5 hash in 4 KiB chunks to bound memory use.
        hash_md5 = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        md5_hash = hash_md5.hexdigest()

        created_at = int(os.path.getctime(path))
        updated_at = int(os.path.getmtime(path))

        return cls(
            path=path,
            file_size=file_size,
            mime_type=mime_type,
            md5_hash=md5_hash,
            created_at=created_at,
            updated_at=updated_at,
        )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class DataLoader(ABC):
    """Abstract interface for loaders that enumerate files and read their contents."""

    @abstractmethod
    def load(self) -> List[FileMetadata]:
        """Return metadata for the files this loader currently provides."""
        pass

    @abstractmethod
    def read_all_bytes(self, file_metadata: FileMetadata) -> bytes:
        """Return the full contents of the file described by ``file_metadata``."""
        pass

    @abstractmethod
    def state(self) -> dict:
        """Return a dict snapshot of the loader's progress (shape is
        loader-specific; see concrete implementations)."""
        pass
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
from .local_directory_loader import LocalDirectoryLoader
|
|
58
|
+
from .url_loader import UrlLoader
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
|
|
4
|
+
from . import DataLoader, FileMetadata
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class LocalDirectoryLoader(DataLoader):
    """DataLoader that walks a local directory tree, optionally filtered by
    file extension, and remembers which files it has already returned."""

    def __init__(
        self,
        directory: str,
        file_extensions: Optional[List[str]] = None,
        state: Optional[dict] = None,
    ):
        """
        Args:
            directory: Root directory to walk recursively.
            file_extensions: Only return files ending with one of these
                suffixes; None means all files.
            state: A previously saved state() dict used to resume; only the
                "processed_files" key is read.
        """
        # Fix: the original declared a mutable default argument
        # (state: dict = {}), which is shared across all calls.
        state = state or {}
        self.directory = directory
        self.file_extensions = file_extensions
        self.processed_files = set(state.get("processed_files", []))

    def load(self) -> List[FileMetadata]:
        """Walk the directory and return metadata for files not yet seen,
        marking each returned file as processed."""
        file_metadata_list = []
        for root, _, files in os.walk(self.directory):
            for file in files:
                if self.file_extensions is None or any(
                    file.endswith(ext) for ext in self.file_extensions
                ):
                    file_path = os.path.join(root, file)
                    if file_path not in self.processed_files:
                        file_metadata_list.append(FileMetadata.from_path(file_path))
                        self.processed_files.add(file_path)

        return file_metadata_list

    def read_all_bytes(self, file: FileMetadata) -> bytes:
        """Read and return the entire file at ``file.path``."""
        with open(file.path, "rb") as f:
            return f.read()

    def state(self) -> dict:
        """Snapshot of progress; pass back to __init__ to resume."""
        return {"processed_files": list(self.processed_files)}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import email.utils
from typing import List, Optional

import httpx

from . import DataLoader, FileMetadata
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def convert_date_to_epoch(date_str: str) -> int:
    """
    Convert a date string from URL header to Unix epoch time.

    Args:
        date_str (str): The date string from the URL header (RFC 2822
            format, e.g. "Wed, 21 Oct 2015 07:28:00 GMT"). May be None or
            empty when the header was absent.

    Returns:
        int: The Unix epoch time, or 0 when the header is missing or
        unparseable.
    """
    if not date_str:
        return 0
    try:
        parsed_date = email.utils.parsedate_to_datetime(date_str)
    except (TypeError, ValueError):
        # Fix: a malformed header previously propagated an exception out of
        # the loader; treat it like a missing header instead.
        return 0
    return int(parsed_date.timestamp())
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class UrlLoader(DataLoader):
    """DataLoader that reads files over HTTP(S) from a fixed list of URLs."""

    def __init__(self, urls: List[str], state: Optional[dict] = None):
        """
        Args:
            urls: URLs to load.
            state: Accepted for interface parity with the other loaders but
                currently unused; state() always returns {}.
        """
        # Fix: the original declared a mutable default argument
        # (state: dict = {}); it was never read, but the shared-default
        # pattern is a latent bug.
        self.urls = urls

    def load(self) -> List[FileMetadata]:
        """HEAD each URL and build metadata from the response headers."""
        file_metadata_list = []
        for url in self.urls:
            response = httpx.head(url, follow_redirects=True)
            file_metadata_list.append(
                FileMetadata(
                    path=url,
                    # NOTE(review): content-length arrives as a string;
                    # this relies on pydantic coercing it to int.
                    file_size=response.headers.get("content-length", 0),
                    mime_type=response.headers.get("content-type"),
                    md5_hash="",
                    created_at=convert_date_to_epoch(response.headers.get("date")),
                    updated_at=convert_date_to_epoch(
                        response.headers.get("last-modified")
                    ),
                )
            )
        return file_metadata_list

    def read_all_bytes(self, file: FileMetadata) -> bytes:
        """GET the URL in ``file.path`` and return the response body."""
        response = httpx.get(file.path, follow_redirects=True)
        return response.content

    def state(self) -> dict:
        """This loader keeps no resumable state."""
        return {}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Json
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class FileInput(BaseModel):
    """A file input referenced by URL rather than carried as inline bytes."""

    url: str
    mime_type: Optional[str] = None  # MIME type of the referenced file, if known
    metadata: Optional[Dict[str, Json]] = None  # arbitrary JSON-encoded metadata values
    sha_256: Optional[str] = None  # hex SHA-256 of the file contents, if known
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RouterOutput(BaseModel):
    """Output of a router: a list of edge names."""

    # NOTE(review): presumably the names of the downstream graph edges the
    # router selected — confirm against the graph runtime.
    edges: List[str]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class IndexifyData(BaseModel):
    """A serialized payload together with the encoder used to produce it."""

    id: Optional[str] = None  # optional identifier for this payload
    # bytes vs str presumably tracks the encoder (cloudpickle -> bytes,
    # json -> str) — TODO confirm against the object serializer.
    payload: Union[bytes, str]
    encoder: Literal["cloudpickle", "json"] = "cloudpickle"  # serialization format
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class File(BaseModel):
    """In-memory file contents with optional descriptive metadata."""

    data: bytes  # raw file bytes
    mime_type: Optional[str] = None  # MIME type of `data`, if known
    metadata: Optional[Dict[str, Any]] = None  # arbitrary metadata values
    sha_256: Optional[str] = None  # hex SHA-256 of `data`, if computed