tensorlake 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ Metadata-Version: 2.3
2
+ Name: tensorlake
3
+ Version: 0.1.0
4
+ Summary: Petabyte scale data framework for unstructured data of any modality
5
+ Home-page: https://github.com/tensorlakeai/tensorlake
6
+ Author: Tensorlake Inc.
7
+ Author-email: support@tensorlake.ai
8
+ Requires-Python: >=3.10,<4.0
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Requires-Dist: cloudpickle (>=3.1.0,<4.0.0)
15
+ Requires-Dist: docker (>=7.1.0,<8.0.0)
16
+ Requires-Dist: httpx[http2] (>=0.28.1,<0.29.0)
17
+ Requires-Dist: pydantic (==2.10.4)
18
+ Project-URL: Repository, https://github.com/tensorlakeai/tensorlake
@@ -0,0 +1,25 @@
1
+ [tool.poetry]
2
+ name = "tensorlake"
3
+ version = "0.1.0"
4
+ description = "Petabyte scale data framework for unstructured data of any modality"
5
+ authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
6
+ homepage = "https://github.com/tensorlakeai/tensorlake"
7
+ repository = "https://github.com/tensorlakeai/tensorlake"
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.10"
11
+ httpx = { version = "^0.28.1", extras = ["http2"] }
12
+ pydantic = "2.10.4"
13
+ cloudpickle = "^3.1.0"
14
+ docker = "^7.1.0"
15
+
16
+ [tool.poetry.scripts]
17
+ tensorlake = "tensorlake.cli:tensorlake"
18
+
19
+ [build-system]
20
+ requires = ["poetry-core>=1.0.0"]
21
+ build-backend = "poetry.core.masonry.api"
22
+
23
+ [tool.poetry.group.dev.dependencies]
24
+ black = "^24.10.0"
25
+ pylint = "^3.3.0"
@@ -0,0 +1,31 @@
1
+ from . import data_loaders
2
+ from .functions_sdk.graph import Graph
3
+ from .functions_sdk.image import Image
4
+ from .functions_sdk.indexify_functions import (
5
+ IndexifyFunction,
6
+ IndexifyRouter,
7
+ get_ctx,
8
+ indexify_router,
9
+ tensorlake_function,
10
+ )
11
+ from .functions_sdk.pipeline import Pipeline
12
+ from .http_client import TensorlakeClient
13
+ from .remote_graph import RemoteGraph
14
+ from .remote_pipeline import RemotePipeline
15
+ from .settings import DEFAULT_SERVICE_URL
16
+
17
# Public API of the tensorlake package; names re-exported from submodules above.
__all__ = [
    "data_loaders",
    "Graph",
    "RemoteGraph",
    "Pipeline",
    "RemotePipeline",
    "Image",
    "tensorlake_function",
    "get_ctx",
    "IndexifyFunction",
    "IndexifyRouter",
    "indexify_router",
    "DEFAULT_SERVICE_URL",
    "TensorlakeClient",
]
File without changes
@@ -0,0 +1,73 @@
1
+ import os
2
+ from typing import Dict, Optional
3
+
4
+ import httpx
5
+
6
+ from tensorlake.functions_sdk.image import Build
7
+
8
+
9
class ImageBuilderClient:
    """Thin HTTP client for the Tensorlake image build service.

    Wraps the build service's ``/v1/builds`` REST endpoints and attaches a
    ``Bearer`` Authorization header when an API key is available.
    """

    def __init__(self, build_service: str, api_key: Optional[str]):
        # The httpx module itself is used (module-level get/post); each call
        # opens its own connection.
        self.client = httpx
        self.build_service = build_service
        self.headers = {}
        if api_key:
            self.headers["Authorization"] = f"Bearer {api_key}"

    @classmethod
    def from_env(cls):
        """Construct a client from TENSORLAKE_API_KEY, INDEXIFY_URL and
        TENSORLAKE_BUILD_SERVICE environment variables."""
        api_key = os.getenv("TENSORLAKE_API_KEY")
        indexify_url = os.getenv("INDEXIFY_URL", "https://api.tensorlake.ai")
        build_url = os.getenv(
            "TENSORLAKE_BUILD_SERVICE", f"{indexify_url}/images"
        )  # Mainly used for debugging/local testing
        return cls(build_url, api_key)

    def get(self, endpoint: str, params: Optional[Dict] = None):
        """GET ``endpoint`` (joined onto the service root) and return parsed
        JSON. Raises ``httpx.HTTPStatusError`` on non-2xx responses."""
        res = self.client.get(
            f"{self.build_service}{endpoint}", params=params, headers=self.headers
        )
        res.raise_for_status()
        return res.json()

    def get_build(self, build_id: int) -> Build:
        """Fetch a single build by id."""
        return Build.model_validate(self.get(f"/v1/builds/{build_id}"))

    def get_build_logs(self, build_id: int) -> str:
        """Return the build's log as UTF-8 text.

        Fixed: the previous request path omitted the ``/v1`` prefix used by
        every other endpoint and did not send the auth headers.
        """
        log_response = self.client.get(
            f"{self.build_service}/v1/builds/{build_id}/log", headers=self.headers
        )
        log_response.raise_for_status()
        return log_response.content.decode("utf-8")

    def post(self, endpoint: str, data: Dict, files: Dict):
        """POST form fields and file uploads to ``endpoint``; return parsed
        JSON. Raises ``httpx.HTTPStatusError`` on non-2xx responses."""
        res = self.client.post(
            f"{self.build_service}{endpoint}",
            data=data,
            files=files,
            headers=self.headers,
            timeout=60,
        )
        res.raise_for_status()
        return res.json()

    def build_exists(self, image_name: str, image_hash: str) -> bool:
        """True when at least one build matches the name/hash pair."""
        return len(self.find_build(image_name, image_hash)) > 0

    def find_build(self, image_name: str, image_hash: str):
        """Return all builds matching ``image_name`` and ``image_hash``."""
        params = {"image_name": image_name, "image_hash": image_hash}
        # Reuse the get() helper (auth headers + raise_for_status) instead of
        # duplicating the request inline.
        return [Build.model_validate(b) for b in self.get("/v1/builds", params=params)]

    def get_latest_build(self, image_name: str) -> Build:
        """Return the most recently created build for ``image_name``.

        Raises:
            ValueError: when no build exists for the image (previously this
                surfaced as an opaque IndexError).
        """
        builds = [
            Build.model_validate(b)
            for b in self.get("/v1/builds", params={"image_name": image_name})
        ]
        if not builds:
            raise ValueError(f"no builds found for image '{image_name}'")
        return max(builds, key=lambda b: b.created_at)
@@ -0,0 +1,212 @@
1
+ import os
2
+ import tempfile
3
+ import time
4
+ from typing import Dict, List
5
+
6
+ import click
7
+
8
+ from tensorlake import Graph, Image, RemoteGraph, TensorlakeClient
9
+ from tensorlake.builder.client import ImageBuilderClient
10
+ from tensorlake.functions_sdk.image import Build
11
+
12
+
13
@click.group()
def tensorlake():
    # Root click command group. Subcommands are attached with add_command()
    # at the bottom of this module; the body is intentionally empty.
    # (No docstring: click would surface it as CLI help text.)
    pass
16
+
17
+
18
@click.command()
@click.argument("workflow_file", type=click.File("r"))
def deploy(workflow_file: click.File):
    """Deploy a workflow to tensorlake."""

    click.echo(f"Preparing deployment for {workflow_file.name}")
    builder = ImageBuilderClient.from_env()
    seen_images: Dict[Image, str] = {}
    deployed_graphs: List[Graph] = []

    # Read the graph file and build the images.
    # SECURITY: exec() runs arbitrary code from the workflow file — only run
    # this command against trusted files.
    workflow_globals = {}
    with open(workflow_file.name, "r") as f:
        exec(f.read(), workflow_globals)

    for name, obj in workflow_globals.items():
        if isinstance(obj, Graph):
            deployed_graphs.append(obj)
            for node_name, node_obj in obj.nodes.items():
                image = node_obj.image
                if image in seen_images:
                    continue
                seen_images[image] = image.hash()

    _prepare_images(builder, seen_images)

    # If we are still here then our images should all have URIs

    # TODO: Fold calls to the platform API into a client class.
    indexify_addr = os.getenv("INDEXIFY_URL", "https://api.tensorlake.ai")
    introspect_response = builder.client.post(
        f"{indexify_addr}/platform/v1/keys/introspect", headers=builder.headers
    )
    introspect_response.raise_for_status()
    project_id = introspect_response.json()["projectId"]

    # Deploy to the same server the API key was introspected against.
    # (Previously hard-coded to http://localhost:8900 — a debugging leftover
    # that made production deploys target a local server.)
    client = TensorlakeClient(namespace=project_id, service_url=indexify_addr)
    click.secho("Everything looks good, deploying now", fg="green")
    for graph in deployed_graphs:
        # TODO: Every time we post we get a new version, is that expected or the client should do the checks?
        RemoteGraph.deploy(graph, client=client)
59
+
60
+
61
def _wait_for_build(builder: ImageBuilderClient, build: Build) -> Build:
    """Poll the build service once per second until the build reaches the
    "completed" status, then return the final Build record."""
    click.echo(f"Waiting for {build.image_name} to build")
    while build.status != "completed":
        time.sleep(1)
        build = builder.get_build(build.id)

    if build.push_completed_at:
        # NOTE(review): this computes build_completed_at - push_completed_at;
        # if the push finishes after the build, the timedelta is negative —
        # confirm the intended operand order.
        build_duration = build.build_completed_at - build.push_completed_at
        click.echo(f"Building completed in {build.image_name} {build_duration.seconds}")
    return build
71
+
72
+
73
def _build_image(
    builder: ImageBuilderClient, image: Image, image_hash: str = ""
) -> Build:
    """Upload the image's build context to the build service and block until
    the resulting build finishes.

    Fixes resource handling: the mkstemp() file descriptor and the context
    file handle were previously leaked, and the temp file was never removed.
    """
    click.echo(f"Building {image._image_name}")
    fd, context_file = tempfile.mkstemp()
    os.close(fd)  # build_context() writes by path; the raw fd is not needed
    try:
        image.build_context(context_file)

        click.echo(
            f"{image._image_name}: Posting {os.path.getsize(context_file)} bytes of context to build service...."
        )
        data = {"name": image._image_name, "hash": image_hash}
        with open(context_file, "rb") as context:
            # builder.post() supplies auth headers, the 60s timeout and
            # raise_for_status(), matching the previous inline request.
            payload = builder.post("/v1/builds", data=data, files={"context": context})
    finally:
        os.remove(context_file)

    build = Build.model_validate(payload)
    return _wait_for_build(builder, build)
97
+
98
+
99
def _show_failed_summary(builder: ImageBuilderClient, build: Build):
    """Report a failed build: the error message in red, then the build log
    when the service has one."""
    click.secho(
        f"Building {build.image_name} failed with error message: {build.error_message}",
        fg="red",
    )

    response = builder.client.get(
        f"{builder.build_service}/v1/builds/{build.id}/log", headers=builder.headers
    )
    status = response.status_code
    if status == 404:
        click.echo("Logs not found")
    elif status == 200:
        click.echo(response.content.decode("utf-8"))
    else:
        response.raise_for_status()
115
+
116
+
117
def _prepare_images(builder: ImageBuilderClient, images: Dict[Image, str]):
    """Ensure every image has a completed, successful build, then stamp the
    build URI onto each image.

    Args:
        builder: build-service client.
        images: mapping of each Image to its content hash.

    Raises:
        click.Abort: when any image could not be built.
    """
    ready_builds: Dict[Image, Build] = {}
    # Go through the images and build anything that hasn't been built
    for image, image_hash in images.items():
        builds = builder.find_build(image._image_name, image_hash)

        if builds:
            build = builds[0]
            if build.status == "completed":
                if build.result == "failed":
                    _show_failed_summary(builder, build)
                else:
                    click.secho(f"Image '{build.image_name}' is built", fg="green")
                    ready_builds[image] = build

            elif build.status in ("ready", "building"):
                build = _wait_for_build(builder, build)
                if build.result != "failed":
                    ready_builds[image] = build
                else:
                    _show_failed_summary(builder, build)

        else:
            build = _build_image(builder, image, image_hash=image_hash)
            # BUG FIX: successful fresh builds were never recorded in
            # ready_builds, so every newly built image was reported as a
            # deployment blocker and the command aborted.
            if build.result != "failed":
                ready_builds[image] = build
            else:
                _show_failed_summary(builder, build)

    # Find any blockers and report them to the users
    blockers = []
    for image in images:
        if image not in ready_builds:
            blockers.append(image)
            click.secho(
                f"Image {image._image_name} could not be built, this is blocking deployment",
                fg="red",
            )
        else:
            build = ready_builds[image]
            image.uri = build.uri
    if blockers:
        raise click.Abort
156
+
157
+
158
@click.command()
@click.argument("workflow_file", type=click.File("r"))
def prepare(workflow_file: click.File):
    """Prepare a workflow and it's artifacts for deployment."""

    click.echo(f"Preparing deployment for {workflow_file.name}")
    client = ImageBuilderClient.from_env()
    seen_images: Dict[Image, str] = {}

    # Execute the workflow file so its Graph objects materialize, then walk
    # every graph node and record each distinct image with its hash.
    workflow_globals = {}
    with open(workflow_file.name, "r") as f:
        exec(f.read(), workflow_globals)

    graphs = {n: o for n, o in workflow_globals.items() if isinstance(o, Graph)}
    for name, graph in graphs.items():
        click.echo(f"Found graph {name}")
        for node_name, node in graph.nodes.items():
            image = node.image
            click.echo(
                f"graph function {node_name} uses image '{image._image_name}'"
            )
            if image not in seen_images:
                seen_images[image] = image.hash()

    click.echo(f"Found {len(seen_images)} images in this workflow")
    _prepare_images(client, seen_images)
186
+
187
+
188
@click.command(help="Extract and display logs from tensorlake")
@click.option("--image", "-i")
def show_logs(image: str):
    # Accepts either NAME (latest build) or NAME:HASH (specific build).
    if image:
        builder = ImageBuilderClient.from_env()
        if ":" in image:
            # split(":", 1): registry-style names may contain further colons.
            name, image_hash = image.split(":", 1)
            build = builder.find_build(name, image_hash)[0]
        else:
            build = builder.get_latest_build(image)

        log_response = builder.client.get(
            f"{builder.build_service}/v1/builds/{build.id}/log", headers=builder.headers
        )
        if log_response.status_code == 200:
            log = log_response.content.decode("utf-8")
            click.echo(log)
        elif log_response.status_code == 404:
            # Previously the command printed nothing at all on a miss; report
            # it, consistent with _show_failed_summary.
            click.echo("Logs not found")
        else:
            log_response.raise_for_status()
205
+
206
+
207
# Register the subcommands on the root group.
tensorlake.add_command(deploy)
tensorlake.add_command(prepare)
tensorlake.add_command(show_logs)

if __name__ == "__main__":
    tensorlake()
@@ -0,0 +1,58 @@
1
+ import hashlib
2
+ import mimetypes
3
+ import os
4
+ from abc import ABC, abstractmethod
5
+ from typing import List
6
+
7
+ from pydantic import BaseModel
8
+
9
+
10
class FileMetadata(BaseModel):
    """Filesystem metadata for a file handled by a data loader."""

    path: str
    file_size: int
    mime_type: str
    md5_hash: str
    created_at: int  # Unix epoch seconds
    updated_at: int  # Unix epoch seconds

    @classmethod
    def from_path(cls, path: str):
        """Build a FileMetadata by stat-ing and hashing the file at ``path``."""
        file_size = os.path.getsize(path)
        # BUG FIX: guess_type() returns None for unknown types; str(None)
        # previously produced the literal string "None". Fall back to the
        # generic binary MIME type instead.
        mime_type = mimetypes.guess_type(path)[0] or "application/octet-stream"

        # Compute MD5 hash (change detection only, not a security digest)
        hash_md5 = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        md5_hash = hash_md5.hexdigest()

        created_at = int(os.path.getctime(path))
        updated_at = int(os.path.getmtime(path))

        return cls(
            path=path,
            file_size=file_size,
            mime_type=mime_type,
            md5_hash=md5_hash,
            created_at=created_at,
            updated_at=updated_at,
        )
41
+
42
+
43
class DataLoader(ABC):
    """Abstract interface for file sources consumed by tensorlake.

    Implementations may use state() snapshots to resume and avoid returning
    the same file twice — presumably; verify against each concrete loader.
    """

    @abstractmethod
    def load(self) -> List[FileMetadata]:
        """Return metadata for the files currently available from this source."""
        pass

    @abstractmethod
    def read_all_bytes(self, file_metadata: FileMetadata) -> bytes:
        """Return the full content of the given file."""
        pass

    @abstractmethod
    def state(self) -> dict:
        """Return a serializable snapshot of the loader's progress."""
        pass
55
+
56
+
57
+ from .local_directory_loader import LocalDirectoryLoader
58
+ from .url_loader import UrlLoader
@@ -0,0 +1,37 @@
1
+ import os
2
+ from typing import List, Optional
3
+
4
+ from . import DataLoader, FileMetadata
5
+
6
+
7
class LocalDirectoryLoader(DataLoader):
    """DataLoader that walks a local directory tree, returning each matching
    file at most once across load() calls."""

    def __init__(
        self,
        directory: str,
        file_extensions: Optional[List[str]] = None,
        state: Optional[dict] = None,
    ):
        """
        Args:
            directory: root directory to walk.
            file_extensions: only files ending in one of these are returned;
                None means all files.
            state: resume state as produced by state(). BUG FIX: was a
                mutable default argument (``state: dict = {}``) shared
                across instances.
        """
        state = state or {}
        self.directory = directory
        self.file_extensions = file_extensions
        self.processed_files = set(state.get("processed_files", []))

    def load(self) -> List[FileMetadata]:
        """Return metadata for not-yet-seen files under the directory."""
        file_metadata_list = []
        for root, _, files in os.walk(self.directory):
            for file in files:
                if self.file_extensions is None or any(
                    file.endswith(ext) for ext in self.file_extensions
                ):
                    file_path = os.path.join(root, file)
                    if file_path not in self.processed_files:
                        file_metadata_list.append(FileMetadata.from_path(file_path))
                        self.processed_files.add(file_path)

        return file_metadata_list

    def read_all_bytes(self, file: FileMetadata) -> bytes:
        """Read the whole file from disk."""
        with open(file.path, "rb") as f:
            return f.read()

    def state(self) -> dict:
        """Serializable resume state: the already-returned file paths."""
        return {"processed_files": list(self.processed_files)}
@@ -0,0 +1,52 @@
1
import email.utils
from typing import List, Optional

import httpx

from . import DataLoader, FileMetadata
7
+
8
+
9
+ def convert_date_to_epoch(date_str: str) -> int:
10
+ """
11
+ Convert a date string from URL header to Unix epoch time.
12
+
13
+ Args:
14
+ date_str (str): The date string from the URL header.
15
+
16
+ Returns:
17
+ int: The Unix epoch time.
18
+ """
19
+ if not date_str:
20
+ return 0
21
+ parsed_date = email.utils.parsedate_to_datetime(date_str)
22
+ return int(parsed_date.timestamp())
23
+
24
+
25
+ class UrlLoader(DataLoader):
26
+ def __init__(self, urls: List[str], state: dict = {}):
27
+ self.urls = urls
28
+
29
+ def load(self) -> List[FileMetadata]:
30
+ file_metadata_list = []
31
+ for url in self.urls:
32
+ response = httpx.head(url, follow_redirects=True)
33
+ file_metadata_list.append(
34
+ FileMetadata(
35
+ path=url,
36
+ file_size=response.headers.get("content-length", 0),
37
+ mime_type=response.headers.get("content-type"),
38
+ md5_hash="",
39
+ created_at=convert_date_to_epoch(response.headers.get("date")),
40
+ updated_at=convert_date_to_epoch(
41
+ response.headers.get("last-modified")
42
+ ),
43
+ )
44
+ )
45
+ return file_metadata_list
46
+
47
+ def read_all_bytes(self, file: FileMetadata) -> bytes:
48
+ response = httpx.get(file.path, follow_redirects=True)
49
+ return response.content
50
+
51
+ def state(self) -> dict:
52
+ return {}
@@ -0,0 +1,8 @@
1
class ApiException(Exception):
    """Exception carrying an error message returned by the API."""

    def __init__(self, message: str) -> None:
        Exception.__init__(self, message)
4
+
5
+
6
+ class GraphStillProcessing(Exception):
7
+ def __init__(self) -> None:
8
+ super().__init__("graph is still processing")
@@ -0,0 +1,27 @@
1
+ from typing import Any, Dict, List, Literal, Optional, Union
2
+
3
+ from pydantic import BaseModel, Json
4
+
5
+
6
class FileInput(BaseModel):
    """Reference to a file by URL, with optional type/metadata/digest fields."""

    url: str
    mime_type: Optional[str] = None
    # Json field type: values are validated as JSON-encoded strings.
    metadata: Optional[Dict[str, Json]] = None
    # presumably a SHA-256 hex digest of the file content — TODO confirm
    sha_256: Optional[str] = None
11
+
12
+
13
class RouterOutput(BaseModel):
    """Edge names selected by a router."""

    edges: List[str]
15
+
16
+
17
class IndexifyData(BaseModel):
    """A payload plus the encoder used to (de)serialize it."""

    id: Optional[str] = None
    # bytes or str depending on the encoder — presumably bytes for
    # cloudpickle and str for json; verify against the serializer.
    payload: Union[bytes, str]
    encoder: Literal["cloudpickle", "json"] = "cloudpickle"
21
+
22
+
23
class File(BaseModel):
    """In-memory file content with optional type/metadata/digest fields."""

    data: bytes
    mime_type: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
    # presumably a SHA-256 hex digest of ``data`` — TODO confirm
    sha_256: Optional[str] = None