pyspiral 0.6.11__cp312-abi3-manylinux_2_28_aarch64.whl → 0.6.12__cp312-abi3-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyspiral might be problematic. Click here for more details.
- {pyspiral-0.6.11.dist-info → pyspiral-0.6.12.dist-info}/METADATA +1 -1
- {pyspiral-0.6.11.dist-info → pyspiral-0.6.12.dist-info}/RECORD +31 -27
- spiral/_lib.abi3.so +0 -0
- spiral/cli/key_spaces.py +1 -1
- spiral/cli/tables.py +3 -3
- spiral/client.py +20 -12
- spiral/core/client/__init__.pyi +8 -8
- spiral/core/expr/__init__.pyi +15 -0
- spiral/core/expr/images/__init__.pyi +3 -0
- spiral/core/expr/list_/__init__.pyi +4 -0
- spiral/core/expr/refs/__init__.pyi +4 -0
- spiral/core/expr/str_/__init__.pyi +3 -0
- spiral/core/expr/struct_/__init__.pyi +6 -0
- spiral/core/expr/text/__init__.pyi +5 -0
- spiral/core/expr/udf/__init__.pyi +14 -0
- spiral/core/expr/video/__init__.pyi +3 -0
- spiral/core/table/__init__.pyi +10 -1
- spiral/core/table/spec/__init__.pyi +4 -0
- spiral/dataloader.py +46 -37
- spiral/expressions/__init__.py +13 -20
- spiral/expressions/base.py +9 -4
- spiral/expressions/s3.py +18 -0
- spiral/expressions/tiff.py +2 -3
- spiral/expressions/udf.py +34 -25
- spiral/project.py +6 -6
- spiral/scan.py +28 -0
- spiral/streaming_/stream.py +1 -1
- spiral/table.py +25 -5
- spiral/transaction.py +27 -0
- spiral/expressions/http.py +0 -86
- spiral/expressions/io.py +0 -100
- spiral/expressions/mp4.py +0 -62
- spiral/expressions/png.py +0 -18
- spiral/expressions/qoi.py +0 -18
- spiral/expressions/refs.py +0 -58
- {pyspiral-0.6.11.dist-info → pyspiral-0.6.12.dist-info}/WHEEL +0 -0
- {pyspiral-0.6.11.dist-info → pyspiral-0.6.12.dist-info}/entry_points.txt +0 -0
spiral/expressions/tiff.py
CHANGED
|
@@ -2,7 +2,6 @@ import numpy as np
|
|
|
2
2
|
import pyarrow as pa
|
|
3
3
|
|
|
4
4
|
from spiral.expressions.base import Expr, ExprLike
|
|
5
|
-
from spiral.expressions.udf import RefUDF
|
|
6
5
|
|
|
7
6
|
_TIFF_RES_DTYPE: pa.DataType = pa.struct(
|
|
8
7
|
[
|
|
@@ -78,7 +77,7 @@ def select(
|
|
|
78
77
|
return TiffSelectUDF()(expr, shape, indexes)
|
|
79
78
|
|
|
80
79
|
|
|
81
|
-
class TiffReadUDF
|
|
80
|
+
class TiffReadUDF:
|
|
82
81
|
def __init__(self):
|
|
83
82
|
super().__init__("tiff.read")
|
|
84
83
|
|
|
@@ -122,7 +121,7 @@ class TiffReadUDF(RefUDF):
|
|
|
122
121
|
return _return_result(result, indexes)
|
|
123
122
|
|
|
124
123
|
|
|
125
|
-
class TiffSelectUDF
|
|
124
|
+
class TiffSelectUDF:
|
|
126
125
|
def __init__(self):
|
|
127
126
|
super().__init__("tiff.select")
|
|
128
127
|
|
spiral/expressions/udf.py
CHANGED
|
@@ -3,44 +3,53 @@ import abc
|
|
|
3
3
|
import pyarrow as pa
|
|
4
4
|
|
|
5
5
|
from spiral import _lib
|
|
6
|
-
from spiral.expressions.base import Expr
|
|
6
|
+
from spiral.expressions.base import Expr, ExprLike
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
class
|
|
10
|
-
|
|
11
|
-
self._udf = udf
|
|
9
|
+
class UDF(abc.ABC):
|
|
10
|
+
"""A User-Defined Function (UDF). This class should be subclassed to define custom UDFs.
|
|
12
11
|
|
|
13
|
-
|
|
14
|
-
"""Create an expression that calls this UDF with the given arguments."""
|
|
15
|
-
from spiral import expressions as se
|
|
12
|
+
Example:
|
|
16
13
|
|
|
17
|
-
|
|
18
|
-
|
|
14
|
+
```python
|
|
15
|
+
from spiral import expressions as se
|
|
16
|
+
import pyarrow as pa
|
|
19
17
|
|
|
20
|
-
|
|
21
|
-
|
|
18
|
+
class MyAdd(se.UDF):
|
|
19
|
+
def __init__(self):
|
|
20
|
+
super().__init__("my_add")
|
|
22
21
|
|
|
22
|
+
def return_type(self, scope: pa.DataType):
|
|
23
|
+
if not isinstance(scope, pa.StructType):
|
|
24
|
+
raise ValueError("Expected struct type as input")
|
|
25
|
+
return scope.field(0).type
|
|
23
26
|
|
|
24
|
-
|
|
25
|
-
|
|
27
|
+
def invoke(self, scope: pa.Array):
|
|
28
|
+
if not isinstance(scope, pa.StructArray):
|
|
29
|
+
raise ValueError("Expected struct array as input")
|
|
30
|
+
return pa.compute.add(scope.field(0), scope.field(1))
|
|
26
31
|
|
|
27
|
-
|
|
28
|
-
super().__init__(_lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke))
|
|
32
|
+
my_add = MyAdd()
|
|
29
33
|
|
|
30
|
-
|
|
31
|
-
|
|
34
|
+
expr = my_add(table.select("first_arg", "second_arg"))
|
|
35
|
+
```
|
|
36
|
+
"""
|
|
32
37
|
|
|
38
|
+
def __init__(self, name: str):
|
|
39
|
+
self._udf = _lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke)
|
|
33
40
|
|
|
34
|
-
|
|
35
|
-
|
|
41
|
+
def __call__(self, scope: ExprLike) -> Expr:
|
|
42
|
+
"""Create an expression that calls this UDF with the given arguments."""
|
|
43
|
+
from spiral import expressions as se
|
|
36
44
|
|
|
37
|
-
|
|
38
|
-
super().__init__(_lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke, scope="ref"))
|
|
45
|
+
return Expr(self._udf(se.lift(scope).__expr__))
|
|
39
46
|
|
|
40
47
|
@abc.abstractmethod
|
|
41
|
-
def
|
|
42
|
-
"""
|
|
48
|
+
def return_type(self, scope: pa.DataType) -> pa.DataType:
|
|
49
|
+
"""Must return the return type of the UDF given the input scope type."""
|
|
50
|
+
...
|
|
43
51
|
|
|
44
|
-
|
|
45
|
-
|
|
52
|
+
@abc.abstractmethod
|
|
53
|
+
def invoke(self, scope: pa.Array) -> pa.Array:
|
|
54
|
+
"""Must implement the UDF logic given the input scope array."""
|
|
46
55
|
...
|
spiral/project.py
CHANGED
|
@@ -53,7 +53,7 @@ class Project:
|
|
|
53
53
|
res = res[0]
|
|
54
54
|
|
|
55
55
|
return Table(
|
|
56
|
-
self._spiral, self._spiral.
|
|
56
|
+
self._spiral, self._spiral.core.table(res.id), identifier=f"{res.project_id}.{res.dataset}.{res.table}"
|
|
57
57
|
)
|
|
58
58
|
|
|
59
59
|
def create_table(
|
|
@@ -78,7 +78,7 @@ class Project:
|
|
|
78
78
|
key_schema = pa.schema(key_schema)
|
|
79
79
|
key_schema = Schema.from_arrow(key_schema)
|
|
80
80
|
|
|
81
|
-
core_table = self._spiral.
|
|
81
|
+
core_table = self._spiral.core.create_table(
|
|
82
82
|
project_id=self._id,
|
|
83
83
|
dataset=dataset,
|
|
84
84
|
table=table,
|
|
@@ -105,7 +105,7 @@ class Project:
|
|
|
105
105
|
raise ValueError(f"Index not found: {name}")
|
|
106
106
|
res = res[0]
|
|
107
107
|
|
|
108
|
-
return TextIndex(self._spiral.
|
|
108
|
+
return TextIndex(self._spiral.core.text_index(res.id), name=name)
|
|
109
109
|
|
|
110
110
|
def create_text_index(
|
|
111
111
|
self,
|
|
@@ -135,7 +135,7 @@ class Project:
|
|
|
135
135
|
if where is not None:
|
|
136
136
|
where = se.lift(where)
|
|
137
137
|
|
|
138
|
-
core_index = self._spiral.
|
|
138
|
+
core_index = self._spiral.core.create_text_index(
|
|
139
139
|
project_id=self._id,
|
|
140
140
|
name=name,
|
|
141
141
|
projection=projection.__expr__,
|
|
@@ -154,7 +154,7 @@ class Project:
|
|
|
154
154
|
raise ValueError(f"Index not found: {name}")
|
|
155
155
|
res = res[0]
|
|
156
156
|
|
|
157
|
-
return KeySpaceIndex(self._spiral.
|
|
157
|
+
return KeySpaceIndex(self._spiral.core.key_space_index(res.id), name=name)
|
|
158
158
|
|
|
159
159
|
def create_key_space_index(
|
|
160
160
|
self,
|
|
@@ -185,7 +185,7 @@ class Project:
|
|
|
185
185
|
if where is not None:
|
|
186
186
|
where = se.lift(where)
|
|
187
187
|
|
|
188
|
-
core_index = self._spiral.
|
|
188
|
+
core_index = self._spiral.core.create_key_space_index(
|
|
189
189
|
project_id=self._id,
|
|
190
190
|
name=name,
|
|
191
191
|
granularity=granularity,
|
spiral/scan.py
CHANGED
|
@@ -4,6 +4,7 @@ import pyarrow as pa
|
|
|
4
4
|
|
|
5
5
|
from spiral.core.client import Shard, ShuffleConfig
|
|
6
6
|
from spiral.core.table import Scan as CoreScan
|
|
7
|
+
from spiral.core.table import ScanState as CoreScanState
|
|
7
8
|
from spiral.core.table.spec import Schema
|
|
8
9
|
from spiral.settings import CI, DEV
|
|
9
10
|
|
|
@@ -18,12 +19,34 @@ if TYPE_CHECKING:
|
|
|
18
19
|
from spiral.dataloader import SpiralDataLoader, World # noqa
|
|
19
20
|
|
|
20
21
|
|
|
22
|
+
class ScanState:
|
|
23
|
+
"""
|
|
24
|
+
Evaluated properties of the scan
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
__slots__ = ("core",)
|
|
28
|
+
|
|
29
|
+
def __init__(self, core: CoreScanState):
|
|
30
|
+
self.core = core
|
|
31
|
+
|
|
32
|
+
def __getstate__(self):
|
|
33
|
+
return self.core.to_json()
|
|
34
|
+
|
|
35
|
+
def __setstate__(self, state):
|
|
36
|
+
self.core = CoreScanState.from_json(state)
|
|
37
|
+
|
|
38
|
+
|
|
21
39
|
class Scan:
|
|
22
40
|
"""Scan object."""
|
|
23
41
|
|
|
24
42
|
def __init__(self, core: CoreScan):
|
|
25
43
|
self.core = core
|
|
26
44
|
|
|
45
|
+
@property
|
|
46
|
+
def scan_state(self) -> ScanState:
|
|
47
|
+
"""Returns evaluated properties of the scan."""
|
|
48
|
+
return ScanState(self.core.scan_state())
|
|
49
|
+
|
|
27
50
|
@property
|
|
28
51
|
def metrics(self) -> dict[str, Any]:
|
|
29
52
|
"""Returns metrics about the scan."""
|
|
@@ -34,6 +57,11 @@ class Scan:
|
|
|
34
57
|
"""Returns the schema of the scan."""
|
|
35
58
|
return self.core.schema()
|
|
36
59
|
|
|
60
|
+
@property
|
|
61
|
+
def key_schema(self) -> Schema:
|
|
62
|
+
"""Returns the key schema of the scan."""
|
|
63
|
+
return self.core.key_schema()
|
|
64
|
+
|
|
37
65
|
def is_empty(self) -> bool:
|
|
38
66
|
"""Check if the Spiral is empty for the given key range.
|
|
39
67
|
|
spiral/streaming_/stream.py
CHANGED
spiral/table.py
CHANGED
|
@@ -18,6 +18,7 @@ if TYPE_CHECKING:
|
|
|
18
18
|
from spiral.client import Spiral
|
|
19
19
|
from spiral.dataloader import SpiralDataLoader
|
|
20
20
|
from spiral.key_space_index import KeySpaceIndex
|
|
21
|
+
from spiral.scan import Scan
|
|
21
22
|
|
|
22
23
|
|
|
23
24
|
class Table(Expr):
|
|
@@ -110,6 +111,25 @@ class Table(Expr):
|
|
|
110
111
|
partition_size_bytes=partition_size_bytes,
|
|
111
112
|
)
|
|
112
113
|
|
|
114
|
+
def writeback(
|
|
115
|
+
self,
|
|
116
|
+
scan: "Scan",
|
|
117
|
+
*,
|
|
118
|
+
partition_size_bytes: int | None = None,
|
|
119
|
+
) -> None:
|
|
120
|
+
"""Write back the results of a scan to the table.
|
|
121
|
+
|
|
122
|
+
:param scan: The scan to write back.
|
|
123
|
+
The scan does NOT need to be over the same table as transaction,
|
|
124
|
+
but it does need to have the same key schema.
|
|
125
|
+
:param partition_size_bytes: The maximum partition size in bytes.
|
|
126
|
+
"""
|
|
127
|
+
with self.txn() as txn:
|
|
128
|
+
txn.writeback(
|
|
129
|
+
scan,
|
|
130
|
+
partition_size_bytes=partition_size_bytes,
|
|
131
|
+
)
|
|
132
|
+
|
|
113
133
|
def drop_columns(self, column_paths: list[str]) -> None:
|
|
114
134
|
"""
|
|
115
135
|
Drops the specified columns from the table.
|
|
@@ -136,7 +156,7 @@ class Table(Expr):
|
|
|
136
156
|
it is important that the primary key columns are unique within the transaction.
|
|
137
157
|
The behavior is undefined if this is not the case.
|
|
138
158
|
"""
|
|
139
|
-
return Transaction(self.spiral.
|
|
159
|
+
return Transaction(self.spiral.core.transaction(self.core, settings().file_format, retries=retries))
|
|
140
160
|
|
|
141
161
|
def to_dataset(self) -> "ds.Dataset":
|
|
142
162
|
"""Returns a PyArrow Dataset representing the table."""
|
|
@@ -175,7 +195,7 @@ class Table(Expr):
|
|
|
175
195
|
if index.asof == 0:
|
|
176
196
|
raise ValueError("Index have to be synced before it can be used.")
|
|
177
197
|
|
|
178
|
-
shards = self.spiral.
|
|
198
|
+
shards = self.spiral.internal.compute_shards(index=index.core)
|
|
179
199
|
|
|
180
200
|
return self.spiral.scan(
|
|
181
201
|
projection if projection is not None else index.projection,
|
|
@@ -208,7 +228,7 @@ class Table(Expr):
|
|
|
208
228
|
if index.asof == 0:
|
|
209
229
|
raise ValueError("Index have to be synced before it can be used.")
|
|
210
230
|
|
|
211
|
-
shards = self.spiral.
|
|
231
|
+
shards = self.spiral.core.internal.compute_shards(index=index.core)
|
|
212
232
|
|
|
213
233
|
return self.spiral.scan(
|
|
214
234
|
projection if projection is not None else index.projection,
|
|
@@ -240,7 +260,7 @@ class Table(Expr):
|
|
|
240
260
|
if index.asof == 0:
|
|
241
261
|
raise ValueError("Index have to be synced before it can be used.")
|
|
242
262
|
|
|
243
|
-
shards = self.spiral.
|
|
263
|
+
shards = self.spiral.core.internal.compute_shards(index=index.core)
|
|
244
264
|
|
|
245
265
|
return self.spiral.scan(
|
|
246
266
|
index.projection,
|
|
@@ -282,7 +302,7 @@ class Table(Expr):
|
|
|
282
302
|
where=index.filter,
|
|
283
303
|
asof=index.asof,
|
|
284
304
|
)
|
|
285
|
-
shards = self.spiral.
|
|
305
|
+
shards = self.spiral.internal.compute_shards(index=index.core)
|
|
286
306
|
|
|
287
307
|
return SpiralStream(
|
|
288
308
|
sp=self.spiral,
|
spiral/transaction.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
from spiral.core.table import Transaction as CoreTransaction
|
|
2
|
+
from spiral.core.table.spec import Operation
|
|
2
3
|
from spiral.expressions.base import ExprLike
|
|
4
|
+
from spiral.scan import Scan
|
|
3
5
|
|
|
4
6
|
|
|
5
7
|
class Transaction:
|
|
@@ -39,6 +41,17 @@ class Transaction:
|
|
|
39
41
|
|
|
40
42
|
self._core.write(record_batches, partition_size_bytes=partition_size_bytes)
|
|
41
43
|
|
|
44
|
+
def writeback(self, scan: Scan, *, partition_size_bytes: int | None = None):
|
|
45
|
+
"""Write back the results of a scan to the table.
|
|
46
|
+
|
|
47
|
+
:param scan: The scan to write back.
|
|
48
|
+
The scan does NOT need to be over the same table as transaction,
|
|
49
|
+
but it does need to have the same key schema.
|
|
50
|
+
:param partition_size_bytes: The maximum partition size in bytes.
|
|
51
|
+
If not provided, the default partition size is used.
|
|
52
|
+
"""
|
|
53
|
+
self._core.writeback(scan.core, partition_size_bytes=partition_size_bytes)
|
|
54
|
+
|
|
42
55
|
def drop_columns(self, column_paths: list[str]):
|
|
43
56
|
"""
|
|
44
57
|
Drops the specified columns from the table.
|
|
@@ -49,6 +62,20 @@ class Transaction:
|
|
|
49
62
|
"""
|
|
50
63
|
self._core.drop_columns(column_paths)
|
|
51
64
|
|
|
65
|
+
def take(self) -> list[Operation]:
|
|
66
|
+
"""Take the operations from the transaction
|
|
67
|
+
|
|
68
|
+
Transaction can no longer be committed or aborted after calling this method.
|
|
69
|
+
."""
|
|
70
|
+
return self._core.take()
|
|
71
|
+
|
|
72
|
+
def include(self, ops: list[Operation]):
|
|
73
|
+
"""Include the given operations in the transaction.
|
|
74
|
+
|
|
75
|
+
Checks for conflicts between the included operations and any existing operations.
|
|
76
|
+
"""
|
|
77
|
+
self._core.include(ops)
|
|
78
|
+
|
|
52
79
|
def commit(self):
|
|
53
80
|
"""Commit the transaction."""
|
|
54
81
|
self._core.commit()
|
spiral/expressions/http.py
DELETED
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
import hishel
|
|
2
|
-
import httpx
|
|
3
|
-
import pyarrow as pa
|
|
4
|
-
|
|
5
|
-
from spiral.expressions.base import Expr, ExprLike
|
|
6
|
-
from spiral.expressions.struct import pack
|
|
7
|
-
from spiral.expressions.udf import UDF
|
|
8
|
-
from spiral.settings import APP_DIR
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def get(url: ExprLike, headers: ExprLike = None, force_cache: bool = False) -> Expr:
|
|
12
|
-
"""Submit a GET request to either a scalar of vector of URLs."""
|
|
13
|
-
to_pack = {"url": url}
|
|
14
|
-
if headers is not None:
|
|
15
|
-
to_pack["headers"] = headers
|
|
16
|
-
return HttpGet(force_cache)(pack(to_pack))
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class HttpGet(UDF):
|
|
20
|
-
RES_DTYPE: pa.DataType = pa.struct(
|
|
21
|
-
[
|
|
22
|
-
pa.field("bytes", pa.large_binary()),
|
|
23
|
-
pa.field("status", pa.int32()),
|
|
24
|
-
pa.field("headers", pa.map_(pa.string(), pa.string())),
|
|
25
|
-
]
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
def __init__(self, force_cache: bool = False):
|
|
29
|
-
super().__init__("http.get")
|
|
30
|
-
self._force_cache = force_cache
|
|
31
|
-
|
|
32
|
-
def return_type(self, *input_types: pa.DataType) -> pa.DataType:
|
|
33
|
-
return HttpGet.RES_DTYPE
|
|
34
|
-
|
|
35
|
-
def invoke(self, *input_args: pa.Array) -> pa.Array:
|
|
36
|
-
if len(input_args) != 1:
|
|
37
|
-
raise ValueError(f"Expected 1 argument, got {len(input_args)}")
|
|
38
|
-
result = _http_request(input_args[0], self._force_cache)
|
|
39
|
-
if isinstance(result, pa.ChunkedArray):
|
|
40
|
-
result = result.combine_chunks()
|
|
41
|
-
return result
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def _http_request(arg: pa.Array, force_cache: bool) -> pa.Array:
|
|
45
|
-
client = _HttpClient()
|
|
46
|
-
|
|
47
|
-
if isinstance(arg, pa.StructArray):
|
|
48
|
-
# We assume a vector of requests, but with potentially many arguments
|
|
49
|
-
return pa.array(
|
|
50
|
-
[
|
|
51
|
-
_response_dict(
|
|
52
|
-
client.request(
|
|
53
|
-
req.get("method", "GET").upper(),
|
|
54
|
-
req["url"],
|
|
55
|
-
headers=req.get("headers", {}),
|
|
56
|
-
extensions={"force_cache": force_cache},
|
|
57
|
-
)
|
|
58
|
-
)
|
|
59
|
-
for req in arg.to_pylist()
|
|
60
|
-
],
|
|
61
|
-
type=HttpGet.RES_DTYPE,
|
|
62
|
-
)
|
|
63
|
-
|
|
64
|
-
raise TypeError(f"Unsupported argument: {arg} ({type(arg)})")
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def _response_dict(response: httpx.Response) -> dict:
|
|
68
|
-
if response.status_code != 200:
|
|
69
|
-
raise ValueError(f"Request failed with status {response.status_code}")
|
|
70
|
-
return {
|
|
71
|
-
"bytes": response.read(),
|
|
72
|
-
"status": response.status_code,
|
|
73
|
-
"headers": dict(response.headers),
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
class _HttpClient(hishel.CacheClient):
|
|
78
|
-
_instance: "_HttpClient" = None
|
|
79
|
-
|
|
80
|
-
def __new__(cls, *args, **kwargs):
|
|
81
|
-
if not cls._instance:
|
|
82
|
-
cls._instance = super().__new__(cls)
|
|
83
|
-
return cls._instance
|
|
84
|
-
|
|
85
|
-
def __init__(self):
|
|
86
|
-
super().__init__(storage=hishel.FileStorage(base_path=APP_DIR / "http.cache", ttl=3600))
|
spiral/expressions/io.py
DELETED
|
@@ -1,100 +0,0 @@
|
|
|
1
|
-
import tarfile
|
|
2
|
-
from io import BytesIO
|
|
3
|
-
|
|
4
|
-
import pyarrow as pa
|
|
5
|
-
|
|
6
|
-
from spiral.expressions.base import Expr, ExprLike
|
|
7
|
-
from spiral.expressions.struct import pack
|
|
8
|
-
from spiral.expressions.udf import UDF
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def read_file(path: ExprLike) -> Expr:
|
|
12
|
-
"""
|
|
13
|
-
Read file path(s) from disk into a struct with a single field "bytes" containing the file contents.
|
|
14
|
-
|
|
15
|
-
Args:
|
|
16
|
-
path: Expression evaluating to an array of strings representing local disk paths.
|
|
17
|
-
"""
|
|
18
|
-
to_pack = {"path": path}
|
|
19
|
-
return FileRead()(pack(to_pack))
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class FileRead(UDF):
|
|
23
|
-
RES_DTYPE: pa.DataType = pa.struct(
|
|
24
|
-
[
|
|
25
|
-
pa.field("bytes", pa.large_binary()),
|
|
26
|
-
]
|
|
27
|
-
)
|
|
28
|
-
|
|
29
|
-
def __init__(self):
|
|
30
|
-
super().__init__("file.read")
|
|
31
|
-
|
|
32
|
-
def return_type(self, *input_types: pa.DataType) -> pa.DataType:
|
|
33
|
-
return FileRead.RES_DTYPE
|
|
34
|
-
|
|
35
|
-
def invoke(self, *input_args: pa.Array) -> pa.Array:
|
|
36
|
-
if len(input_args) != 1:
|
|
37
|
-
raise ValueError(f"Expected 1 argument, got {len(input_args)}")
|
|
38
|
-
arg = input_args[0]
|
|
39
|
-
|
|
40
|
-
res = []
|
|
41
|
-
for req in arg:
|
|
42
|
-
with open(req["path"].as_py(), "rb") as f:
|
|
43
|
-
res.append({"bytes": f.read()})
|
|
44
|
-
|
|
45
|
-
return pa.array(res, type=FileRead.RES_DTYPE)
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def read_tar(path: ExprLike = None, bytes_: ExprLike = None) -> "Expr":
|
|
49
|
-
# Untar a vector of paths / byte arrays representing tarballs.
|
|
50
|
-
if path is None and bytes_ is None:
|
|
51
|
-
raise ValueError("Expected either path or bytes_ to be provided")
|
|
52
|
-
to_pack = {}
|
|
53
|
-
if path is not None:
|
|
54
|
-
to_pack["path"] = path
|
|
55
|
-
if bytes_ is not None:
|
|
56
|
-
to_pack["bytes"] = bytes_
|
|
57
|
-
return TarRead()(pack(to_pack))
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
class TarRead(UDF):
|
|
61
|
-
RES_DTYPE = pa.list_(
|
|
62
|
-
pa.struct(
|
|
63
|
-
[
|
|
64
|
-
pa.field("name", pa.string()),
|
|
65
|
-
pa.field("bytes", pa.large_binary()),
|
|
66
|
-
]
|
|
67
|
-
)
|
|
68
|
-
)
|
|
69
|
-
|
|
70
|
-
def __init__(self):
|
|
71
|
-
super().__init__("tar.read")
|
|
72
|
-
|
|
73
|
-
def return_type(self, *input_types: pa.DataType) -> pa.DataType:
|
|
74
|
-
return TarRead.RES_DTYPE
|
|
75
|
-
|
|
76
|
-
def invoke(self, *input_args: pa.Array) -> pa.Array:
|
|
77
|
-
if len(input_args) != 1:
|
|
78
|
-
raise ValueError(f"Expected 1 argument, got {len(input_args)}")
|
|
79
|
-
arg = input_args[0]
|
|
80
|
-
|
|
81
|
-
res = []
|
|
82
|
-
for req in arg:
|
|
83
|
-
if "path" in req:
|
|
84
|
-
kwargs = {"name": req["path"].as_py()}
|
|
85
|
-
elif "bytes" in req:
|
|
86
|
-
kwargs = {"fileobj": BytesIO(req["bytes"].as_py())}
|
|
87
|
-
else:
|
|
88
|
-
raise ValueError("Expected path or bytes_ to be provided")
|
|
89
|
-
|
|
90
|
-
files = []
|
|
91
|
-
with tarfile.open(**kwargs) as f:
|
|
92
|
-
for m in f.getmembers():
|
|
93
|
-
m: tarfile.TarInfo
|
|
94
|
-
if m.type == tarfile.DIRTYPE:
|
|
95
|
-
continue
|
|
96
|
-
# TODO(ngates): skip other types too maybe? Why are we even skipping directories?
|
|
97
|
-
files.append({"name": m.name, "bytes": f.extractfile(m).read()})
|
|
98
|
-
res.append(files)
|
|
99
|
-
|
|
100
|
-
return pa.array(res, type=TarRead.RES_DTYPE)
|
spiral/expressions/mp4.py
DELETED
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
import pyarrow as pa
|
|
2
|
-
|
|
3
|
-
from spiral.expressions.base import Expr, ExprLike
|
|
4
|
-
|
|
5
|
-
_MP4_RES_DTYPE: pa.DataType = pa.struct(
|
|
6
|
-
[
|
|
7
|
-
pa.field("pixels", pa.large_binary()),
|
|
8
|
-
pa.field("height", pa.uint32()),
|
|
9
|
-
pa.field("width", pa.uint32()),
|
|
10
|
-
pa.field("frames", pa.uint32()),
|
|
11
|
-
]
|
|
12
|
-
)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
# TODO(marko): Support optional range and crop.
|
|
16
|
-
# IMPORTANT: Frames is currently broken and defaults to full.
|
|
17
|
-
def read(expr: ExprLike | str, frames: ExprLike | str, crop: ExprLike | str):
|
|
18
|
-
"""
|
|
19
|
-
Read referenced cell in a `MP4` format. Requires `ffmpeg`.
|
|
20
|
-
|
|
21
|
-
Args:
|
|
22
|
-
expr: The referenced `Mp4` bytes.
|
|
23
|
-
A str is assumed to be the `se.aux` expression.
|
|
24
|
-
frames: The range of frames to read. Each element must be a list of two uint32,
|
|
25
|
-
frame start and frame end, or null / empty list to read all frames.
|
|
26
|
-
A str is assumed to be the `se.aux` expression.
|
|
27
|
-
crop: The crop of the frames to read. Each element must be a list of four uint32,
|
|
28
|
-
x, y, width, height or null / empty list to read full frames.
|
|
29
|
-
A str is assumed to be the `se.aux` expression.
|
|
30
|
-
|
|
31
|
-
Returns:
|
|
32
|
-
An array where each element is a decoded cropped video with fields:
|
|
33
|
-
pixels: RGB8 bytes, frames * width * height * 3.
|
|
34
|
-
width: Width of the image with type `pa.uint32()`.
|
|
35
|
-
height: Height of the image with type `pa.uint32()`.
|
|
36
|
-
frames: Number of frames with type `pa.uint32()`.
|
|
37
|
-
"""
|
|
38
|
-
from spiral import _lib
|
|
39
|
-
from spiral.expressions import aux, lift
|
|
40
|
-
|
|
41
|
-
if isinstance(expr, str):
|
|
42
|
-
expr = aux(
|
|
43
|
-
expr,
|
|
44
|
-
pa.struct([("__ref__", pa.struct([("id", pa.string()), ("begin", pa.uint64()), ("end", pa.uint64())]))]),
|
|
45
|
-
)
|
|
46
|
-
if isinstance(frames, str):
|
|
47
|
-
frames = aux(frames, pa.list_(pa.uint32()))
|
|
48
|
-
if isinstance(crop, str):
|
|
49
|
-
crop = aux(crop, pa.list_(pa.uint32()))
|
|
50
|
-
|
|
51
|
-
expr = lift(expr)
|
|
52
|
-
frames = lift(frames)
|
|
53
|
-
crop = lift(crop)
|
|
54
|
-
|
|
55
|
-
return Expr(
|
|
56
|
-
_lib.expr.video.read(
|
|
57
|
-
expr.__expr__,
|
|
58
|
-
frames.__expr__,
|
|
59
|
-
crop.__expr__,
|
|
60
|
-
format="mp4",
|
|
61
|
-
)
|
|
62
|
-
)
|
spiral/expressions/png.py
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
from spiral.expressions.base import Expr, ExprLike
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def encode(expr: ExprLike) -> Expr:
|
|
5
|
-
"""Encode the given expression as a PNG image.
|
|
6
|
-
|
|
7
|
-
Args:
|
|
8
|
-
expr: The expression to encode.
|
|
9
|
-
Expects a struct with `pixels`, `width`, `height`, `channels`, `channel_bit_depth` fields.
|
|
10
|
-
|
|
11
|
-
Returns:
|
|
12
|
-
The encoded PNG images.
|
|
13
|
-
"""
|
|
14
|
-
from spiral import _lib
|
|
15
|
-
from spiral.expressions import lift
|
|
16
|
-
|
|
17
|
-
expr = lift(expr)
|
|
18
|
-
return Expr(_lib.expr.img.encode(expr.__expr__, format="png"))
|
spiral/expressions/qoi.py
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
from spiral.expressions.base import Expr, ExprLike
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def encode(expr: ExprLike) -> Expr:
|
|
5
|
-
"""Encode the given expression as a QOI image.
|
|
6
|
-
|
|
7
|
-
Args:
|
|
8
|
-
expr: The expression to encode.
|
|
9
|
-
Expects a struct with `pixels`, `width`, `height`, `channels`, `channel_bit_depth` fields.
|
|
10
|
-
|
|
11
|
-
Returns:
|
|
12
|
-
The encoded QOI images.
|
|
13
|
-
"""
|
|
14
|
-
from spiral import _lib
|
|
15
|
-
from spiral.expressions import lift
|
|
16
|
-
|
|
17
|
-
expr = lift(expr)
|
|
18
|
-
return Expr(_lib.expr.img.encode(expr.__expr__, format="qoi"))
|