pyspiral 0.6.11__cp312-abi3-manylinux_2_28_aarch64.whl → 0.6.13__cp312-abi3-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (41)
  1. {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/METADATA +8 -5
  2. {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/RECORD +36 -30
  3. spiral/__init__.py +7 -0
  4. spiral/_lib.abi3.so +0 -0
  5. spiral/cli/iceberg.py +1 -1
  6. spiral/cli/key_spaces.py +15 -1
  7. spiral/cli/tables.py +3 -3
  8. spiral/client.py +12 -11
  9. spiral/core/client/__init__.pyi +8 -8
  10. spiral/core/expr/__init__.pyi +15 -0
  11. spiral/core/expr/images/__init__.pyi +3 -0
  12. spiral/core/expr/list_/__init__.pyi +4 -0
  13. spiral/core/expr/refs/__init__.pyi +4 -0
  14. spiral/core/expr/str_/__init__.pyi +3 -0
  15. spiral/core/expr/struct_/__init__.pyi +6 -0
  16. spiral/core/expr/text/__init__.pyi +5 -0
  17. spiral/core/expr/udf/__init__.pyi +14 -0
  18. spiral/core/expr/video/__init__.pyi +3 -0
  19. spiral/core/table/__init__.pyi +19 -1
  20. spiral/core/table/spec/__init__.pyi +6 -0
  21. spiral/dataloader.py +52 -38
  22. spiral/enrichment.py +153 -0
  23. spiral/expressions/__init__.py +15 -19
  24. spiral/expressions/base.py +9 -4
  25. spiral/expressions/http.py +10 -80
  26. spiral/expressions/s3.py +15 -0
  27. spiral/expressions/tiff.py +2 -3
  28. spiral/expressions/udf.py +38 -24
  29. spiral/project.py +6 -6
  30. spiral/scan.py +76 -33
  31. spiral/settings.py +9 -6
  32. spiral/streaming_/stream.py +1 -1
  33. spiral/table.py +41 -9
  34. spiral/transaction.py +42 -0
  35. spiral/expressions/io.py +0 -100
  36. spiral/expressions/mp4.py +0 -62
  37. spiral/expressions/png.py +0 -18
  38. spiral/expressions/qoi.py +0 -18
  39. spiral/expressions/refs.py +0 -58
  40. {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/WHEEL +0 -0
  41. {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/entry_points.txt +0 -0
spiral/scan.py CHANGED
@@ -1,8 +1,10 @@
+from functools import partial
 from typing import TYPE_CHECKING, Any, Optional
 
 import pyarrow as pa
 
 from spiral.core.client import Shard, ShuffleConfig
+from spiral.core.table import KeyRange
 from spiral.core.table import Scan as CoreScan
 from spiral.core.table.spec import Schema
 from spiral.settings import CI, DEV
@@ -15,13 +17,15 @@ if TYPE_CHECKING:
     import streaming # noqa
     import torch.utils.data as torchdata # noqa
 
+    from spiral.client import Spiral
     from spiral.dataloader import SpiralDataLoader, World # noqa
 
 
 class Scan:
     """Scan object."""
 
-    def __init__(self, core: CoreScan):
+    def __init__(self, spiral: "Spiral", core: CoreScan):
+        self.spiral = spiral
         self.core = core
 
     @property
@@ -34,6 +38,11 @@ class Scan:
         """Returns the schema of the scan."""
         return self.core.schema()
 
+    @property
+    def key_schema(self) -> Schema:
+        """Returns the key schema of the scan."""
+        return self.core.key_schema()
+
     def is_empty(self) -> bool:
         """Check if the Spiral is empty for the given key range.
 
@@ -44,6 +53,8 @@
 
     def to_record_batches(
         self,
+        *,
+        key_range: KeyRange | None = None,
         key_table: pa.Table | pa.RecordBatchReader | None = None,
         batch_size: int | None = None,
         batch_readahead: int | None = None,
@@ -51,6 +62,9 @@
         """Read as a stream of RecordBatches.
 
         Args:
+            key_range: Optional key range to filter the scan.
+                If provided, the scan will only return rows within the key range.
+                Only one of key_range or key_table can be provided.
            key_table: a table of keys to "take" (including aux columns for cell-push-down).
                If None, the scan will be executed without a key table.
            batch_size: the maximum number of rows per returned batch.
@@ -58,6 +72,9 @@
                RecordBatchReader, the batch_size argument must be None, and the existing batching is respected.
            batch_readahead: the number of batches to prefetch in the background.
        """
+        if key_range is not None and key_table is not None:
+            raise ValueError("Only one of key_range or key_table can be provided.")
+
        if isinstance(key_table, pa.RecordBatchReader):
            if batch_size is not None:
                raise ValueError(
@@ -66,46 +83,54 @@ class Scan:
        elif isinstance(key_table, pa.Table):
            key_table = key_table.to_reader(max_chunksize=batch_size)
 
-        return self.core.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
+        return self.core.to_record_batches(key_range=key_range, key_table=key_table, batch_readahead=batch_readahead)
 
    def to_table(
        self,
+        *,
+        key_range: KeyRange | None = None,
        key_table: pa.Table | pa.RecordBatchReader | None = None,
    ) -> pa.Table:
        """Read into a single PyArrow Table.
 
        Args:
+            key_range: Optional key range to filter the scan.
+                If provided, the scan will only return rows within the key range.
+                Only one of key_range or key_table can be provided.
            key_table: a table of keys to "take" (including aux columns for cell-push-down).
                If None, the scan will be executed without a key table.
        """
        # NOTE: Evaluates fully on Rust side which improved debuggability.
-        if DEV and not CI and key_table is None:
+        if DEV and not CI and key_table is None and key_range is None:
            rb = self.core.to_record_batch()
            return pa.Table.from_batches([rb])
 
-        return self.to_record_batches(key_table=key_table).read_all()
+        return self.to_record_batches(key_range=key_range, key_table=key_table).read_all()
 
    def to_dask(self) -> "dd.DataFrame":
        """Read into a Dask DataFrame.
 
        Requires the `dask` package to be installed.
+
+        IMPORTANT: Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
+        usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
+        encountering such issues, please reach out to the support for assistance.
        """
        import dask.dataframe as dd
-        import pandas as pd
 
-        def _read_shard(shard: Shard) -> pd.DataFrame:
-            # TODO(ngates): we need a way to preserve the existing asofs?
-            raise NotImplementedError()
-
-        # Fetch a set of partition ranges
+        _read_shard = partial(
+            _read_shard_task,
+            settings_dict=self.spiral.config.model_dump(),
+            state_json=self.core.scan_state().to_json(),
+        )
        return dd.from_map(_read_shard, self.shards())
 
-    def to_pandas(self) -> "pd.DataFrame":
+    def to_pandas(self, *, key_range: KeyRange | None = None) -> "pd.DataFrame":
        """Read into a Pandas DataFrame.
 
        Requires the `pandas` package to be installed.
        """
-        return self.to_table().to_pandas()
+        return self.to_table(key_range=key_range).to_pandas()
 
    def to_polars(self) -> "pl.DataFrame":
        """Read into a Polars DataFrame.
@@ -160,16 +185,18 @@
 
        Returns:
            SpiralDataLoader with shards partitioned for this rank.
-        """
-        # Example usage:
-        #
-        # Auto-detect from PyTorch distributed:
-        # loader: SpiralDataLoader = scan.to_distributed_data_loader(batch_size=32)
-        #
-        # Explicit world configuration:
-        # world = World(rank=0, world_size=4)
-        # loader: SpiralDataLoader = scan.to_distributed_data_loader(world=world, batch_size=32)
 
+        Auto-detect from PyTorch distributed:
+        ```python
+        loader: SpiralDataLoader = scan.to_distributed_data_loader(batch_size=32)
+        ```
+
+        Explicit world configuration:
+        ```python
+        world = World(rank=0, world_size=4)
+        loader: SpiralDataLoader = scan.to_distributed_data_loader(world=world, batch_size=32)
+        ```
+        """
        from spiral.dataloader import SpiralDataLoader, World
 
        if world is None:
@@ -203,19 +230,21 @@
 
        Returns:
            New SpiralDataLoader instance configured to resume from the checkpoint.
+
+        Save checkpoint during training:
+        ```python
+        loader = scan.to_distributed_data_loader(batch_size=32, seed=42)
+        checkpoint = loader.state_dict()
+        ```
+
+        Resume later - uses same shards from checkpoint:
+        ```python
+        resumed_loader = scan.resume_data_loader(
+            checkpoint,
+            batch_size=32,
+            transform_fn=my_transform,
+        )
        """
-        # Example usage:
-        #
-        # Save checkpoint during training:
-        # loader = scan.to_distributed_data_loader(batch_size=32, seed=42)
-        # checkpoint = loader.state_dict()
-        #
-        # Resume later - uses same shards from checkpoint:
-        # resumed_loader = scan.resume_data_loader(
-        #     checkpoint,
-        #     batch_size=32,
-        #     transform_fn=my_transform,
-        # )
        from spiral.dataloader import SpiralDataLoader
 
        return SpiralDataLoader.from_state_dict(self, state, **kwargs)
@@ -283,3 +312,17 @@ class Scan:
        from spiral.debug.metrics import display_metrics
 
        display_metrics(self.metrics)
+
+
+# NOTE(marko): This function must be picklable!
+def _read_shard_task(shard: Shard, *, settings_dict, state_json) -> "pd.DataFrame":
+    from spiral import Spiral
+    from spiral.core.table import ScanState
+    from spiral.settings import Settings
+
+    settings: Settings = Settings.model_validate(settings_dict)
+    sp = Spiral(config=settings)
+    state = ScanState.from_json(state_json)
+    task_scan = Scan(sp, sp.core.load_scan(state))
+
+    return task_scan.to_pandas(key_range=shard.key_range)
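The `key_range` keyword added to `to_record_batches`, `to_table`, and `to_pandas` is what the new Dask path builds on: each `_read_shard_task` rebuilds a client from the serialized settings and reads only its shard's key range. Below is a minimal sketch of the same shard-wise read on the driver side, assuming an already-configured `Spiral` client `sp` and an existing table handle `table` (neither is defined in this diff).

```python
# Hypothetical handles: `sp` (Spiral client) and `table` are assumed to exist.
scan = sp.scan(table)

# Read the scan shard by shard, restricting each read to that shard's key range,
# mirroring what _read_shard_task does inside each Dask partition.
for shard in scan.shards():
    df = scan.to_pandas(key_range=shard.key_range)
    print(shard.key_range, len(df))
```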
spiral/settings.py CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Annotated
 
 import typer
-from pydantic import Field, ValidatorFunctionWrapHandler, WrapValidator
+from pydantic import Field, PlainSerializer, ValidatorFunctionWrapHandler, WrapValidator
 from pydantic_settings import (
     BaseSettings,
     InitSettingsSource,
@@ -28,13 +28,16 @@ PACKAGE_NAME = "pyspiral"
 
 
 def validate_token(v, handler: ValidatorFunctionWrapHandler):
-    if isinstance(v, str):
-        return Token(v)
-    else:
-        raise ValueError("Token value must be a string")
+    if not isinstance(v, str):
+        raise ValueError("Token value (SPIRAL__SPIRALDB__TOKEN) must be a string")
+    return Token(v)
 
 
-TokenType = Annotated[Token, WrapValidator(validate_token)]
+TokenType = Annotated[
+    Token,
+    WrapValidator(validate_token),
+    PlainSerializer(lambda token: token.expose_secret(), return_type=str),
+]
 
 
 class SpiralDBSettings(BaseSettings):
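The new `PlainSerializer` on `TokenType` makes `Settings.model_dump()` emit the token as a plain string, which is what `Scan.to_dask()` relies on when it ships `self.spiral.config.model_dump()` to workers and rehydrates it with `Settings.model_validate(...)`. A self-contained sketch of the same Annotated pattern, using a stand-in `Secret` class since the real `Token` type lives in the compiled core:

```python
from typing import Annotated

from pydantic import BaseModel, ConfigDict, PlainSerializer, ValidatorFunctionWrapHandler, WrapValidator


class Secret:
    """Stand-in for pyspiral's Token type (illustrative only)."""

    def __init__(self, value: str):
        self._value = value

    def expose_secret(self) -> str:
        return self._value


def validate_secret(v, handler: ValidatorFunctionWrapHandler):
    if not isinstance(v, str):
        raise ValueError("value must be a string")
    return Secret(v)


# Same shape as the new TokenType: validate str -> Secret, serialize Secret -> str.
SecretType = Annotated[
    Secret,
    WrapValidator(validate_secret),
    PlainSerializer(lambda s: s.expose_secret(), return_type=str),
]


class Config(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    token: SecretType


cfg = Config(token="abc123")
dumped = cfg.model_dump()  # {"token": "abc123"} -- round-trippable again
assert Config.model_validate(dumped).token.expose_secret() == "abc123"
```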
spiral/streaming_/stream.py CHANGED
@@ -101,7 +101,7 @@ class SpiralStream:
            return 0
 
        # Prepare the shard, writing it to disk.
-        self._sp._ops().prepare_shard(
+        self._sp.internal.prepare_shard(
            shard_path, self._scan.core, shard.shard, row_block_size=self._shard_row_block_size
        )
 
spiral/table.py CHANGED
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any
 
 from spiral.core.table import Table as CoreTable
 from spiral.core.table.spec import Schema
+from spiral.enrichment import Enrichment
 from spiral.expressions.base import Expr, ExprLike
 from spiral.settings import settings
 from spiral.snapshot import Snapshot
@@ -12,12 +13,11 @@ if TYPE_CHECKING:
     import duckdb
     import polars as pl
     import pyarrow.dataset as ds
-    import streaming
-    import torch.utils.data as torchdata # noqa
 
     from spiral.client import Spiral
     from spiral.dataloader import SpiralDataLoader
     from spiral.key_space_index import KeySpaceIndex
+    from spiral.streaming_ import SpiralStream
 
 
 class Table(Expr):
@@ -50,6 +50,14 @@ class Table(Expr):
         """Returns the fully qualified identifier of the table."""
         return self._identifier or self.table_id
 
+    @property
+    def project(self) -> str | None:
+        """Returns the project of the table."""
+        if self._identifier is None:
+            return None
+        project, _, _ = self._identifier.split(".")
+        return project
+
     @property
     def dataset(self) -> str | None:
         """Returns the dataset of the table."""
@@ -110,6 +118,30 @@ class Table(Expr):
            partition_size_bytes=partition_size_bytes,
        )
 
+    def enrich(
+        self,
+        *projections: ExprLike,
+        where: ExprLike | None = None,
+    ) -> Enrichment:
+        """Returns an Enrichment object that, when applied, produces new columns.
+
+        Enrichment can be applied in different ways, e.g. distributed.
+
+        :param projections: Projection expressions deriving new columns to write back.
+            Expressions can be over multiple Spiral tables, but all tables including
+            this one must share the same key schema.
+        :param where: Optional filter expression to apply when reading the input tables.
+        """
+        from spiral import expressions as se
+
+        # Combine table with all projections into a single struct.
+        # The table is included to ensure key columns are present in the scan output.
+        projection = se.merge(self, *projections)
+        if where is not None:
+            where = se.lift(where)
+
+        return Enrichment(self, projection, where)
+
    def drop_columns(self, column_paths: list[str]) -> None:
        """
        Drops the specified columns from the table.
@@ -136,7 +168,7 @@ class Table(Expr):
        it is important that the primary key columns are unique within the transaction.
        The behavior is undefined if this is not the case.
        """
-        return Transaction(self.spiral._core.transaction(self.core, settings().file_format, retries=retries))
+        return Transaction(self.spiral.core.transaction(self.core, settings().file_format, retries=retries))
 
    def to_dataset(self) -> "ds.Dataset":
        """Returns a PyArrow Dataset representing the table."""
@@ -175,7 +207,7 @@ class Table(Expr):
        if index.asof == 0:
            raise ValueError("Index have to be synced before it can be used.")
 
-        shards = self.spiral._core._ops().compute_shards(index=index.core)
+        shards = self.spiral.internal.compute_shards(index=index.core)
 
        return self.spiral.scan(
            projection if projection is not None else index.projection,
@@ -208,7 +240,7 @@ class Table(Expr):
        if index.asof == 0:
            raise ValueError("Index have to be synced before it can be used.")
 
-        shards = self.spiral._core._ops().compute_shards(index=index.core)
+        shards = self.spiral.core.internal.compute_shards(index=index.core)
 
        return self.spiral.scan(
            projection if projection is not None else index.projection,
@@ -240,7 +272,7 @@ class Table(Expr):
        if index.asof == 0:
            raise ValueError("Index have to be synced before it can be used.")
 
-        shards = self.spiral._core._ops().compute_shards(index=index.core)
+        shards = self.spiral.core.internal.compute_shards(index=index.core)
 
        return self.spiral.scan(
            index.projection,
@@ -255,7 +287,7 @@ class Table(Expr):
        projection: Expr | None = None,
        cache_dir: str | None = None,
        shard_row_block_size: int | None = None,
-    ) -> "streaming.Stream":
+    ) -> "SpiralStream":
        """Returns a stream to be used with MosaicML's StreamingDataset.
 
        Requires `streaming` package to be installed.
@@ -282,7 +314,7 @@ class Table(Expr):
            where=index.filter,
            asof=index.asof,
        )
-        shards = self.spiral._core._ops().compute_shards(index=index.core)
+        shards = self.spiral.internal.compute_shards(index=index.core)
 
        return SpiralStream(
            sp=self.spiral,
@@ -290,4 +322,4 @@
            shards=shards,
            cache_dir=cache_dir,
            shard_row_block_size=shard_row_block_size,
-        ) # type: ignore[return-value]
+        )
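`enrich` only builds the plan: the table is merged with the given projections (so key columns survive the scan) and the result is wrapped in an `Enrichment` to be applied later, e.g. distributed across workers. A minimal construction sketch, assuming `table` and a projection expression `features` over a table with the same key schema already exist; the apply step lives in the new spiral/enrichment.py and is not shown in this section.

```python
# Hypothetical handles: `table` and `features` are assumed to exist
# and to share a key schema; neither is defined in this diff.
enrichment = table.enrich(features)

# Also new in this release: the project component of the fully qualified
# identifier ("<project>.<dataset>.<table>") is exposed directly.
print(table.project, table.dataset)
```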
spiral/transaction.py CHANGED
@@ -1,5 +1,8 @@
+from spiral.core.table import KeyRange
 from spiral.core.table import Transaction as CoreTransaction
+from spiral.core.table.spec import Operation
 from spiral.expressions.base import ExprLike
+from spiral.scan import Scan
 
 
 class Transaction:
@@ -17,6 +20,10 @@ class Transaction:
         """The status of the transaction."""
         return self._core.status
 
+    def is_empty(self) -> bool:
+        """Check if the transaction has no operations."""
+        return self._core.is_empty()
+
     def __enter__(self):
         return self
 
@@ -39,6 +46,27 @@ class Transaction:
 
        self._core.write(record_batches, partition_size_bytes=partition_size_bytes)
 
+    def writeback(
+        self,
+        scan: Scan,
+        *,
+        key_range: KeyRange | None = None,
+        partition_size_bytes: int | None = None,
+        batch_readahead: int | None = None,
+    ):
+        """Write back the results of a scan to the table.
+
+        :param scan: The scan to write back.
+            The scan does NOT need to be over the same table as transaction,
+            but it does need to have the same key schema.
+        :param key_range: Optional key range to limit the writeback to.
+        :param partition_size_bytes: The maximum partition size in bytes.
+        :param batch_readahead: The number of batches to read ahead when evaluating the scan.
+        """
+        self._core.writeback(
+            scan.core, key_range=key_range, partition_size_bytes=partition_size_bytes, batch_readahead=batch_readahead
+        )
+
    def drop_columns(self, column_paths: list[str]):
        """
        Drops the specified columns from the table.
@@ -49,6 +77,20 @@ class Transaction:
        """
        self._core.drop_columns(column_paths)
 
+    def take(self) -> list[Operation]:
+        """Take the operations from the transaction
+
+        Transaction can no longer be committed or aborted after calling this method.
+        ."""
+        return self._core.take()
+
+    def include(self, ops: list[Operation]):
+        """Include the given operations in the transaction.
+
+        Checks for conflicts between the included operations and any existing operations.
+        """
+        self._core.include(ops)
+
    def commit(self):
        """Commit the transaction."""
        self._core.commit()
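Together with the `key_range` parameter on `Scan`, `writeback` plus the new `take`/`include` hooks sketch a distributed write flow: workers evaluate a scan over their own key ranges, hand their operations back, and a single transaction commits them. A rough sketch, assuming existing `table`, `scan`, and `shard` handles with matching key schemas and a `table.transaction()` entry point as implied by table.py above:

```python
# Hypothetical handles: `table`, `scan`, and `shard` are assumed to exist.
worker_txn = table.transaction()
worker_txn.writeback(scan, key_range=shard.key_range)  # restrict to this worker's range

# take() detaches the operations (the worker transaction can no longer commit);
# include() merges them, with conflict checks, into the committing transaction.
ops = worker_txn.take()

final_txn = table.transaction()
final_txn.include(ops)
if not final_txn.is_empty():
    final_txn.commit()
```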
spiral/expressions/io.py DELETED
@@ -1,100 +0,0 @@
-import tarfile
-from io import BytesIO
-
-import pyarrow as pa
-
-from spiral.expressions.base import Expr, ExprLike
-from spiral.expressions.struct import pack
-from spiral.expressions.udf import UDF
-
-
-def read_file(path: ExprLike) -> Expr:
-    """
-    Read file path(s) from disk into a struct with a single field "bytes" containing the file contents.
-
-    Args:
-        path: Expression evaluating to an array of strings representing local disk paths.
-    """
-    to_pack = {"path": path}
-    return FileRead()(pack(to_pack))
-
-
-class FileRead(UDF):
-    RES_DTYPE: pa.DataType = pa.struct(
-        [
-            pa.field("bytes", pa.large_binary()),
-        ]
-    )
-
-    def __init__(self):
-        super().__init__("file.read")
-
-    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
-        return FileRead.RES_DTYPE
-
-    def invoke(self, *input_args: pa.Array) -> pa.Array:
-        if len(input_args) != 1:
-            raise ValueError(f"Expected 1 argument, got {len(input_args)}")
-        arg = input_args[0]
-
-        res = []
-        for req in arg:
-            with open(req["path"].as_py(), "rb") as f:
-                res.append({"bytes": f.read()})
-
-        return pa.array(res, type=FileRead.RES_DTYPE)
-
-
-def read_tar(path: ExprLike = None, bytes_: ExprLike = None) -> "Expr":
-    # Untar a vector of paths / byte arrays representing tarballs.
-    if path is None and bytes_ is None:
-        raise ValueError("Expected either path or bytes_ to be provided")
-    to_pack = {}
-    if path is not None:
-        to_pack["path"] = path
-    if bytes_ is not None:
-        to_pack["bytes"] = bytes_
-    return TarRead()(pack(to_pack))
-
-
-class TarRead(UDF):
-    RES_DTYPE = pa.list_(
-        pa.struct(
-            [
-                pa.field("name", pa.string()),
-                pa.field("bytes", pa.large_binary()),
-            ]
-        )
-    )
-
-    def __init__(self):
-        super().__init__("tar.read")
-
-    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
-        return TarRead.RES_DTYPE
-
-    def invoke(self, *input_args: pa.Array) -> pa.Array:
-        if len(input_args) != 1:
-            raise ValueError(f"Expected 1 argument, got {len(input_args)}")
-        arg = input_args[0]
-
-        res = []
-        for req in arg:
-            if "path" in req:
-                kwargs = {"name": req["path"].as_py()}
-            elif "bytes" in req:
-                kwargs = {"fileobj": BytesIO(req["bytes"].as_py())}
-            else:
-                raise ValueError("Expected path or bytes_ to be provided")
-
-            files = []
-            with tarfile.open(**kwargs) as f:
-                for m in f.getmembers():
-                    m: tarfile.TarInfo
-                    if m.type == tarfile.DIRTYPE:
-                        continue
-                    # TODO(ngates): skip other types too maybe? Why are we even skipping directories?
-                    files.append({"name": m.name, "bytes": f.extractfile(m).read()})
-            res.append(files)
-
-        return pa.array(res, type=TarRead.RES_DTYPE)
spiral/expressions/mp4.py DELETED
@@ -1,62 +0,0 @@
-import pyarrow as pa
-
-from spiral.expressions.base import Expr, ExprLike
-
-_MP4_RES_DTYPE: pa.DataType = pa.struct(
-    [
-        pa.field("pixels", pa.large_binary()),
-        pa.field("height", pa.uint32()),
-        pa.field("width", pa.uint32()),
-        pa.field("frames", pa.uint32()),
-    ]
-)
-
-
-# TODO(marko): Support optional range and crop.
-# IMPORTANT: Frames is currently broken and defaults to full.
-def read(expr: ExprLike | str, frames: ExprLike | str, crop: ExprLike | str):
-    """
-    Read referenced cell in a `MP4` format. Requires `ffmpeg`.
-
-    Args:
-        expr: The referenced `Mp4` bytes.
-            A str is assumed to be the `se.aux` expression.
-        frames: The range of frames to read. Each element must be a list of two uint32,
-            frame start and frame end, or null / empty list to read all frames.
-            A str is assumed to be the `se.aux` expression.
-        crop: The crop of the frames to read. Each element must be a list of four uint32,
-            x, y, width, height or null / empty list to read full frames.
-            A str is assumed to be the `se.aux` expression.
-
-    Returns:
-        An array where each element is a decoded cropped video with fields:
-            pixels: RGB8 bytes, frames * width * height * 3.
-            width: Width of the image with type `pa.uint32()`.
-            height: Height of the image with type `pa.uint32()`.
-            frames: Number of frames with type `pa.uint32()`.
-    """
-    from spiral import _lib
-    from spiral.expressions import aux, lift
-
-    if isinstance(expr, str):
-        expr = aux(
-            expr,
-            pa.struct([("__ref__", pa.struct([("id", pa.string()), ("begin", pa.uint64()), ("end", pa.uint64())]))]),
-        )
-    if isinstance(frames, str):
-        frames = aux(frames, pa.list_(pa.uint32()))
-    if isinstance(crop, str):
-        crop = aux(crop, pa.list_(pa.uint32()))
-
-    expr = lift(expr)
-    frames = lift(frames)
-    crop = lift(crop)
-
-    return Expr(
-        _lib.expr.video.read(
-            expr.__expr__,
-            frames.__expr__,
-            crop.__expr__,
-            format="mp4",
-        )
-    )
spiral/expressions/png.py DELETED
@@ -1,18 +0,0 @@
1
- from spiral.expressions.base import Expr, ExprLike
2
-
3
-
4
- def encode(expr: ExprLike) -> Expr:
5
- """Encode the given expression as a PNG image.
6
-
7
- Args:
8
- expr: The expression to encode.
9
- Expects a struct with `pixels`, `width`, `height`, `channels`, `channel_bit_depth` fields.
10
-
11
- Returns:
12
- The encoded PNG images.
13
- """
14
- from spiral import _lib
15
- from spiral.expressions import lift
16
-
17
- expr = lift(expr)
18
- return Expr(_lib.expr.img.encode(expr.__expr__, format="png"))
spiral/expressions/qoi.py DELETED
@@ -1,18 +0,0 @@
1
- from spiral.expressions.base import Expr, ExprLike
2
-
3
-
4
- def encode(expr: ExprLike) -> Expr:
5
- """Encode the given expression as a QOI image.
6
-
7
- Args:
8
- expr: The expression to encode.
9
- Expects a struct with `pixels`, `width`, `height`, `channels`, `channel_bit_depth` fields.
10
-
11
- Returns:
12
- The encoded QOI images.
13
- """
14
- from spiral import _lib
15
- from spiral.expressions import lift
16
-
17
- expr = lift(expr)
18
- return Expr(_lib.expr.img.encode(expr.__expr__, format="qoi"))