pyspiral-0.2.5-cp310-abi3-macosx_11_0_arm64.whl → pyspiral-0.3.1-cp310-abi3-macosx_11_0_arm64.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
spiral/dataset.py CHANGED
@@ -1,22 +1,23 @@
-from typing import TYPE_CHECKING, Any
+from typing import Any

 import pyarrow as pa
 import pyarrow.compute as pc
-
-if TYPE_CHECKING:
-    import pyarrow.dataset
+import pyarrow.dataset as ds

 from spiral import Scan, Table


-class TableDataset(pa.dataset.Dataset):
+class TableDataset(ds.Dataset):
     def __init__(self, table: Table):
         self._table = table
-        self._schema: pa.Schema = table.scan().schema.to_arrow()
+        # Once the table is converted to a dataset, use a pinned snapshot.
+        self._asof = table.last_modified_at
+        self._schema: pa.Schema = table._table.get_schema(asof=self._asof).to_arrow()

         # We don't actually initialize a Dataset, we just implement enough of the API
         # to fool both DuckDB and Polars.
         # super().__init__()
+        self._last_scan = None

     @property
     def schema(self) -> pa.Schema:
@@ -28,7 +29,7 @@ class TableDataset(pa.dataset.Dataset):
         batch_size: int | None = None,
         batch_readahead: int | None = None,
         fragment_readahead: int | None = None,
-        fragment_scan_options: pa.dataset.FragmentScanOptions | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ):
@@ -58,11 +59,11 @@ class TableDataset(pa.dataset.Dataset):
         batch_size: int | None = None,
         batch_readahead: int | None = None,
         fragment_readahead: int | None = None,
-        fragment_scan_options: pa.dataset.FragmentScanOptions | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ):
-        self.scanner(
+        return self.scanner(
             columns,
             filter,
             batch_size,
@@ -99,7 +100,7 @@ class TableDataset(pa.dataset.Dataset):
         batch_size: int | None = None,
         batch_readahead: int | None = None,
         fragment_readahead: int | None = None,
-        fragment_scan_options: pa.dataset.FragmentScanOptions | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ) -> "TableScanner":
@@ -111,11 +112,18 @@ class TableDataset(pa.dataset.Dataset):
                 filter.to_substrait(self._schema, allow_arrow_extensions=True),
             )

-        scan = self._table.scan(
-            {c: self._table[c] for c in columns} if columns else self._table,
-            where=filter,
-            exclude_keys=True,
+        scan = (
+            self._table.scan(
+                {c: self._table[c] for c in columns},
+                where=filter,
+                exclude_keys=True,
+                asof=self._asof,
+            )
+            if columns
+            else self._table.scan(where=filter, asof=self._asof)
         )
+        self._last_scan = scan
+
         return TableScanner(scan)

     def sort_by(self, sorting, **kwargs):
@@ -129,7 +137,7 @@ class TableDataset(pa.dataset.Dataset):
         batch_size: int | None = None,
         batch_readahead: int | None = None,
         fragment_readahead: int | None = None,
-        fragment_scan_options: pa.dataset.FragmentScanOptions | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ):
@@ -151,7 +159,7 @@ class TableDataset(pa.dataset.Dataset):
         batch_size: int | None = None,
         batch_readahead: int | None = None,
         fragment_readahead: int | None = None,
-        fragment_scan_options: pa.dataset.FragmentScanOptions | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ):
@@ -173,7 +181,7 @@ class TableDataset(pa.dataset.Dataset):
         batch_size: int | None = None,
         batch_readahead: int | None = None,
         fragment_readahead: int | None = None,
-        fragment_scan_options: pa.dataset.FragmentScanOptions | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ):
@@ -189,12 +197,17 @@ class TableDataset(pa.dataset.Dataset):
         ).to_table()


-class TableScanner(pa.dataset.Scanner):
+class TableScanner(ds.Scanner):
     """A PyArrow Dataset Scanner that reads from a Spiral Table."""

-    def __init__(self, scan: Scan):
+    def __init__(
+        self,
+        scan: Scan,
+        key_table: pa.Table | pa.RecordBatchReader | None = None,
+    ):
         self._scan = scan
         self._schema = scan.schema
+        self.key_table = key_table

         # We don't actually initialize a Dataset, we just implement enough of the API
         # to fool both DuckDB and Polars.
@@ -233,7 +246,195 @@ class TableScanner(pa.dataset.Scanner):
         return self.to_reader()

     def to_reader(self):
-        return self._scan.to_record_batches()
+        return self._scan.to_record_batches(key_table=self.key_table)

     def to_table(self):
         return self.to_reader().read_all()
+
+
+class ScanDataset(ds.Dataset):
+    def __init__(
+        self,
+        scan: Scan,
+        key_table: pa.Table | pa.RecordBatchReader | None = None,
+    ):
+        self._scan = scan
+        self._schema: pa.Schema = scan.schema.to_arrow()
+        self._key_table = key_table
+
+        # We don't actually initialize a Dataset, we just implement enough of the API
+        # to fool both DuckDB and Polars.
+        # super().__init__()
+
+    @property
+    def schema(self) -> pa.Schema:
+        return self._schema
+
+    def count_rows(
+        self,
+        filter: pc.Expression | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+        fragment_readahead: int | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        memory_pool: pa.MemoryPool = None,
+    ):
+        return self.scanner(
+            None,
+            filter,
+            batch_size,
+            batch_readahead,
+            fragment_readahead,
+            fragment_scan_options,
+            use_threads,
+            memory_pool,
+        ).count_rows()
+
+    def filter(self, expression: pc.Expression) -> "ScanDataset":
+        raise NotImplementedError("filter not implemented")
+
+    def get_fragments(self, filter: pc.Expression | None = None):
+        """TODO(ngates): perhaps we should return ranges as per our split API?"""
+        raise NotImplementedError("get_fragments not implemented")
+
+    def head(
+        self,
+        num_rows: int,
+        columns: list[str] | None = None,
+        filter: pc.Expression | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+        fragment_readahead: int | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        memory_pool: pa.MemoryPool = None,
+    ):
+        return self.scanner(
+            columns,
+            filter,
+            batch_size,
+            batch_readahead,
+            fragment_readahead,
+            fragment_scan_options,
+            use_threads,
+            memory_pool,
+        ).head(num_rows)
+
+    def join(
+        self,
+        right_dataset,
+        keys,
+        right_keys=None,
+        join_type=None,
+        left_suffix=None,
+        right_suffix=None,
+        coalesce_keys=True,
+        use_threads=True,
+    ):
+        raise NotImplementedError("join not implemented")
+
+    def join_asof(self, right_dataset, on, by, tolerance, right_on=None, right_by=None):
+        raise NotImplementedError("join_asof not implemented")
+
+    def replace_schema(self, schema: pa.Schema) -> "ScanDataset":
+        raise NotImplementedError("replace_schema not implemented")
+
+    def scanner(
+        self,
+        columns: list[str] | None = None,
+        filter: pc.Expression | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+        fragment_readahead: int | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        memory_pool: pa.MemoryPool = None,
+    ) -> "TableScanner":
+        if columns is not None:
+            columns = set(columns)
+            names = set(self.schema.names)
+            if len(columns - names) != 0 or len(names - columns) != 0:
+                raise NotImplementedError("columns", columns, self.schema)
+        if filter is not None:
+            raise NotImplementedError("filter")
+        if batch_size is not None:
+            raise NotImplementedError("batch_size")
+        if batch_readahead is not None:
+            raise NotImplementedError("batch_readahead")
+        if fragment_readahead is not None:
+            raise NotImplementedError("fragment_readahead")
+        if fragment_scan_options is not None:
+            raise NotImplementedError("fragment_scan_options")
+
+        return TableScanner(self._scan, key_table=self._key_table)
+
+    def sort_by(self, sorting, **kwargs):
+        raise NotImplementedError("sort_by not implemented")
+
+    def take(
+        self,
+        indices: pa.Array | Any,
+        columns: list[str] | None = None,
+        filter: pc.Expression | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+        fragment_readahead: int | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        memory_pool: pa.MemoryPool = None,
+    ):
+        return self.scanner(
+            columns,
+            filter,
+            batch_size,
+            batch_readahead,
+            fragment_readahead,
+            fragment_scan_options,
+            use_threads,
+            memory_pool,
+        ).take(indices)
+
+    def to_batches(
+        self,
+        columns: list[str] | None = None,
+        filter: pc.Expression | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+        fragment_readahead: int | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        memory_pool: pa.MemoryPool = None,
+    ):
+        return self.scanner(
+            columns,
+            filter,
+            batch_size,
+            batch_readahead,
+            fragment_readahead,
+            fragment_scan_options,
+            use_threads,
+            memory_pool,
+        ).to_batches()
+
+    def to_table(
+        self,
+        columns=None,
+        filter: pc.Expression | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+        fragment_readahead: int | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        memory_pool: pa.MemoryPool = None,
+    ):
+        return self.scanner(
+            columns,
+            filter,
+            batch_size,
+            batch_readahead,
+            fragment_readahead,
+            fragment_scan_options,
+            use_threads,
+            memory_pool,
+        ).to_table()
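
For orientation, here is a minimal, hedged sketch of how these duck-typed datasets might be consumed. The client setup below (`import spiral as sp`, `sp.Spiral()`, the table name) is hypothetical and not part of this diff; what the diff does establish is that TableDataset and ScanDataset implement just enough of the PyArrow Dataset API that DuckDB and Polars accept them as-is.

    # A hedged sketch, not part of the package: the client calls are assumptions.
    import duckdb
    import polars as pl
    import spiral as sp  # hypothetical client entry point
    from spiral.dataset import ScanDataset

    table = sp.Spiral().table("my-project.my-table")  # hypothetical handle
    dataset = ScanDataset(table.scan())

    # Both engines treat the object as a regular PyArrow dataset.
    duckdb.sql("SELECT count(*) FROM dataset").show()
    print(pl.scan_pyarrow_dataset(dataset).head(5).collect())
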
spiral/expressions/__init__.py CHANGED
@@ -10,6 +10,9 @@ from spiral import _lib, arrow
 from . import http as http
 from . import io as io
 from . import list_ as list
+from . import mp4 as mp4
+from . import png as png
+from . import qoi as qoi
 from . import refs as refs
 from . import str_ as str
 from . import struct as struct
@@ -42,6 +45,7 @@ __all__ = [
     "not_",
     "or_",
     "pack",
+    "keyed",
     "ref",
     "refs",
     "scalar",
@@ -52,6 +56,9 @@ __all__ = [
     "tiff",
     "var",
     "xor",
+    "png",
+    "qoi",
+    "mp4",
 ]

 # Inline some of the struct expressions since they're so common
@@ -88,6 +95,10 @@ def lift(expr: ExprLike) -> Expr:

     # If the value is struct-like, we un-nest any dot-separated field names
     if isinstance(expr, pa.StructArray | pa.StructScalar):
+        if isinstance(expr, pa.StructArray) and expr.null_count != 0:
+            raise ValueError("lift: cannot lift a struct array with nulls.")
+        if isinstance(expr, pa.StructScalar) and not expr.is_valid:
+            raise ValueError("lift: cannot lift a struct scalar with nulls.")
         return lift(arrow.nest_structs(expr))

     if isinstance(expr, pa.Array):
@@ -97,9 +108,13 @@
     return scalar(expr)


-def var(name: builtins.str) -> Expr:
-    """Create a variable expression."""
-    return Expr(_lib.spql.expr.var(name))
+def key(name: builtins.str) -> Expr:
+    """Create a variable expression referencing a key column.
+
+    Args:
+        name: variable name
+    """
+    return Expr(_lib.spql.expr.keyed(name))


 def keyed(name: builtins.str, dtype: pa.DataType) -> Expr:
@@ -112,7 +127,7 @@ def keyed(name: builtins.str, dtype: pa.DataType) -> Expr:
         name: variable name
         dtype: must match dtype of the column in the key table.
     """
-    return Expr(_lib.spql.expr.keyed(f"#{name}", dtype))
+    return Expr(_lib.spql.expr.keyed(name, dtype))


 def scalar(value: Any) -> Expr:
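
Two behavioral changes fall out of these hunks: `var` is replaced by `key`, and `keyed` no longer mangles the variable name with a `#` prefix before handing it to `_lib.spql.expr.keyed`. A hedged sketch of how a keyed expression lines up with a key table (the `frames` column name is illustrative, not from this diff):

    # Illustrative only: the "frames" column name is an assumption.
    import pyarrow as pa
    import spiral.expressions as se

    # After this change the name is passed through verbatim; previously it
    # would have been registered as "#frames".
    frames = se.keyed("frames", pa.list_(pa.uint32()))

    # The dtype must match the corresponding column of the key table supplied
    # at scan time (e.g. via TableScanner's new key_table argument).
    key_table = pa.table({"frames": pa.array([[0, 10]], type=pa.list_(pa.uint32()))})
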
spiral/expressions/mp4.py ADDED
@@ -0,0 +1,69 @@
+from typing import TYPE_CHECKING
+
+import pyarrow as pa
+
+from spiral.expressions.base import Expr, ExprLike
+
+if TYPE_CHECKING:
+    from spiral import Table
+
+_MP4_RES_DTYPE: pa.DataType = pa.struct(
+    [
+        pa.field("pixels", pa.large_binary()),
+        pa.field("height", pa.uint32()),
+        pa.field("width", pa.uint32()),
+        pa.field("frames", pa.uint32()),
+    ]
+)
+
+
+# TODO(marko): Support optional range and crop.
+# IMPORTANT: Frames is currently broken and defaults to full.
+def read(expr: ExprLike | str, frames: ExprLike | str, crop: ExprLike | str, *, table: "Table" = None):
+    """
+    Read the referenced cells as `MP4` video. Requires `ffmpeg`.
+
+    Args:
+        expr: The referenced `MP4` bytes.
+            A str is assumed to be the `se.keyed` expression.
+        frames: The range of frames to read. Each element must be a list of two uint32,
+            frame start and frame end, or null / empty list to read all frames.
+            A str is assumed to be the `se.keyed` expression.
+        crop: The crop of the frames to read. Each element must be a list of four uint32,
+            x, y, width, height, or null / empty list to read full frames.
+            A str is assumed to be the `se.keyed` expression.
+        table (optional): The table to de-reference from, if not available in the input expression.
+
+    Returns:
+        An array where each element is a decoded cropped video with fields:
+            pixels: RGB8 bytes, frames * width * height * 3.
+            width: Width of the image with type `pa.uint32()`.
+            height: Height of the image with type `pa.uint32()`.
+            frames: Number of frames with type `pa.uint32()`.
+    """
+    from spiral import _lib
+    from spiral.expressions import keyed, lift
+
+    if isinstance(expr, str):
+        expr = keyed(
+            expr,
+            pa.struct([("__ref__", pa.struct([("id", pa.string()), ("begin", pa.uint64()), ("end", pa.uint64())]))]),
+        )
+    if isinstance(frames, str):
+        frames = keyed(frames, pa.list_(pa.uint32()))
+    if isinstance(crop, str):
+        crop = keyed(crop, pa.list_(pa.uint32()))
+
+    expr = lift(expr)
+    frames = lift(frames)
+    crop = lift(crop)
+
+    return Expr(
+        _lib.spql.expr.video.read(
+            expr.__expr__,
+            frames.__expr__,
+            crop.__expr__,
+            format="mp4",
+            table=table._table if table is not None else None,
+        )
+    )
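
A hedged usage sketch for this new module: string arguments are promoted to `se.keyed` expressions, so the frame range and crop can be driven per row by a key table. The column names below are illustrative, and note the TODO above: the frames range is currently broken and defaults to the full video.

    # Illustrative column names; assumes a key table carrying "frames"/"crop".
    import spiral.expressions as se

    videos = se.mp4.read("video", "frames", "crop")
    # Each result element is a struct: pixels (RGB8 bytes), width, height, frames.
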
spiral/expressions/png.py ADDED
@@ -0,0 +1,18 @@
+from spiral.expressions.base import Expr, ExprLike
+
+
+def encode(expr: ExprLike) -> Expr:
+    """Encode the given expression as a PNG image.
+
+    Args:
+        expr: The expression to encode.
+            Expects a struct with `pixels`, `width`, `height`, `channels`, `channel_bit_depth` fields.
+
+    Returns:
+        The encoded PNG images.
+    """
+    from spiral import _lib
+    from spiral.expressions import lift
+
+    expr = lift(expr)
+    return Expr(_lib.spql.expr.img.encode(expr.__expr__, format="png"))
spiral/expressions/qoi.py ADDED
@@ -0,0 +1,18 @@
+from spiral.expressions.base import Expr, ExprLike
+
+
+def encode(expr: ExprLike) -> Expr:
+    """Encode the given expression as a QOI image.
+
+    Args:
+        expr: The expression to encode.
+            Expects a struct with `pixels`, `width`, `height`, `channels`, `channel_bit_depth` fields.
+
+    Returns:
+        The encoded QOI images.
+    """
+    from spiral import _lib
+    from spiral.expressions import lift
+
+    expr = lift(expr)
+    return Expr(_lib.spql.expr.img.encode(expr.__expr__, format="qoi"))
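
`png.encode` and `qoi.encode` share the same shape, both lowering to `_lib.spql.expr.img.encode` with a different `format`. A hedged sketch of the input their docstrings describe (the field values are illustrative, and this assumes `lift` accepts a null-free struct array, as the `lift` changes above suggest):

    # Illustrative 2x2 RGB image; 8-bit channels are an assumption.
    import pyarrow as pa
    import spiral.expressions as se

    images = pa.array(
        [
            {
                "pixels": b"\x00" * (2 * 2 * 3),
                "width": 2,
                "height": 2,
                "channels": 3,
                "channel_bit_depth": 8,
            }
        ]
    )
    png_bytes = se.png.encode(images)
    qoi_bytes = se.qoi.encode(images)
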
spiral/expressions/refs.py CHANGED
@@ -1,5 +1,7 @@
 from typing import TYPE_CHECKING

+import pyarrow as pa
+
 from spiral.expressions.base import Expr, ExprLike

 if TYPE_CHECKING:
@@ -25,20 +27,38 @@ def ref(expr: ExprLike, field: str | None = None) -> Expr:
     return Expr(_lib.spql.expr.ref(expr.__expr__, field))


-def deref(expr: ExprLike, field: str | None = None, table: "Table" = None) -> Expr:
+def deref(expr: ExprLike | str, field: str | None = None, *, table: "Table" = None) -> Expr:
     """De-reference referenced values.

     See `ref` for more information on Spiral's reference values. This expression is used to de-reference referenced
     columns back into their original form, e.g. binary.

     Args:
-        expr: The expression to de-reference.
+        expr: The expression to de-reference. A str is assumed to be the `se.keyed` expression.
         field: If the expr evaluates into a struct, the field name of that struct that should be de-referenced.
             If `None`, the expr must evaluate into a reference type.
         table (optional): The table to de-reference from, if not available in the input expression.
     """
     from spiral import _lib
+    from spiral.expressions import keyed, lift
+
+    if isinstance(expr, str):
+        expr = keyed(
+            expr,
+            pa.struct([("__ref__", pa.struct([("id", pa.string()), ("begin", pa.uint64()), ("end", pa.uint64())]))]),
+        )
+
+    expr = lift(expr)
+    return Expr(_lib.spql.expr.deref(expr.__expr__, field=field, table=table._table if table is not None else None))
+
+
+def nbytes(expr: ExprLike) -> Expr:
+    """Return the number of bytes in a reference.
+
+    Args:
+        expr: The ref expression to get the number of bytes from.
+    """
     from spiral.expressions import lift

     expr = lift(expr)
-    return Expr(_lib.spql.expr.deref(expr.__expr__, field, table._table if table is not None else None))
+    return expr["__ref__"]["end"] - expr["__ref__"]["begin"]
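
Finally, a hedged sketch tying the refs changes together: `deref` now promotes a bare string to a keyed `__ref__` expression, and `nbytes` is plain expression arithmetic over the ref's `begin`/`end` offsets rather than a native call. The column name `video` is illustrative:

    # Illustrative column name; the __ref__ dtype mirrors what deref builds.
    import pyarrow as pa
    import spiral.expressions as se

    payload = se.deref("video")  # str promotes to an se.keyed ref expression

    ref_dtype = pa.struct(
        [("__ref__", pa.struct([("id", pa.string()), ("begin", pa.uint64()), ("end", pa.uint64())]))]
    )
    sizes = se.refs.nbytes(se.keyed("video", ref_dtype))  # end - begin, per row
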