pyspiral 0.6.11__cp312-abi3-manylinux_2_28_aarch64.whl → 0.6.13__cp312-abi3-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (41)
  1. {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/METADATA +8 -5
  2. {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/RECORD +36 -30
  3. spiral/__init__.py +7 -0
  4. spiral/_lib.abi3.so +0 -0
  5. spiral/cli/iceberg.py +1 -1
  6. spiral/cli/key_spaces.py +15 -1
  7. spiral/cli/tables.py +3 -3
  8. spiral/client.py +12 -11
  9. spiral/core/client/__init__.pyi +8 -8
  10. spiral/core/expr/__init__.pyi +15 -0
  11. spiral/core/expr/images/__init__.pyi +3 -0
  12. spiral/core/expr/list_/__init__.pyi +4 -0
  13. spiral/core/expr/refs/__init__.pyi +4 -0
  14. spiral/core/expr/str_/__init__.pyi +3 -0
  15. spiral/core/expr/struct_/__init__.pyi +6 -0
  16. spiral/core/expr/text/__init__.pyi +5 -0
  17. spiral/core/expr/udf/__init__.pyi +14 -0
  18. spiral/core/expr/video/__init__.pyi +3 -0
  19. spiral/core/table/__init__.pyi +19 -1
  20. spiral/core/table/spec/__init__.pyi +6 -0
  21. spiral/dataloader.py +52 -38
  22. spiral/enrichment.py +153 -0
  23. spiral/expressions/__init__.py +15 -19
  24. spiral/expressions/base.py +9 -4
  25. spiral/expressions/http.py +10 -80
  26. spiral/expressions/s3.py +15 -0
  27. spiral/expressions/tiff.py +2 -3
  28. spiral/expressions/udf.py +38 -24
  29. spiral/project.py +6 -6
  30. spiral/scan.py +76 -33
  31. spiral/settings.py +9 -6
  32. spiral/streaming_/stream.py +1 -1
  33. spiral/table.py +41 -9
  34. spiral/transaction.py +42 -0
  35. spiral/expressions/io.py +0 -100
  36. spiral/expressions/mp4.py +0 -62
  37. spiral/expressions/png.py +0 -18
  38. spiral/expressions/qoi.py +0 -18
  39. spiral/expressions/refs.py +0 -58
  40. {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/WHEEL +0 -0
  41. {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/entry_points.txt +0 -0
spiral/enrichment.py ADDED
@@ -0,0 +1,153 @@
+ import dataclasses
+ import logging
+ from functools import partial
+ from typing import TYPE_CHECKING, Optional
+
+ from spiral.core.client import Shard
+ from spiral.core.table.spec import Operation
+ from spiral.expressions import Expr
+
+ if TYPE_CHECKING:
+     from spiral import KeySpaceIndex, Table
+
+ logger = logging.getLogger(__name__)
+
+
+ class Enrichment:
+     """
+     An enrichment derives new columns from existing ones, such as fetching data from object storage
+     with `se.s3.get` or computing embeddings. With the column groups design supporting hundreds of
+     thousands of columns, horizontally expanding tables are a powerful primitive.
+
+     NOTE: Spiral aims to optimize enrichments where the source and destination tables are the same.
+     """
+
+     def __init__(
+         self,
+         table: "Table",
+         projection: Expr,
+         where: Expr | None,
+     ):
+         self._table = table
+         self._projection = projection
+         self._where = where
+
+     @property
+     def table(self) -> "Table":
+         """The table to write back into."""
+         return self._table
+
+     @property
+     def projection(self) -> Expr:
+         """The projection expression."""
+         return self._projection
+
+     @property
+     def where(self) -> Expr | None:
+         """The filter expression."""
+         return self._where
+
+     def apply(self, *, batch_readahead: int | None = None, partition_size_bytes: int | None = None) -> None:
+         """Apply the enrichment onto the table in a streaming fashion.
+
+         For large tables, consider using `apply_dask` for distributed execution.
+         """
+         scan = self._table.spiral.scan(self._projection, where=self._where)
+
+         with self._table.txn() as txn:
+             txn.writeback(
+                 scan,
+                 partition_size_bytes=partition_size_bytes,
+                 batch_readahead=batch_readahead,
+             )
+
+     # TODO(marko): Need to figure out this sharding with key space index in places.
+     # We could compute on-demand instead of requiring a resource.
+     def apply_dask(
+         self, *, index: Optional["KeySpaceIndex"] = None, partition_size_bytes: int | None = None, **kwargs
+     ) -> None:
+         """Use distributed Dask to apply the enrichment. Requires `dask[distributed]` to be installed.
+
+         If the "address" of an existing Dask cluster is not provided in `kwargs`, a local cluster is created.
+
+         IMPORTANT: Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
+         usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
+         encountering such issues, consider splitting the enrichment into a UDF-only derivation executed in a
+         streaming fashion, followed by a Dask enrichment for the rest of the computation.
+         If that is not possible, please reach out to support for assistance.
+
+         Args:
+             index: Optional key space index to use for sharding the enrichment.
+                 If not provided, the table's default sharding will be used.
+             **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`,
+                 such as `address` to connect to an existing cluster.
+         """
+         try:
+             from dask.distributed import Client
+         except ImportError:
+             raise ImportError("dask is not installed, please install dask[distributed] to use this feature.")
+
+         # Connect before doing any work.
+         dask_client = Client(**kwargs)
+
+         # Start a transaction BEFORE the planning scan.
+         tx = self._table.txn()
+         plan_scan = self._table.spiral.scan(self._projection, where=self._where)
+
+         # Determine the "tasks". Use the index if provided.
+         shards = plan_scan.shards()
+         if index is not None:
+             # TODO(marko): This will use index's asof automatically.
+             shards = self._table.spiral.internal.compute_shards(index.core)
+
+         # Partially bind the enrichment function.
+         _compute = partial(
+             _enrichment_task,
+             settings_dict=self._table.spiral.config.model_dump(),
+             state_json=plan_scan.core.scan_state().to_json(),
+             output_table_id=self._table.table_id,
+             partition_size_bytes=partition_size_bytes,
+         )
+         enrichments = dask_client.map(_compute, shards)
+
+         logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {dask_client.dashboard_link}")
+         for result in dask_client.gather(enrichments):
+             result: EnrichmentTaskResult
+             tx.include(result.ops)
+
+         if tx.is_empty():
+             logger.warning("Transaction not committed. No rows were read for enrichment.")
+             return
+
+         tx.commit()
+
+
+ @dataclasses.dataclass
+ class EnrichmentTaskResult:
+     ops: list[Operation]
+
+     def __getstate__(self):
+         return {"ops": [op.to_json() for op in self.ops]}
+
+     def __setstate__(self, state):
+         self.ops = [Operation.from_json(op_json) for op_json in state["ops"]]
+
+
+ # NOTE(marko): This function must be picklable!
+ def _enrichment_task(
+     shard: Shard, *, settings_dict, state_json, output_table_id, partition_size_bytes: int | None
+ ) -> EnrichmentTaskResult:
+     # Returns operations that can be included in a transaction.
+     from spiral import Scan, Spiral
+     from spiral.core.table import ScanState
+     from spiral.settings import Settings
+
+     settings: Settings = Settings.model_validate(settings_dict)
+     sp = Spiral(config=settings)
+     state = ScanState.from_json(state_json)
+     task_scan = Scan(sp, sp.core.load_scan(state))
+     table = sp.table(output_table_id)
+
+     task_tx = table.txn()
+     task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
+     return EnrichmentTaskResult(ops=task_tx.take())
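A minimal usage sketch of the new `Enrichment` API (the client construction, table identifier, and `uri` column below are hypothetical; the constructor and `apply` signature come from the code above):

```python
from spiral import Spiral
from spiral import expressions as se
from spiral.enrichment import Enrichment

sp = Spiral()  # assumes default settings resolve to a configured client
table = sp.table("my-project.my-dataset.my-table")  # hypothetical table id

# Hypothetical projection: fetch each row's object bytes from storage
# and write them back as a new "blob" column.
projection = se.lift({"blob": se.s3.get(table["uri"])})

enrichment = Enrichment(table, projection, where=None)
enrichment.apply()  # streaming; apply_dask() distributes the same work
```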
spiral/expressions/__init__.py CHANGED
@@ -9,30 +9,23 @@ import pyarrow as pa
  from spiral import _lib, arrow_
 
  from . import http as http
- from . import io as io
  from . import list_ as list
- from . import mp4 as mp4
- from . import png as png
- from . import qoi as qoi
- from . import refs as refs
+ from . import s3 as s3
  from . import str_ as str
  from . import struct as struct
  from . import text as text
- from . import tiff as tiff
  from .base import Expr, ExprLike, NativeExpr
+ from .udf import UDF
 
  __all__ = [
      "Expr",
      "add",
      "and_",
-     "deref",
      "divide",
      "eq",
      "getitem",
      "gt",
      "gte",
-     "http",
-     "io",
      "is_not_null",
      "is_null",
      "lift",
@@ -48,19 +41,16 @@ __all__ = [
      "or_",
      "pack",
      "aux",
-     "ref",
-     "refs",
      "scalar",
      "select",
      "str",
      "struct",
      "subtract",
-     "tiff",
      "xor",
-     "png",
-     "qoi",
-     "mp4",
      "text",
+     "s3",
+     "http",
+     "UDF",
  ]
 
  # Inline some of the struct expressions since they're so common
@@ -68,8 +58,6 @@ getitem = struct.getitem
  merge = struct.merge
  pack = struct.pack
  select = struct.select
- ref = refs.ref
- deref = refs.deref
 
 
  def lift(expr: ExprLike) -> Expr:
@@ -127,9 +115,17 @@ def evaluate(expr: ExprLike) -> pa.RecordBatchReader:
          return pa.RecordBatchReader.from_batches(expr.schema, [expr])
      if isinstance(expr, pa.StructArray):
          return pa.Table.from_struct_array(expr).to_reader()
+
      if isinstance(expr, pa.ChunkedArray):
-         # TODO(marko): We shouldn't need to combine chunks here.
-         return evaluate(expr.combine_chunks())
+         if not pa.types.is_struct(expr.type):
+             raise ValueError("Arrow chunked array must be a struct type.")
+
+         def _iter_batches():
+             for chunk in expr.chunks:
+                 yield pa.RecordBatch.from_struct_array(chunk)
+
+         return pa.RecordBatchReader.from_batches(pa.schema(expr.type.fields), _iter_batches())
+
      if isinstance(expr, pa.Array):
          raise ValueError("Arrow array must be a struct array.")
 
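The new `ChunkedArray` branch of `evaluate` streams chunk-by-chunk instead of calling `combine_chunks()`. A small sketch of the behavior, using only the public `se.evaluate` shown above:

```python
import pyarrow as pa
from spiral import expressions as se

dtype = pa.struct([("a", pa.int64())])
chunked = pa.chunked_array([
    pa.array([{"a": 1}, {"a": 2}], type=dtype),
    pa.array([{"a": 3}], type=dtype),
])

# Each struct chunk is surfaced as its own RecordBatch, so no
# contiguous copy of the whole column is materialized.
reader = se.evaluate(chunked)
assert [batch.num_rows for batch in reader] == [2, 1]
```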
spiral/expressions/base.py CHANGED
@@ -1,6 +1,5 @@
- import builtins
  import datetime
- from typing import TypeAlias
+ from typing import TypeAlias, Union
 
  import pyarrow as pa
 
@@ -153,5 +152,11 @@ class Expr:
 
 
  ScalarLike: TypeAlias = bool | int | float | str | list["ScalarLike"] | datetime.datetime | None
- ArrowLike: TypeAlias = pa.Array | pa.ChunkedArray | pa.Scalar | pa.RecordBatch | pa.Table
- ExprLike: TypeAlias = Expr | dict[str, "ExprLike"] | builtins.list | ArrowLike | ScalarLike
+ ArrowLike: TypeAlias = Union[
+     pa.RecordBatch,
+     "pa.Array[pa.Scalar[pa.DataType]]",
+     "pa.ChunkedArray[pa.Scalar[pa.DataType]]",
+     "pa.Scalar[pa.DataType]",
+     pa.Table,
+ ]
+ ExprLike: TypeAlias = Expr | dict[str, "ExprLike"] | list["ExprLike"] | ArrowLike | ScalarLike
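For illustration, the kinds of values the revised `ExprLike` alias admits, each passed through `lift` (a sketch; the recursive `list["ExprLike"]` replaces the old untyped `builtins.list`, and whether every branch lifts cleanly depends on the native implementation):

```python
import pyarrow as pa
from spiral import expressions as se

se.lift({"a": 1, "b": "x"})       # dict[str, ExprLike]
se.lift([1, 2, 3])                # list["ExprLike"] / a ScalarLike list
se.lift(pa.table({"a": [1, 2]}))  # ArrowLike (pa.Table)
se.lift(3.14)                     # ScalarLike
```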
spiral/expressions/http.py CHANGED
@@ -1,86 +1,16 @@
- import hishel
- import httpx
- import pyarrow as pa
-
+ from spiral import _lib
  from spiral.expressions.base import Expr, ExprLike
- from spiral.expressions.struct import pack
- from spiral.expressions.udf import UDF
- from spiral.settings import APP_DIR
-
-
- def get(url: ExprLike, headers: ExprLike = None, force_cache: bool = False) -> Expr:
-     """Submit a GET request to either a scalar or vector of URLs."""
-     to_pack = {"url": url}
-     if headers is not None:
-         to_pack["headers"] = headers
-     return HttpGet(force_cache)(pack(to_pack))
-
-
- class HttpGet(UDF):
-     RES_DTYPE: pa.DataType = pa.struct(
-         [
-             pa.field("bytes", pa.large_binary()),
-             pa.field("status", pa.int32()),
-             pa.field("headers", pa.map_(pa.string(), pa.string())),
-         ]
-     )
-
-     def __init__(self, force_cache: bool = False):
-         super().__init__("http.get")
-         self._force_cache = force_cache
-
-     def return_type(self, *input_types: pa.DataType) -> pa.DataType:
-         return HttpGet.RES_DTYPE
-
-     def invoke(self, *input_args: pa.Array) -> pa.Array:
-         if len(input_args) != 1:
-             raise ValueError(f"Expected 1 argument, got {len(input_args)}")
-         result = _http_request(input_args[0], self._force_cache)
-         if isinstance(result, pa.ChunkedArray):
-             result = result.combine_chunks()
-         return result
-
-
- def _http_request(arg: pa.Array, force_cache: bool) -> pa.Array:
-     client = _HttpClient()
-
-     if isinstance(arg, pa.StructArray):
-         # We assume a vector of requests, but with potentially many arguments
-         return pa.array(
-             [
-                 _response_dict(
-                     client.request(
-                         req.get("method", "GET").upper(),
-                         req["url"],
-                         headers=req.get("headers", {}),
-                         extensions={"force_cache": force_cache},
-                     )
-                 )
-                 for req in arg.to_pylist()
-             ],
-             type=HttpGet.RES_DTYPE,
-         )
-
-     raise TypeError(f"Unsupported argument: {arg} ({type(arg)})")
-
-
- def _response_dict(response: httpx.Response) -> dict:
-     if response.status_code != 200:
-         raise ValueError(f"Request failed with status {response.status_code}")
-     return {
-         "bytes": response.read(),
-         "status": response.status_code,
-         "headers": dict(response.headers),
-     }
-
-
- class _HttpClient(hishel.CacheClient):
-     _instance: "_HttpClient" = None
-
-     def __new__(cls, *args, **kwargs):
-         if not cls._instance:
-             cls._instance = super().__new__(cls)
-         return cls._instance
-
-     def __init__(self):
-         super().__init__(storage=hishel.FileStorage(base_path=APP_DIR / "http.cache", ttl=3600))
+
+
+ def get(expr: ExprLike) -> Expr:
+     """Read data from the URL.
+
+     Args:
+         expr: URLs of the data that needs to be read.
+     """
+     from spiral import expressions as se
+
+     expr = se.lift(expr)
+
+     # This just works :)
+     return Expr(_lib.expr.s3.get(expr.__expr__))
spiral/expressions/s3.py ADDED
@@ -0,0 +1,15 @@
+ from spiral import _lib
+ from spiral.expressions.base import Expr, ExprLike
+
+
+ def get(expr: ExprLike) -> Expr:
+     """Read data from object storage by the object's URL.
+
+     Args:
+         expr: URLs of the data that needs to be read from object storage.
+     """
+     from spiral import expressions as se
+
+     expr = se.lift(expr)
+
+     return Expr(_lib.expr.s3.get(expr.__expr__))
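Both the rewritten `http.get` and the new `s3.get` delegate to the same native `_lib.expr.s3.get` reader. A usage sketch (the table handle and its `uri` column are hypothetical; `sp.scan` is the client scan used elsewhere in this diff):

```python
from spiral import expressions as se

# Hypothetical: fetch object bytes for each row's "uri" value and scan
# them alongside the original URL.
blobs = se.s3.get(table["uri"])
scan = sp.scan(se.lift({"uri": table["uri"], "blob": blobs}))
```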
spiral/expressions/tiff.py CHANGED
@@ -2,7 +2,6 @@ import numpy as np
  import pyarrow as pa
 
  from spiral.expressions.base import Expr, ExprLike
- from spiral.expressions.udf import RefUDF
 
  _TIFF_RES_DTYPE: pa.DataType = pa.struct(
      [
@@ -78,7 +77,7 @@ def select(
      return TiffSelectUDF()(expr, shape, indexes)
 
 
- class TiffReadUDF(RefUDF):
+ class TiffReadUDF:
      def __init__(self):
          super().__init__("tiff.read")
 
@@ -122,7 +121,7 @@ class TiffReadUDF(RefUDF):
          return _return_result(result, indexes)
 
 
- class TiffSelectUDF(RefUDF):
+ class TiffSelectUDF:
      def __init__(self):
          super().__init__("tiff.select")
spiral/expressions/udf.py CHANGED
@@ -3,44 +3,58 @@ import abc
  import pyarrow as pa
 
  from spiral import _lib
- from spiral.expressions.base import Expr
+ from spiral.expressions.base import Expr, ExprLike
 
 
- class BaseUDF:
-     def __init__(self, udf):
-         self._udf = udf
-
-     def __call__(self, *args) -> Expr:
-         """Create an expression that calls this UDF with the given arguments."""
-         from spiral import expressions as se
-
-         args = [se.lift(arg).__expr__ for arg in args]
-         return Expr(self._udf(args))
-
-     @abc.abstractmethod
-     def return_type(self, *input_types: pa.DataType) -> pa.DataType: ...
-
-
- class UDF(BaseUDF):
-     """A User-Defined Function (UDF)."""
-
-     def __init__(self, name: str):
-         super().__init__(_lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke))
-
-     @abc.abstractmethod
-     def invoke(self, *input_args: pa.Array) -> pa.Array: ...
-
-
- class RefUDF(BaseUDF):
-     """A UDF over a single ref cell, and therefore can access the file object."""
-
-     def __init__(self, name: str):
-         super().__init__(_lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke, scope="ref"))
-
-     @abc.abstractmethod
-     def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
-         """Invoke the UDF with the given arguments.
-
-         NOTE: The first argument is always the ref cell. All array input args will be sliced to the appropriate row.
-         """
-         ...
+ class UDF(abc.ABC):
+     """A User-Defined Function (UDF). This class should be subclassed to define custom UDFs.
+
+     Example:
+
+     ```python
+     from spiral import expressions as se
+     import pyarrow as pa
+
+     class MyAdd(se.UDF):
+         def __init__(self):
+             super().__init__("my_add")
+
+         def return_type(self, scope: pa.DataType):
+             if not isinstance(scope, pa.StructType):
+                 raise ValueError("Expected struct type as input")
+             return scope.field(0).type
+
+         def invoke(self, scope: pa.Array):
+             if not isinstance(scope, pa.StructArray):
+                 raise ValueError("Expected struct array as input")
+             return pa.compute.add(scope.field(0), scope.field(1))
+
+     my_add = MyAdd()
+
+     expr = my_add(table.select("first_arg", "second_arg"))
+     ```
+     """
+
+     def __init__(self, name: str):
+         self._udf = _lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke)
+
+     def __call__(self, scope: ExprLike) -> Expr:
+         """Create an expression that calls this UDF with the given arguments."""
+         from spiral import expressions as se
+
+         return Expr(self._udf(se.lift(scope).__expr__))
+
+     @abc.abstractmethod
+     def return_type(self, scope: pa.DataType) -> pa.DataType:
+         """Must return the return type of the UDF given the input scope type.
+
+         IMPORTANT: All expressions in Spiral must return nullable (Arrow-default) types,
+         including nested structs, meaning that all fields in structs must also be nullable,
+         and if those fields are structs, their fields must also be nullable, and so on.
+         """
+         ...
+
+     @abc.abstractmethod
+     def invoke(self, scope: pa.Array) -> pa.Array:
+         """Must implement the UDF logic given the input scope array."""
+         ...
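The nullability contract in `return_type` deserves a concrete illustration (a sketch; Arrow fields are nullable by default, so only an explicit `nullable=False` violates it):

```python
import pyarrow as pa

# Conforming: Arrow-default, all fields nullable.
ok = pa.struct([pa.field("x", pa.int64()), pa.field("y", pa.string())])

# Non-conforming per the docstring above: a non-nullable field, even
# nested inside a struct, breaks the all-fields-nullable requirement.
bad = pa.struct(
    [pa.field("inner", pa.struct([pa.field("x", pa.int64(), nullable=False)]))]
)
```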
spiral/project.py CHANGED
@@ -53,7 +53,7 @@ class Project:
          res = res[0]
 
          return Table(
-             self._spiral, self._spiral._core.table(res.id), identifier=f"{res.project_id}.{res.dataset}.{res.table}"
+             self._spiral, self._spiral.core.table(res.id), identifier=f"{res.project_id}.{res.dataset}.{res.table}"
          )
 
      def create_table(
@@ -78,7 +78,7 @@ class Project:
          key_schema = pa.schema(key_schema)
          key_schema = Schema.from_arrow(key_schema)
 
-         core_table = self._spiral._core.create_table(
+         core_table = self._spiral.core.create_table(
              project_id=self._id,
              dataset=dataset,
              table=table,
@@ -105,7 +105,7 @@ class Project:
              raise ValueError(f"Index not found: {name}")
          res = res[0]
 
-         return TextIndex(self._spiral._core.text_index(res.id), name=name)
+         return TextIndex(self._spiral.core.text_index(res.id), name=name)
 
      def create_text_index(
          self,
@@ -135,7 +135,7 @@ class Project:
          if where is not None:
              where = se.lift(where)
 
-         core_index = self._spiral._core.create_text_index(
+         core_index = self._spiral.core.create_text_index(
              project_id=self._id,
              name=name,
              projection=projection.__expr__,
@@ -154,7 +154,7 @@ class Project:
              raise ValueError(f"Index not found: {name}")
          res = res[0]
 
-         return KeySpaceIndex(self._spiral._core.key_space_index(res.id), name=name)
+         return KeySpaceIndex(self._spiral.core.key_space_index(res.id), name=name)
 
      def create_key_space_index(
          self,
@@ -185,7 +185,7 @@ class Project:
          if where is not None:
              where = se.lift(where)
 
-         core_index = self._spiral._core.create_key_space_index(
+         core_index = self._spiral.core.create_key_space_index(
              project_id=self._id,
              name=name,
              granularity=granularity,