pyspiral 0.6.11__cp312-abi3-macosx_11_0_arm64.whl → 0.6.13__cp312-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/METADATA +8 -5
- {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/RECORD +36 -30
- spiral/__init__.py +7 -0
- spiral/_lib.abi3.so +0 -0
- spiral/cli/iceberg.py +1 -1
- spiral/cli/key_spaces.py +15 -1
- spiral/cli/tables.py +3 -3
- spiral/client.py +12 -11
- spiral/core/client/__init__.pyi +8 -8
- spiral/core/expr/__init__.pyi +15 -0
- spiral/core/expr/images/__init__.pyi +3 -0
- spiral/core/expr/list_/__init__.pyi +4 -0
- spiral/core/expr/refs/__init__.pyi +4 -0
- spiral/core/expr/str_/__init__.pyi +3 -0
- spiral/core/expr/struct_/__init__.pyi +6 -0
- spiral/core/expr/text/__init__.pyi +5 -0
- spiral/core/expr/udf/__init__.pyi +14 -0
- spiral/core/expr/video/__init__.pyi +3 -0
- spiral/core/table/__init__.pyi +19 -1
- spiral/core/table/spec/__init__.pyi +6 -0
- spiral/dataloader.py +52 -38
- spiral/enrichment.py +153 -0
- spiral/expressions/__init__.py +15 -19
- spiral/expressions/base.py +9 -4
- spiral/expressions/http.py +10 -80
- spiral/expressions/s3.py +15 -0
- spiral/expressions/tiff.py +2 -3
- spiral/expressions/udf.py +38 -24
- spiral/project.py +6 -6
- spiral/scan.py +76 -33
- spiral/settings.py +9 -6
- spiral/streaming_/stream.py +1 -1
- spiral/table.py +41 -9
- spiral/transaction.py +42 -0
- spiral/expressions/io.py +0 -100
- spiral/expressions/mp4.py +0 -62
- spiral/expressions/png.py +0 -18
- spiral/expressions/qoi.py +0 -18
- spiral/expressions/refs.py +0 -58
- {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/WHEEL +0 -0
- {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/entry_points.txt +0 -0
spiral/enrichment.py
ADDED
@@ -0,0 +1,153 @@
+import dataclasses
+import logging
+from functools import partial
+from typing import TYPE_CHECKING, Optional
+
+from spiral.core.client import Shard
+from spiral.core.table.spec import Operation
+from spiral.expressions import Expr
+
+if TYPE_CHECKING:
+    from spiral import KeySpaceIndex, Table
+
+logger = logging.getLogger(__name__)
+
+
+class Enrichment:
+    """
+    An enrichment is used to derive new columns from the existing once, such as fetching data from object storage
+    with `se.s3.get` or compute embeddings. With column groups design supporting 100s of thousands of columns,
+    horizontally expanding tables are a powerful primitive.
+
+    NOTE: Spiral aims to optimize enrichments where source and destination table are the same.
+    """
+
+    def __init__(
+        self,
+        table: "Table",
+        projection: Expr,
+        where: Expr | None,
+    ):
+        self._table = table
+        self._projection = projection
+        self._where = where
+
+    @property
+    def table(self) -> "Table":
+        """The table to write back into."""
+        return self._table
+
+    @property
+    def projection(self) -> Expr:
+        """The projection expression."""
+        return self._projection
+
+    @property
+    def where(self) -> Expr | None:
+        """The filter expression."""
+        return self._where
+
+    def apply(self, *, batch_readahead: int | None = None, partition_size_bytes: int | None = None) -> None:
+        """Apply the enrichment onto the table in a streaming fashion.
+
+        For large tables, consider using `apply_dask` for distributed execution.
+        """
+        scan = self._table.spiral.scan(self._projection, where=self._where)
+
+        with self._table.txn() as txn:
+            txn.writeback(
+                scan,
+                partition_size_bytes=partition_size_bytes,
+                batch_readahead=batch_readahead,
+            )
+
+    # TODO(marko): Need to figure out this sharding with key space index in places.
+    # We could compute on-demand instead of requiring a resource.
+    def apply_dask(
+        self, *, index: Optional["KeySpaceIndex"] = None, partition_size_bytes: int | None = None, **kwargs
+    ) -> None:
+        """Use distributed Dask to apply the enrichment. Requires `dask[distributed]` to be installed.
+
+        If "address" of an existing Dask cluster is not provided in `kwargs`, a local cluster will be created.
+
+        IMPORTANT: Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
+        usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
+        encountering such issues, consider splitting the enrichment into UDF-only derivation that will be
+        executed in a streaming fashion, followed by a Dask enrichment for the rest of the computation.
+        If that is not possible, please reach out to the support for assistance.
+
+        Args:
+            index: Optional key space index to use for sharding the enrichment.
+                If not provided, the table's default sharding will be used.
+            **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`
+                such as `address` to connect to an existing cluster.
+        """
+        try:
+            from dask.distributed import Client
+        except ImportError:
+            raise ImportError("dask is not installed, please install dask[distributed] to use this feature.")
+
+        # Connect before doing any work.
+        dask_client = Client(**kwargs)
+
+        # Start a transaction BEFORE the planning scan.
+        tx = self._table.txn()
+        plan_scan = self._table.spiral.scan(self._projection, where=self._where)
+
+        # Determine the "tasks". Use the index if provided.
+        shards = plan_scan.shards()
+        if index is not None:
+            # TODO(marko): This will use index's asof automatically.
+            shards = self._table.spiral.internal.compute_shards(index.core)
+
+        # Partially bind the enrichment function.
+        _compute = partial(
+            _enrichment_task,
+            settings_dict=self._table.spiral.config.model_dump(),
+            state_json=plan_scan.core.scan_state().to_json(),
+            output_table_id=self._table.table_id,
+            partition_size_bytes=partition_size_bytes,
+        )
+        enrichments = dask_client.map(_compute, shards)
+
+        logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {dask_client.dashboard_link}")
+        for result in dask_client.gather(enrichments):
+            result: EnrichmentTaskResult
+            tx.include(result.ops)
+
+        if tx.is_empty():
+            logger.warning("Transaction not committed. No rows were read for enrichment.")
+            return
+
+        tx.commit()
+
+
+@dataclasses.dataclass
+class EnrichmentTaskResult:
+    ops: list[Operation]
+
+    def __getstate__(self):
+        return {"ops": [op.to_json() for op in self.ops]}
+
+    def __setstate__(self, state):
+        self.ops = [Operation.from_json(op_json) for op_json in state["ops"]]
+
+
+# NOTE(marko): This function must be picklable!
+def _enrichment_task(
+    shard: Shard, *, settings_dict, state_json, output_table_id, partition_size_bytes: int | None
+) -> EnrichmentTaskResult:
+    # Returns operations that can be included in a transaction.
+    from spiral import Scan, Spiral
+    from spiral.core.table import ScanState
+    from spiral.settings import Settings
+
+    settings: Settings = Settings.model_validate(settings_dict)
+    sp = Spiral(config=settings)
+    state = ScanState.from_json(state_json)
+    task_scan = Scan(sp, sp.core.load_scan(state))
+    table = sp.table(output_table_id)
+
+    task_tx = table.txn()
+    task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
+    return EnrichmentTaskResult(ops=task_tx.take())
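The new Enrichment API above writes derived columns back into the source table, either streaming in-process via apply() or sharded across a Dask cluster via apply_dask(). A minimal usage sketch, not taken from the package: it assumes an existing Table `t` whose "url" column holds object-storage URLs, and that `t["url"]` is valid column access that lifts to an expression.

    from spiral import expressions as se
    from spiral.enrichment import Enrichment

    # Derive a new column by fetching each row's object from storage.
    projection = se.lift({"raw": se.s3.get(t["url"])})  # t["url"] is assumed column access

    enrichment = Enrichment(t, projection, where=None)

    # Streaming execution in this process; apply_dask() would instead distribute one task per shard.
    enrichment.apply(batch_readahead=2)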
spiral/expressions/__init__.py
CHANGED
@@ -9,30 +9,23 @@ import pyarrow as pa
 from spiral import _lib, arrow_
 
 from . import http as http
-from . import io as io
 from . import list_ as list
-from . import …
-from . import png as png
-from . import qoi as qoi
-from . import refs as refs
+from . import s3 as s3
 from . import str_ as str
 from . import struct as struct
 from . import text as text
-from . import tiff as tiff
 from .base import Expr, ExprLike, NativeExpr
+from .udf import UDF
 
 __all__ = [
     "Expr",
     "add",
     "and_",
-    "deref",
     "divide",
     "eq",
     "getitem",
     "gt",
     "gte",
-    "http",
-    "io",
     "is_not_null",
     "is_null",
     "lift",
@@ -48,19 +41,16 @@ __all__ = [
     "or_",
     "pack",
     "aux",
-    "ref",
-    "refs",
     "scalar",
     "select",
     "str",
     "struct",
     "subtract",
-    "tiff",
     "xor",
-    "png",
-    "qoi",
-    "mp4",
     "text",
+    "s3",
+    "http",
+    "UDF",
 ]
 
 # Inline some of the struct expressions since they're so common
@@ -68,8 +58,6 @@ getitem = struct.getitem
 merge = struct.merge
 pack = struct.pack
 select = struct.select
-ref = refs.ref
-deref = refs.deref
 
 
 def lift(expr: ExprLike) -> Expr:
@@ -127,9 +115,17 @@ def evaluate(expr: ExprLike) -> pa.RecordBatchReader:
         return pa.RecordBatchReader.from_batches(expr.schema, [expr])
     if isinstance(expr, pa.StructArray):
         return pa.Table.from_struct_array(expr).to_reader()
+
     if isinstance(expr, pa.ChunkedArray):
-        …
-        …
+        if not pa.types.is_struct(expr.type):
+            raise ValueError("Arrow chunked array must be a struct type.")
+
+        def _iter_batches():
+            for chunk in expr.chunks:
+                yield pa.RecordBatch.from_struct_array(chunk)
+
+        return pa.RecordBatchReader.from_batches(pa.schema(expr.type.fields), _iter_batches())
+
     if isinstance(expr, pa.Array):
         raise ValueError("Arrow array must be a struct array.")
 
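With this change, se.evaluate accepts a struct-typed pa.ChunkedArray and streams it chunk by chunk as record batches. A hedged sketch of the new path (the data below is illustrative, not from the package):

    import pyarrow as pa
    from spiral import expressions as se

    struct_type = pa.struct([("a", pa.int64()), ("b", pa.string())])
    chunk = pa.array([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}], type=struct_type)
    chunked = pa.chunked_array([chunk])

    reader = se.evaluate(chunked)  # pa.RecordBatchReader, one batch per chunk
    print(reader.read_all())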
spiral/expressions/base.py
CHANGED
@@ -1,6 +1,5 @@
-import builtins
 import datetime
-from typing import TypeAlias
+from typing import TypeAlias, Union
 
 import pyarrow as pa
 
@@ -153,5 +152,11 @@ class Expr:
 
 
 ScalarLike: TypeAlias = bool | int | float | str | list["ScalarLike"] | datetime.datetime | None
-ArrowLike: TypeAlias = …
-…
+ArrowLike: TypeAlias = Union[
+    pa.RecordBatch,
+    "pa.Array[pa.Scalar[pa.DataType]]",
+    "pa.ChunkedArray[pa.Scalar[pa.DataType]]",
+    "pa.Scalar[pa.DataType]",
+    pa.Table,
+]
+ExprLike: TypeAlias = Expr | dict[str, "ExprLike"] | list["ExprLike"] | ArrowLike | ScalarLike
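With ArrowLike spelled out as a Union of Arrow containers, ExprLike explicitly covers record batches, arrays, chunked arrays, scalars, and tables alongside plain Python scalars, dicts, and lists. A hedged sketch of what can be lifted (illustrative names and data, not from the package):

    import pyarrow as pa
    from spiral import expressions as se

    # Dicts of Arrow arrays and whole Arrow tables both satisfy ExprLike.
    expr_from_dict = se.lift({"id": pa.array([1, 2, 3]), "name": pa.array(["a", "b", "c"])})
    expr_from_table = se.lift(pa.table({"id": [1, 2, 3]}))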
spiral/expressions/http.py
CHANGED
@@ -1,86 +1,16 @@
-import …
-import httpx
-import pyarrow as pa
-
+from spiral import _lib
 from spiral.expressions.base import Expr, ExprLike
-from spiral.expressions.struct import pack
-from spiral.expressions.udf import UDF
-from spiral.settings import APP_DIR
-
-
-def get(url: ExprLike, headers: ExprLike = None, force_cache: bool = False) -> Expr:
-    """Submit a GET request to either a scalar of vector of URLs."""
-    to_pack = {"url": url}
-    if headers is not None:
-        to_pack["headers"] = headers
-    return HttpGet(force_cache)(pack(to_pack))
-
-
-class HttpGet(UDF):
-    RES_DTYPE: pa.DataType = pa.struct(
-        [
-            pa.field("bytes", pa.large_binary()),
-            pa.field("status", pa.int32()),
-            pa.field("headers", pa.map_(pa.string(), pa.string())),
-        ]
-    )
-
-    def __init__(self, force_cache: bool = False):
-        super().__init__("http.get")
-        self._force_cache = force_cache
-
-    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
-        return HttpGet.RES_DTYPE
-
-    def invoke(self, *input_args: pa.Array) -> pa.Array:
-        if len(input_args) != 1:
-            raise ValueError(f"Expected 1 argument, got {len(input_args)}")
-        result = _http_request(input_args[0], self._force_cache)
-        if isinstance(result, pa.ChunkedArray):
-            result = result.combine_chunks()
-        return result
-
-
-def _http_request(arg: pa.Array, force_cache: bool) -> pa.Array:
-    client = _HttpClient()
-
-    if isinstance(arg, pa.StructArray):
-        # We assume a vector of requests, but with potentially many arguments
-        return pa.array(
-            [
-                _response_dict(
-                    client.request(
-                        req.get("method", "GET").upper(),
-                        req["url"],
-                        headers=req.get("headers", {}),
-                        extensions={"force_cache": force_cache},
-                    )
-                )
-                for req in arg.to_pylist()
-            ],
-            type=HttpGet.RES_DTYPE,
-        )
-
-    raise TypeError(f"Unsupported argument: {arg} ({type(arg)})")
-
 
-def _response_dict(response: httpx.Response) -> dict:
-    if response.status_code != 200:
-        raise ValueError(f"Request failed with status {response.status_code}")
-    return {
-        "bytes": response.read(),
-        "status": response.status_code,
-        "headers": dict(response.headers),
-    }
 
+def get(expr: ExprLike) -> Expr:
+    """Read data from the URL.
 
-…
-…
+    Args:
+        expr: URLs of the data that needs to be read.
+    """
+    from spiral import expressions as se
 
-…
-        if not cls._instance:
-            cls._instance = super().__new__(cls)
-        return cls._instance
+    expr = se.lift(expr)
 
-…
-…
+    # This just works :)
+    return Expr(_lib.expr.s3.get(expr.__expr__))
spiral/expressions/s3.py
ADDED
@@ -0,0 +1,15 @@
+from spiral import _lib
+from spiral.expressions.base import Expr, ExprLike
+
+
+def get(expr: ExprLike) -> Expr:
+    """Read data from object storage by the object's URL.
+
+    Args:
+        expr: URLs of the data that needs to be read from object storage.
+    """
+    from spiral import expressions as se
+
+    expr = se.lift(expr)
+
+    return Expr(_lib.expr.s3.get(expr.__expr__))
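Both se.s3.get and the rewritten se.http.get now delegate to the same native expression, replacing the Python-side HttpGet UDF from 0.6.11 along with its headers and force_cache options. A hedged usage sketch, assuming a table `t` whose "uri" column holds URL strings and that `t["uri"]` is valid column access:

    from spiral import expressions as se

    # Each row's "uri" is fetched when the expression is scanned or evaluated.
    fetched = se.s3.get(t["uri"])          # t["uri"] is assumed column access
    fetched_http = se.http.get(t["uri"])   # same call shape for HTTP URLs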
spiral/expressions/tiff.py
CHANGED
@@ -2,7 +2,6 @@ import numpy as np
 import pyarrow as pa
 
 from spiral.expressions.base import Expr, ExprLike
-from spiral.expressions.udf import RefUDF
 
 _TIFF_RES_DTYPE: pa.DataType = pa.struct(
     [
@@ -78,7 +77,7 @@ def select(
     return TiffSelectUDF()(expr, shape, indexes)
 
 
-class TiffReadUDF(RefUDF):
+class TiffReadUDF:
     def __init__(self):
         super().__init__("tiff.read")
 
@@ -122,7 +121,7 @@ class TiffReadUDF(RefUDF):
         return _return_result(result, indexes)
 
 
-class TiffSelectUDF(RefUDF):
+class TiffSelectUDF:
     def __init__(self):
         super().__init__("tiff.select")
 
spiral/expressions/udf.py
CHANGED
@@ -3,44 +3,58 @@ import abc
 import pyarrow as pa
 
 from spiral import _lib
-from spiral.expressions.base import Expr
+from spiral.expressions.base import Expr, ExprLike
 
 
-class …
-…
-        self._udf = udf
+class UDF(abc.ABC):
+    """A User-Defined Function (UDF). This class should be subclassed to define custom UDFs.
 
-…
-        """Create an expression that calls this UDF with the given arguments."""
-        from spiral import expressions as se
+    Example:
 
-…
-…
+        ```python
+        from spiral import expressions as se
+        import pyarrow as pa
 
-…
-…
+        class MyAdd(se.UDF):
+            def __init__(self):
+                super().__init__("my_add")
 
+            def return_type(self, scope: pa.DataType):
+                if not isinstance(scope, pa.StructType):
+                    raise ValueError("Expected struct type as input")
+                return scope.field(0).type
 
-…
-…
+            def invoke(self, scope: pa.Array):
+                if not isinstance(scope, pa.StructArray):
+                    raise ValueError("Expected struct array as input")
+                return pa.compute.add(scope.field(0), scope.field(1))
 
-…
-        super().__init__(_lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke))
+        my_add = MyAdd()
 
-…
-…
+        expr = my_add(table.select("first_arg", "second_arg"))
+        ```
+    """
 
+    def __init__(self, name: str):
+        self._udf = _lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke)
 
-…
-…
+    def __call__(self, scope: ExprLike) -> Expr:
+        """Create an expression that calls this UDF with the given arguments."""
+        from spiral import expressions as se
 
-…
-        super().__init__(_lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke, scope="ref"))
+        return Expr(self._udf(se.lift(scope).__expr__))
 
     @abc.abstractmethod
-    def …
-        """…
+    def return_type(self, scope: pa.DataType) -> pa.DataType:
+        """Must return the return type of the UDF given the input scope type.
 
-…
+        IMPORTANT: All expressions in Spiral must return nullable (Arrow default) types,
+        including nested structs, meaning that all fields in structs must also be nullable,
+        and if those fields are structs, their fields must also be nullable, and so on.
         """
         ...
+
+    @abc.abstractmethod
+    def invoke(self, scope: pa.Array) -> pa.Array:
+        """Must implement the UDF logic given the input scope array."""
+        ...
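The reworked UDF base class wires return_type and invoke into a native expression at construction time. As a further hedged sketch (not part of the package) of the nullability requirement called out in return_type's docstring, a UDF returning a nested struct should build its type from default-nullable fields:

    import pyarrow as pa
    import pyarrow.compute as pc
    from spiral import expressions as se

    class Stats(se.UDF):
        """Hypothetical UDF that reduces the first field of its scope to min/max."""

        def __init__(self):
            super().__init__("stats")  # illustrative name

        def return_type(self, scope: pa.DataType) -> pa.DataType:
            # pa.field() is nullable by default, which satisfies the requirement above.
            return pa.struct([pa.field("min", pa.float64()), pa.field("max", pa.float64())])

        def invoke(self, scope: pa.Array) -> pa.Array:
            values = scope.field(0)
            row = {"min": pc.min(values).as_py(), "max": pc.max(values).as_py()}
            return pa.array([row] * len(scope), type=self.return_type(scope.type))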
spiral/project.py
CHANGED
@@ -53,7 +53,7 @@ class Project:
         res = res[0]
 
         return Table(
-            self._spiral, self._spiral.…
+            self._spiral, self._spiral.core.table(res.id), identifier=f"{res.project_id}.{res.dataset}.{res.table}"
         )
 
     def create_table(
@@ -78,7 +78,7 @@ class Project:
         key_schema = pa.schema(key_schema)
         key_schema = Schema.from_arrow(key_schema)
 
-        core_table = self._spiral.…
+        core_table = self._spiral.core.create_table(
             project_id=self._id,
             dataset=dataset,
             table=table,
@@ -105,7 +105,7 @@ class Project:
             raise ValueError(f"Index not found: {name}")
         res = res[0]
 
-        return TextIndex(self._spiral.…
+        return TextIndex(self._spiral.core.text_index(res.id), name=name)
 
     def create_text_index(
         self,
@@ -135,7 +135,7 @@ class Project:
         if where is not None:
             where = se.lift(where)
 
-        core_index = self._spiral.…
+        core_index = self._spiral.core.create_text_index(
             project_id=self._id,
             name=name,
             projection=projection.__expr__,
@@ -154,7 +154,7 @@ class Project:
            raise ValueError(f"Index not found: {name}")
         res = res[0]
 
-        return KeySpaceIndex(self._spiral.…
+        return KeySpaceIndex(self._spiral.core.key_space_index(res.id), name=name)
 
     def create_key_space_index(
         self,
@@ -185,7 +185,7 @@ class Project:
         if where is not None:
             where = se.lift(where)
 
-        core_index = self._spiral.…
+        core_index = self._spiral.core.create_key_space_index(
             project_id=self._id,
             name=name,
             granularity=granularity,