pyspiral 0.4.0__pp310-pypy310_pp73-macosx_10_12_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyspiral-0.4.0.dist-info/METADATA +46 -0
- pyspiral-0.4.0.dist-info/RECORD +98 -0
- pyspiral-0.4.0.dist-info/WHEEL +4 -0
- pyspiral-0.4.0.dist-info/entry_points.txt +2 -0
- spiral/__init__.py +10 -0
- spiral/_lib.pypy310-pp73-darwin.so +0 -0
- spiral/adbc.py +393 -0
- spiral/api/__init__.py +64 -0
- spiral/api/admin.py +15 -0
- spiral/api/client.py +160 -0
- spiral/api/filesystems.py +153 -0
- spiral/api/organizations.py +77 -0
- spiral/api/projects.py +197 -0
- spiral/api/telemetry.py +19 -0
- spiral/api/types.py +20 -0
- spiral/api/workloads.py +52 -0
- spiral/arrow_.py +221 -0
- spiral/cli/__init__.py +79 -0
- spiral/cli/__main__.py +4 -0
- spiral/cli/admin.py +16 -0
- spiral/cli/app.py +65 -0
- spiral/cli/console.py +95 -0
- spiral/cli/fs.py +112 -0
- spiral/cli/iceberg/__init__.py +7 -0
- spiral/cli/iceberg/namespaces.py +47 -0
- spiral/cli/iceberg/tables.py +60 -0
- spiral/cli/indexes/__init__.py +19 -0
- spiral/cli/login.py +22 -0
- spiral/cli/orgs.py +90 -0
- spiral/cli/printer.py +53 -0
- spiral/cli/projects.py +136 -0
- spiral/cli/state.py +5 -0
- spiral/cli/tables/__init__.py +121 -0
- spiral/cli/telemetry.py +18 -0
- spiral/cli/types.py +51 -0
- spiral/cli/workloads.py +59 -0
- spiral/client.py +79 -0
- spiral/core/__init__.pyi +0 -0
- spiral/core/client/__init__.pyi +117 -0
- spiral/core/index/__init__.pyi +15 -0
- spiral/core/table/__init__.pyi +108 -0
- spiral/core/table/manifests/__init__.pyi +35 -0
- spiral/core/table/metastore/__init__.pyi +62 -0
- spiral/core/table/spec/__init__.pyi +214 -0
- spiral/datetime_.py +27 -0
- spiral/expressions/__init__.py +245 -0
- spiral/expressions/base.py +149 -0
- spiral/expressions/http.py +86 -0
- spiral/expressions/io.py +100 -0
- spiral/expressions/list_.py +68 -0
- spiral/expressions/mp4.py +62 -0
- spiral/expressions/png.py +18 -0
- spiral/expressions/qoi.py +18 -0
- spiral/expressions/refs.py +58 -0
- spiral/expressions/str_.py +39 -0
- spiral/expressions/struct.py +59 -0
- spiral/expressions/text.py +62 -0
- spiral/expressions/tiff.py +223 -0
- spiral/expressions/udf.py +46 -0
- spiral/grpc_.py +32 -0
- spiral/iceberg/__init__.py +3 -0
- spiral/iceberg/client.py +33 -0
- spiral/indexes/__init__.py +5 -0
- spiral/indexes/client.py +137 -0
- spiral/indexes/index.py +34 -0
- spiral/indexes/scan.py +22 -0
- spiral/project.py +46 -0
- spiral/protogen/_/__init__.py +0 -0
- spiral/protogen/_/arrow/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +1990 -0
- spiral/protogen/_/scandal/__init__.py +178 -0
- spiral/protogen/_/spiral/__init__.py +0 -0
- spiral/protogen/_/spiral/table/__init__.py +22 -0
- spiral/protogen/_/substrait/__init__.py +3399 -0
- spiral/protogen/_/substrait/extensions/__init__.py +115 -0
- spiral/protogen/__init__.py +0 -0
- spiral/protogen/substrait/__init__.py +3399 -0
- spiral/protogen/substrait/extensions/__init__.py +115 -0
- spiral/protogen/util.py +41 -0
- spiral/py.typed +0 -0
- spiral/server.py +17 -0
- spiral/settings.py +101 -0
- spiral/substrait_.py +279 -0
- spiral/tables/__init__.py +12 -0
- spiral/tables/client.py +130 -0
- spiral/tables/dataset.py +250 -0
- spiral/tables/debug/__init__.py +0 -0
- spiral/tables/debug/manifests.py +70 -0
- spiral/tables/debug/metrics.py +56 -0
- spiral/tables/debug/scan.py +248 -0
- spiral/tables/maintenance.py +12 -0
- spiral/tables/scan.py +193 -0
- spiral/tables/snapshot.py +78 -0
- spiral/tables/table.py +157 -0
- spiral/tables/transaction.py +52 -0
- spiral/types_.py +6 -0
@@ -0,0 +1,245 @@
|
|
1
|
+
import builtins
|
2
|
+
import functools
|
3
|
+
import operator
|
4
|
+
import warnings
|
5
|
+
from typing import Any
|
6
|
+
|
7
|
+
import pyarrow as pa
|
8
|
+
|
9
|
+
from spiral import _lib, arrow_
|
10
|
+
|
11
|
+
from . import http as http
|
12
|
+
from . import io as io
|
13
|
+
from . import list_ as list
|
14
|
+
from . import mp4 as mp4
|
15
|
+
from . import png as png
|
16
|
+
from . import qoi as qoi
|
17
|
+
from . import refs as refs
|
18
|
+
from . import str_ as str
|
19
|
+
from . import struct as struct
|
20
|
+
from . import text as text
|
21
|
+
from . import tiff as tiff
|
22
|
+
from .base import Expr, ExprLike, NativeExpr
|
23
|
+
|
24
|
+
# Names exported by `from spiral.expressions import *`.
# NOTE(review): `cast` and `key` are defined in this module but are not listed here —
# confirm whether that omission is intentional.
__all__ = [
    "Expr",
    "add",
    "and_",
    "deref",
    "divide",
    "eq",
    "getitem",
    "gt",
    "gte",
    "http",
    "io",
    "is_not_null",
    "is_null",
    "lift",
    "list",
    "lt",
    "lte",
    "merge",
    "modulo",
    "multiply",
    "negate",
    "neq",
    "not_",
    "or_",
    "pack",
    "keyed",
    "ref",
    "refs",
    "scalar",
    "select",
    "str",
    "struct",
    "subtract",
    "tiff",
    "xor",
    "png",
    "qoi",
    "mp4",
    "text",
]
|
65
|
+
|
66
|
+
# Inline some of the struct expressions since they're so common.
# These are plain aliases: each name is bound to the function object from the
# `struct` / `refs` submodule so callers can use it from `spiral.expressions` directly.
getitem = struct.getitem
merge = struct.merge
pack = struct.pack
select = struct.select
ref = refs.ref
deref = refs.deref
|
73
|
+
|
74
|
+
|
75
|
+
def lift(expr: ExprLike) -> Expr:
    """Convert an `ExprLike` value into an `Expr`.

    Inputs are handled in this order:

    - `Expr`: returned unchanged.
    - `NativeExpr`: wrapped in an `Expr`.
    - `dict`: assumed to be a struct expression. Dot-separated keys are nested via
      `arrow_.nest_structs`, each value is lifted recursively, and the result is packed.
    - `list`: converted with `pa.array` and lifted again.
    - `pa.Table` / `pa.ChunkedArray`: converted to a single array first.
    - `pa.StructArray` / `pa.StructScalar`: passed through `arrow_.nest_structs` and lifted again.
      A warning is emitted if the struct array contains nulls or the struct scalar is null.
    - any other `pa.Array`: turned into an array literal.
    - anything else: treated as a scalar (see `scalar`).

    Args:
        expr: The value to convert.

    Returns:
        The expression corresponding to `expr`.
    """
    if isinstance(expr, Expr):
        return expr
    if isinstance(expr, NativeExpr):
        return Expr(expr)

    if isinstance(expr, dict):
        # NOTE: we assume this is a struct expression. We could be smarter and be context aware to determine if
        # this is in fact a struct scalar, but the user can always create one of those manually.

        # First we un-nest any dot-separated field names
        nested = arrow_.nest_structs(expr)

        return pack({k: lift(v) for k, v in nested.items()})

    if isinstance(expr, builtins.list):
        return lift(pa.array(expr))

    # Unpack tables and chunked arrays
    if isinstance(expr, pa.Table):
        expr = expr.to_struct_array()
    if isinstance(expr, pa.ChunkedArray):
        expr = expr.combine_chunks()

    # If the value is struct-like, we un-nest any dot-separated field names
    if isinstance(expr, pa.StructArray | pa.StructScalar):
        # TODO(marko): Figure out what to do with nullable struct arrays when unpacking them.
        #   We need to merge struct validity into the child validity?
        if isinstance(expr, pa.StructArray) and expr.null_count != 0:
            warnings.warn("found a struct array with nulls", stacklevel=2)
        # `Scalar.is_valid` is a property (unlike `Array.is_valid()`), so it must not be called.
        if isinstance(expr, pa.StructScalar) and not expr.is_valid:
            warnings.warn("found a struct scalar with nulls", stacklevel=2)
        return lift(arrow_.nest_structs(expr))

    if isinstance(expr, pa.Array):
        return Expr(_lib.expr.array_lit(expr))

    # Otherwise, assume it's a scalar.
    return scalar(expr)
|
117
|
+
|
118
|
+
|
119
|
+
def key(name: builtins.str) -> Expr:
    """Create a variable expression referencing a key column.

    NOTE(review): this calls `_lib.expr.keyed` with only `name`, while `keyed` in this
    module passes `(name, dtype)` to the same native function — confirm the native
    binding accepts the single-argument form.

    Args:
        name: variable name
    """
    native = _lib.expr.keyed(name)
    return Expr(native)
|
126
|
+
|
127
|
+
|
128
|
+
def keyed(name: builtins.str, dtype: pa.DataType) -> Expr:
    """Build a variable expression that refers to a column of the key table.

    The key table is the optional table handed to `Scan#to_record_batches` when reading
    only specific keys or doing cell pushdown.

    Args:
        name: variable name
        dtype: must match dtype of the column in the key table.
    """
    native = _lib.expr.keyed(name, dtype)
    return Expr(native)
|
139
|
+
|
140
|
+
|
141
|
+
def scalar(value: Any) -> Expr:
    """Create a scalar expression.

    Args:
        value: A `pa.Scalar`, or any Python value accepted by `pa.scalar`.
    """
    arrow_value = value if isinstance(value, pa.Scalar) else pa.scalar(value)
    # TODO(marko): Use Vortex scalar instead of passing as array.
    single_element = pa.array([arrow_value.as_py()], type=arrow_value.type)
    return Expr(_lib.expr.scalar(single_element))
|
147
|
+
|
148
|
+
|
149
|
+
def cast(expr: ExprLike, dtype: pa.DataType) -> Expr:
    """Cast an expression into another PyArrow DataType.

    Args:
        expr: The expression (or liftable value) to cast.
        dtype: The target PyArrow data type.
    """
    lifted = lift(expr)
    native = _lib.expr.cast(lifted.__expr__, dtype)
    return Expr(native)
|
153
|
+
|
154
|
+
|
155
|
+
def and_(expr: ExprLike, *exprs: ExprLike) -> Expr:
    """Create a conjunction of one or more expressions.

    Every argument is lifted, then the results are combined left-to-right with `&`,
    starting from `expr`.
    """
    remaining = [lift(e) for e in exprs]
    combined = lift(expr)
    for term in remaining:
        combined = combined & term
    return combined
|
159
|
+
|
160
|
+
|
161
|
+
def or_(expr: ExprLike, *exprs: ExprLike) -> Expr:
    """Create a disjunction of one or more expressions.

    Every argument is lifted, then the results are combined left-to-right with `|`,
    starting from `expr`.
    """
    remaining = [lift(e) for e in exprs]
    combined = lift(expr)
    for term in remaining:
        combined = combined | term
    return combined
|
164
|
+
|
165
|
+
|
166
|
+
def eq(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create an equality comparison (`lhs == rhs`); `lhs` is lifted first."""
    left = lift(lhs)
    return left == rhs
|
169
|
+
|
170
|
+
|
171
|
+
def neq(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a not-equal comparison (`lhs != rhs`); `lhs` is lifted first."""
    left = lift(lhs)
    return left != rhs
|
174
|
+
|
175
|
+
|
176
|
+
def xor(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a XOR comparison (`lhs ^ rhs`); `lhs` is lifted first.

    NOTE(review): this delegates to the `^` operator of the lifted expression, which
    may not be implemented by `Expr` — verify before relying on it.
    """
    left = lift(lhs)
    return left ^ rhs
|
179
|
+
|
180
|
+
|
181
|
+
def lt(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a less-than comparison (`lhs < rhs`); `lhs` is lifted first."""
    left = lift(lhs)
    return left < rhs
|
184
|
+
|
185
|
+
|
186
|
+
def lte(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a less-than-or-equal comparison (`lhs <= rhs`); `lhs` is lifted first."""
    left = lift(lhs)
    return left <= rhs
|
189
|
+
|
190
|
+
|
191
|
+
def gt(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a greater-than comparison (`lhs > rhs`); `lhs` is lifted first."""
    left = lift(lhs)
    return left > rhs
|
194
|
+
|
195
|
+
|
196
|
+
def gte(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a greater-than-or-equal comparison (`lhs >= rhs`); `lhs` is lifted first."""
    left = lift(lhs)
    return left >= rhs
|
199
|
+
|
200
|
+
|
201
|
+
def negate(expr: ExprLike) -> Expr:
    """Negate the given expression (unary minus applied to the lifted expression)."""
    operand = lift(expr)
    return -operand
|
204
|
+
|
205
|
+
|
206
|
+
def not_(expr: ExprLike) -> Expr:
    """Apply the native `not_` operation to the given expression.

    Unlike `negate`, which uses unary minus, this goes through `_lib.expr.not_`.
    """
    operand = lift(expr)
    native = _lib.expr.not_(operand.__expr__)
    return Expr(native)
|
210
|
+
|
211
|
+
|
212
|
+
def is_null(expr: ExprLike) -> Expr:
    """Check if the given expression is null."""
    operand = lift(expr)
    native = _lib.expr.is_null(operand.__expr__)
    return Expr(native)
|
216
|
+
|
217
|
+
|
218
|
+
def is_not_null(expr: ExprLike) -> Expr:
    """Check if the given expression is not null (the `not_` of `is_null`)."""
    null_check = is_null(expr)
    return not_(null_check)
|
221
|
+
|
222
|
+
|
223
|
+
def add(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Add two expressions (`lhs + rhs`); `lhs` is lifted first."""
    left = lift(lhs)
    return left + rhs
|
226
|
+
|
227
|
+
|
228
|
+
def subtract(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Subtract two expressions (`lhs - rhs`); `lhs` is lifted first."""
    left = lift(lhs)
    return left - rhs
|
231
|
+
|
232
|
+
|
233
|
+
def multiply(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Multiply two expressions (`lhs * rhs`); `lhs` is lifted first."""
    left = lift(lhs)
    return left * rhs
|
236
|
+
|
237
|
+
|
238
|
+
def divide(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Divide two expressions (`lhs / rhs`); `lhs` is lifted first."""
    left = lift(lhs)
    return left / rhs
|
241
|
+
|
242
|
+
|
243
|
+
def modulo(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Modulo two expressions (`lhs % rhs`); `lhs` is lifted first."""
    left = lift(lhs)
    return left % rhs
|
@@ -0,0 +1,149 @@
|
|
1
|
+
import datetime
|
2
|
+
from typing import TypeAlias
|
3
|
+
|
4
|
+
import pyarrow as pa
|
5
|
+
|
6
|
+
from spiral import _lib
|
7
|
+
|
8
|
+
# Alias for the expression type exposed by the `_lib` extension module; `Expr` wraps it.
NativeExpr: TypeAlias = _lib.expr.Expr
|
9
|
+
|
10
|
+
|
11
|
+
class Expr:
    """Base class for Spiral expressions. All expressions support comparison and basic arithmetic operations.

    An `Expr` wraps a native expression. The Python operators defined here return new
    `Expr` objects. Because `__eq__` is overridden without defining `__hash__`,
    instances are not hashable.
    """

    def __init__(self, native: NativeExpr) -> None:
        """Wrap a native expression.

        Raises:
            TypeError: If `native` is not a `NativeExpr`.
        """
        if not isinstance(native, NativeExpr):
            raise TypeError(f"Expected a native expression, got {type(native)}")
        self._native = native

    @property
    def __expr__(self) -> NativeExpr:
        """The wrapped native expression."""
        return self._native

    def __str__(self):
        return str(self.__expr__)

    def __repr__(self):
        return repr(self.__expr__)

    def __getitem__(self, item: str | int) -> "Expr":
        """
        Get an item from a struct or list.

        Args:
            item: The key or index to get.
                If item is a string, it is assumed to be a field in a struct. Dot-separated string is supported
                to access nested fields. If item is an integer, it is assumed to be an index in a list.
        """
        from spiral import expressions as se

        expr = self

        if isinstance(item, int):
            # Assume list and get an element.
            expr = se.list_.element_at(expr, item)
        else:
            # Walk into the struct.
            for part in item.split("."):
                expr = se.getitem(expr, part)

        return expr

    def __eq__(self, other: "ExprLike") -> "Expr":
        return self._binary("eq", other)

    def __ne__(self, other: "ExprLike") -> "Expr":
        return self._binary("neq", other)

    def __lt__(self, other: "ExprLike") -> "Expr":
        return self._binary("lt", other)

    def __le__(self, other: "ExprLike") -> "Expr":
        return self._binary("lte", other)

    def __gt__(self, other: "ExprLike") -> "Expr":
        return self._binary("gt", other)

    def __ge__(self, other: "ExprLike") -> "Expr":
        return self._binary("gte", other)

    def __and__(self, other: "ExprLike") -> "Expr":
        return self._binary("and", other)

    def __or__(self, other: "ExprLike") -> "Expr":
        return self._binary("or", other)

    def __xor__(self, other: "ExprLike") -> "Expr":
        # XOR is not supported yet.
        raise NotImplementedError

    def __add__(self, other: "ExprLike") -> "Expr":
        return self._binary("add", other)

    def __sub__(self, other: "ExprLike") -> "Expr":
        return self._binary("sub", other)

    def __mul__(self, other: "ExprLike") -> "Expr":
        return self._binary("mul", other)

    def __truediv__(self, other: "ExprLike") -> "Expr":
        return self._binary("div", other)

    def __mod__(self, other: "ExprLike") -> "Expr":
        return self._binary("mod", other)

    def __neg__(self):
        return Expr(_lib.expr.unary("neg", self.__expr__))

    def in_(self, other: "ExprLike") -> "Expr":
        """Build a list-contains expression with `other` as the list and this expression as the value."""
        from spiral import expressions as se

        other = se.lift(other)
        return Expr(_lib.expr.list.contains(other.__expr__, self.__expr__))

    def contains(self, other: "ExprLike") -> "Expr":
        """Build a list-contains expression with this expression as the list and `other` as the value."""
        from spiral import expressions as se

        return se.lift(other).in_(self)

    def cast(self, dtype: pa.DataType) -> "Expr":
        """Cast the expression result to a different data type."""
        return Expr(_lib.expr.cast(self.__expr__, dtype))

    def select(self, *paths: str, exclude: list[str] = None) -> "Expr":
        """Select fields from a struct-like expression.

        Args:
            *paths: Field names to select. If a path contains a dot, it is assumed to be a nested struct field.
                Several nested paths under the same parent (e.g. `"a.b"` and `"a.c"`) are combined into a
                single nested selection of that parent.
            exclude: List of field names to exclude from result.
        """
        from spiral import expressions as se

        # If any of the paths contain nested fields, then we re-pack nested select statements.
        if any("." in p for p in paths):
            # Map each top-level field to the nested children requested under it, or to None when
            # the whole field is requested. Children sharing a parent are accumulated rather than
            # overwritten, so that select("a.b", "a.c") keeps both "b" and "c".
            selections: dict[str, list[str] | None] = {}
            for p in paths:
                if "." in p:
                    parent, child = p.split(".", 1)
                    children = selections.get(parent)
                    if children is None:
                        selections[parent] = [child]
                    elif child not in children:
                        children.append(child)
                else:
                    selections[p] = None

            fields = {
                name: self[name] if children is None else self[name].select(*children)
                for name, children in selections.items()
            }
            packed = se.pack(fields)
            if exclude:
                # NOTE(review): this call passes no paths, so it hits the `not paths` early return
                # below and `exclude` has no effect here — confirm the intended behavior.
                packed = packed.select(exclude=exclude)
            return packed

        if not paths:
            # NOTE(review): `exclude` is ignored when no paths are given — confirm this is intended.
            return self

        return se.select(self, names=list(paths), exclude=exclude)

    def _binary(self, op: str, rhs: "ExprLike") -> "Expr":
        """Create a binary expression `self <op> rhs`; `rhs` is lifted to an `Expr` first."""
        from spiral import expressions as se

        rhs = se.lift(rhs)
        return Expr(_lib.expr.binary(op, self.__expr__, rhs.__expr__))
|
146
|
+
|
147
|
+
|
148
|
+
# Plain Python values that are accepted where a scalar is expected.
ScalarLike: TypeAlias = bool | int | float | str | list | datetime.datetime | None
# Anything accepted where an expression is expected: an `Expr`, a dict, or a scalar-like value.
ExprLike: TypeAlias = Expr | dict | ScalarLike
|
@@ -0,0 +1,86 @@
|
|
1
|
+
import hishel
|
2
|
+
import httpx
|
3
|
+
import pyarrow as pa
|
4
|
+
|
5
|
+
from spiral.expressions.base import Expr, ExprLike
|
6
|
+
from spiral.expressions.struct import pack
|
7
|
+
from spiral.expressions.udf import UDF
|
8
|
+
from spiral.settings import APP_DIR
|
9
|
+
|
10
|
+
|
11
|
+
def get(url: ExprLike, headers: ExprLike = None, force_cache: bool = False) -> Expr:
    """Submit a GET request to either a scalar or vector of URLs.

    Args:
        url: Expression (or liftable value) providing the URL(s).
        headers: Optional request headers; only packed into the request when not None.
        force_cache: Forwarded to `HttpGet`.
    """
    request_fields = {"url": url}
    if headers is not None:
        request_fields["headers"] = headers
    udf = HttpGet(force_cache)
    return udf(pack(request_fields))
|
17
|
+
|
18
|
+
|
19
|
+
class HttpGet(UDF):
    """UDF registered as "http.get" that performs HTTP requests for a struct array of requests."""

    # Result type: response body, status code and response headers.
    RES_DTYPE: pa.DataType = pa.struct(
        [
            pa.field("bytes", pa.large_binary()),
            pa.field("status", pa.int32()),
            pa.field("headers", pa.map_(pa.string(), pa.string())),
        ]
    )

    def __init__(self, force_cache: bool = False):
        super().__init__("http.get")
        self._force_cache = force_cache

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        """Return the fixed result type, independent of the input types."""
        return HttpGet.RES_DTYPE

    def invoke(self, *input_args: pa.Array) -> pa.Array:
        """Run the requests described by the single input array.

        Raises:
            ValueError: If the number of arguments is not exactly one.
        """
        arg_count = len(input_args)
        if arg_count != 1:
            raise ValueError(f"Expected 1 argument, got {arg_count}")
        responses = _http_request(input_args[0], self._force_cache)
        # Always hand back a contiguous array.
        return responses.combine_chunks() if isinstance(responses, pa.ChunkedArray) else responses
|
42
|
+
|
43
|
+
|
44
|
+
def _http_request(arg: pa.Array, force_cache: bool) -> pa.Array:
    """Issue one HTTP request per row of `arg` and collect the responses.

    Args:
        arg: Struct array of requests. Each row must have a "url"; "method" (default "GET")
            and "headers" (default empty) are optional.
        force_cache: Passed to the client as the "force_cache" request extension.

    Raises:
        TypeError: If `arg` is not a struct array.
    """
    client = _HttpClient()

    if not isinstance(arg, pa.StructArray):
        raise TypeError(f"Unsupported argument: {arg} ({type(arg)})")

    # We assume a vector of requests, but with potentially many arguments
    responses = []
    for request in arg.to_pylist():
        method = request.get("method", "GET").upper()
        response = client.request(
            method,
            request["url"],
            headers=request.get("headers", {}),
            extensions={"force_cache": force_cache},
        )
        responses.append(_response_dict(response))
    return pa.array(responses, type=HttpGet.RES_DTYPE)
|
65
|
+
|
66
|
+
|
67
|
+
def _response_dict(response: httpx.Response) -> dict:
|
68
|
+
if response.status_code != 200:
|
69
|
+
raise ValueError(f"Request failed with status {response.status_code}")
|
70
|
+
return {
|
71
|
+
"bytes": response.read(),
|
72
|
+
"status": response.status_code,
|
73
|
+
"headers": dict(response.headers),
|
74
|
+
}
|
75
|
+
|
76
|
+
|
77
|
+
class _HttpClient(hishel.CacheClient):
    """Shared caching HTTP client.

    `__new__` always returns the same instance, so every `_HttpClient()` call yields the
    one shared client. Responses are cached on disk under `APP_DIR / "http.cache"` with a
    TTL of 3600.
    """

    _instance: "_HttpClient" = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # Python runs __init__ on whatever __new__ returns, i.e. on the shared instance for
        # every `_HttpClient()` call. Initialise only once so the storage and underlying
        # client are not rebuilt (and the previous ones abandoned) on each call.
        if getattr(self, "_spiral_initialized", False):
            return
        super().__init__(storage=hishel.FileStorage(base_path=APP_DIR / "http.cache", ttl=3600))
        self._spiral_initialized = True
|
spiral/expressions/io.py
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
import tarfile
|
2
|
+
from io import BytesIO
|
3
|
+
|
4
|
+
import pyarrow as pa
|
5
|
+
|
6
|
+
from spiral.expressions.base import Expr, ExprLike
|
7
|
+
from spiral.expressions.struct import pack
|
8
|
+
from spiral.expressions.udf import UDF
|
9
|
+
|
10
|
+
|
11
|
+
def read_file(path: ExprLike) -> Expr:
    """
    Read file path(s) from disk into a struct with a single field "bytes" containing the file contents.

    Args:
        path: Expression evaluating to an array of strings representing local disk paths.
    """
    udf = FileRead()
    request = pack({"path": path})
    return udf(request)
|
20
|
+
|
21
|
+
|
22
|
+
class FileRead(UDF):
    """UDF registered as "file.read" that reads each row's "path" from local disk."""

    # Result type: a struct holding the file contents.
    RES_DTYPE: pa.DataType = pa.struct(
        [
            pa.field("bytes", pa.large_binary()),
        ]
    )

    def __init__(self):
        super().__init__("file.read")

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        """Return the fixed result type, independent of the input types."""
        return FileRead.RES_DTYPE

    def invoke(self, *input_args: pa.Array) -> pa.Array:
        """Read the file named by each row's "path" field.

        Raises:
            ValueError: If the number of arguments is not exactly one.
        """
        arg_count = len(input_args)
        if arg_count != 1:
            raise ValueError(f"Expected 1 argument, got {arg_count}")

        contents = []
        for row in input_args[0]:
            file_path = row["path"].as_py()
            with open(file_path, "rb") as handle:
                contents.append({"bytes": handle.read()})

        return pa.array(contents, type=FileRead.RES_DTYPE)
|
46
|
+
|
47
|
+
|
48
|
+
def read_tar(path: ExprLike = None, bytes_: ExprLike = None) -> "Expr":
    """Untar a vector of paths / byte arrays representing tarballs.

    Args:
        path: Optional expression providing tarball paths.
        bytes_: Optional expression providing tarball contents.

    Raises:
        ValueError: If neither `path` nor `bytes_` is provided.
    """
    candidates = {"path": path, "bytes": bytes_}
    to_pack = {field: value for field, value in candidates.items() if value is not None}
    if not to_pack:
        raise ValueError("Expected either path or bytes_ to be provided")
    udf = TarRead()
    return udf(pack(to_pack))
|
58
|
+
|
59
|
+
|
60
|
+
class TarRead(UDF):
    """UDF registered as "tar.read" that lists the files contained in each tarball.

    Each input row provides either a "path" to a tarball or its "bytes"; each output row
    is a list of `{"name", "bytes"}` structs, one per extracted member.
    """

    RES_DTYPE = pa.list_(
        pa.struct(
            [
                pa.field("name", pa.string()),
                pa.field("bytes", pa.large_binary()),
            ]
        )
    )

    def __init__(self):
        super().__init__("tar.read")

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        """Return the fixed result type, independent of the input types."""
        return TarRead.RES_DTYPE

    def invoke(self, *input_args: pa.Array) -> pa.Array:
        """Read every tarball described by the single input array.

        Directories are skipped, as are members for which `TarFile.extractfile` returns
        `None` (members that are neither regular files nor links).

        Raises:
            ValueError: If the number of arguments is not exactly one, or a row has
                neither "path" nor "bytes".
        """
        if len(input_args) != 1:
            raise ValueError(f"Expected 1 argument, got {len(input_args)}")
        arg = input_args[0]

        res = []
        for req in arg:
            if "path" in req:
                kwargs = {"name": req["path"].as_py()}
            elif "bytes" in req:
                kwargs = {"fileobj": BytesIO(req["bytes"].as_py())}
            else:
                raise ValueError("Expected path or bytes_ to be provided")

            files = []
            with tarfile.open(**kwargs) as f:
                for m in f.getmembers():
                    if m.type == tarfile.DIRTYPE:
                        continue
                    # TODO(ngates): skip other types too maybe? Why are we even skipping directories?
                    member_file = f.extractfile(m)
                    if member_file is None:
                        # extractfile() yields no file object for this member, so there is nothing to read.
                        continue
                    with member_file:
                        files.append({"name": m.name, "bytes": member_file.read()})
            res.append(files)

        return pa.array(res, type=TarRead.RES_DTYPE)
|
@@ -0,0 +1,68 @@
|
|
1
|
+
from spiral.expressions.base import Expr, ExprLike
|
2
|
+
|
3
|
+
|
4
|
+
def in_(expr: ExprLike, values: ExprLike) -> Expr:
    """Check if a value is in a list.

    e.g. `se.list.in_(Array[2, 4], Array[[1, 2], [1, 2]]) -> Array[True, False]`

    Args:
        expr: The value to check.
        values: The list array expression to check against.
    """
    from spiral.expressions import lift

    needle = lift(expr)
    return needle.in_(values)
|
16
|
+
|
17
|
+
|
18
|
+
def element_at(expr: ExprLike, index: ExprLike) -> Expr:
    """Get the element at the given index.

    e.g. `se.list.element_at([1, 2, 3], 1) -> 2`

    Args:
        expr: The list array expression.
        index: The index to get.
    """
    from spiral import _lib
    from spiral.expressions import lift

    list_expr = lift(expr)
    index_expr = lift(index)
    native = _lib.expr.list.element_at(list_expr.__expr__, index_expr.__expr__)
    return Expr(native)
|
33
|
+
|
34
|
+
|
35
|
+
def of(*expr: ExprLike) -> Expr:
    """Create an array or scalar list from a series of expressions.

    All values must be of the same type. The expressions must all also have the same
    length (1 for scalars).

    e.g. `se.list.of(1+3, 2, 3) -> [4, 2, 3]`

    NOTE: placeholder only; there is no implementation yet, so calling this returns None.
    """
    return None
|
41
|
+
|
42
|
+
|
43
|
+
def zip(*lists: ExprLike) -> Expr:
    """Merge the given lists, with duplicates.

    e.g. `se.list.merge([1, 2], [3, 4]) -> [(1, 2), (3, 4)]`

    NOTE: placeholder only; there is no implementation yet, so calling this returns None.
    """
    return None
|
48
|
+
|
49
|
+
|
50
|
+
def concat(*lists: ExprLike) -> Expr:
    """Concatenate the given lists. The types of all the lists must be the same.

    e.g. `se.list.concat([1, 2], [3, 4]) -> [1, 2, 3, 4]`

    NOTE: placeholder only; there is no implementation yet, so calling this returns None.
    """
    return None
|
55
|
+
|
56
|
+
|
57
|
+
def slice_(expr: ExprLike, start: int | None = None, stop: int | None = None) -> Expr:
    """Slice a list.

    e.g. `se.list.slice_([0, 1, 2], slice(0,2)) -> [0, 1]`

    NOTE: placeholder only; there is no implementation yet, so calling this returns None.
    """
    return None
|
62
|
+
|
63
|
+
|
64
|
+
def length(expr: ExprLike) -> Expr:
    """Get the length of a list.

    e.g. `se.list.length([1, 2, 3]) -> 3`

    NOTE: placeholder only; there is no implementation yet, so calling this returns None.
    """
    return None
|