pyspiral 0.4.0__pp310-pypy310_pp73-macosx_10_12_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. pyspiral-0.4.0.dist-info/METADATA +46 -0
  2. pyspiral-0.4.0.dist-info/RECORD +98 -0
  3. pyspiral-0.4.0.dist-info/WHEEL +4 -0
  4. pyspiral-0.4.0.dist-info/entry_points.txt +2 -0
  5. spiral/__init__.py +10 -0
  6. spiral/_lib.pypy310-pp73-darwin.so +0 -0
  7. spiral/adbc.py +393 -0
  8. spiral/api/__init__.py +64 -0
  9. spiral/api/admin.py +15 -0
  10. spiral/api/client.py +160 -0
  11. spiral/api/filesystems.py +153 -0
  12. spiral/api/organizations.py +77 -0
  13. spiral/api/projects.py +197 -0
  14. spiral/api/telemetry.py +19 -0
  15. spiral/api/types.py +20 -0
  16. spiral/api/workloads.py +52 -0
  17. spiral/arrow_.py +221 -0
  18. spiral/cli/__init__.py +79 -0
  19. spiral/cli/__main__.py +4 -0
  20. spiral/cli/admin.py +16 -0
  21. spiral/cli/app.py +65 -0
  22. spiral/cli/console.py +95 -0
  23. spiral/cli/fs.py +112 -0
  24. spiral/cli/iceberg/__init__.py +7 -0
  25. spiral/cli/iceberg/namespaces.py +47 -0
  26. spiral/cli/iceberg/tables.py +60 -0
  27. spiral/cli/indexes/__init__.py +19 -0
  28. spiral/cli/login.py +22 -0
  29. spiral/cli/orgs.py +90 -0
  30. spiral/cli/printer.py +53 -0
  31. spiral/cli/projects.py +136 -0
  32. spiral/cli/state.py +5 -0
  33. spiral/cli/tables/__init__.py +121 -0
  34. spiral/cli/telemetry.py +18 -0
  35. spiral/cli/types.py +51 -0
  36. spiral/cli/workloads.py +59 -0
  37. spiral/client.py +79 -0
  38. spiral/core/__init__.pyi +0 -0
  39. spiral/core/client/__init__.pyi +117 -0
  40. spiral/core/index/__init__.pyi +15 -0
  41. spiral/core/table/__init__.pyi +108 -0
  42. spiral/core/table/manifests/__init__.pyi +35 -0
  43. spiral/core/table/metastore/__init__.pyi +62 -0
  44. spiral/core/table/spec/__init__.pyi +214 -0
  45. spiral/datetime_.py +27 -0
  46. spiral/expressions/__init__.py +245 -0
  47. spiral/expressions/base.py +149 -0
  48. spiral/expressions/http.py +86 -0
  49. spiral/expressions/io.py +100 -0
  50. spiral/expressions/list_.py +68 -0
  51. spiral/expressions/mp4.py +62 -0
  52. spiral/expressions/png.py +18 -0
  53. spiral/expressions/qoi.py +18 -0
  54. spiral/expressions/refs.py +58 -0
  55. spiral/expressions/str_.py +39 -0
  56. spiral/expressions/struct.py +59 -0
  57. spiral/expressions/text.py +62 -0
  58. spiral/expressions/tiff.py +223 -0
  59. spiral/expressions/udf.py +46 -0
  60. spiral/grpc_.py +32 -0
  61. spiral/iceberg/__init__.py +3 -0
  62. spiral/iceberg/client.py +33 -0
  63. spiral/indexes/__init__.py +5 -0
  64. spiral/indexes/client.py +137 -0
  65. spiral/indexes/index.py +34 -0
  66. spiral/indexes/scan.py +22 -0
  67. spiral/project.py +46 -0
  68. spiral/protogen/_/__init__.py +0 -0
  69. spiral/protogen/_/arrow/__init__.py +0 -0
  70. spiral/protogen/_/arrow/flight/__init__.py +0 -0
  71. spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
  72. spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +1990 -0
  73. spiral/protogen/_/scandal/__init__.py +178 -0
  74. spiral/protogen/_/spiral/__init__.py +0 -0
  75. spiral/protogen/_/spiral/table/__init__.py +22 -0
  76. spiral/protogen/_/substrait/__init__.py +3399 -0
  77. spiral/protogen/_/substrait/extensions/__init__.py +115 -0
  78. spiral/protogen/__init__.py +0 -0
  79. spiral/protogen/substrait/__init__.py +3399 -0
  80. spiral/protogen/substrait/extensions/__init__.py +115 -0
  81. spiral/protogen/util.py +41 -0
  82. spiral/py.typed +0 -0
  83. spiral/server.py +17 -0
  84. spiral/settings.py +101 -0
  85. spiral/substrait_.py +279 -0
  86. spiral/tables/__init__.py +12 -0
  87. spiral/tables/client.py +130 -0
  88. spiral/tables/dataset.py +250 -0
  89. spiral/tables/debug/__init__.py +0 -0
  90. spiral/tables/debug/manifests.py +70 -0
  91. spiral/tables/debug/metrics.py +56 -0
  92. spiral/tables/debug/scan.py +248 -0
  93. spiral/tables/maintenance.py +12 -0
  94. spiral/tables/scan.py +193 -0
  95. spiral/tables/snapshot.py +78 -0
  96. spiral/tables/table.py +157 -0
  97. spiral/tables/transaction.py +52 -0
  98. spiral/types_.py +6 -0
@@ -0,0 +1,245 @@
1
+ import builtins
2
+ import functools
3
+ import operator
4
+ import warnings
5
+ from typing import Any
6
+
7
+ import pyarrow as pa
8
+
9
+ from spiral import _lib, arrow_
10
+
11
+ from . import http as http
12
+ from . import io as io
13
+ from . import list_ as list
14
+ from . import mp4 as mp4
15
+ from . import png as png
16
+ from . import qoi as qoi
17
+ from . import refs as refs
18
+ from . import str_ as str
19
+ from . import struct as struct
20
+ from . import text as text
21
+ from . import tiff as tiff
22
+ from .base import Expr, ExprLike, NativeExpr
23
+
24
# Public API of this package, in alphabetical order.
# ``cast`` and ``key`` are documented public helpers defined in this module, so they are exported
# together with the other expression constructors.
__all__ = [
    "Expr",
    "add",
    "and_",
    "cast",
    "deref",
    "divide",
    "eq",
    "getitem",
    "gt",
    "gte",
    "http",
    "io",
    "is_not_null",
    "is_null",
    "key",
    "keyed",
    "lift",
    "list",
    "lt",
    "lte",
    "merge",
    "modulo",
    "mp4",
    "multiply",
    "negate",
    "neq",
    "not_",
    "or_",
    "pack",
    "png",
    "qoi",
    "ref",
    "refs",
    "scalar",
    "select",
    "str",
    "struct",
    "subtract",
    "text",
    "tiff",
    "xor",
]
65
+
66
# Re-export the most commonly used struct and reference helpers at package level.
getitem, merge, pack, select = struct.getitem, struct.merge, struct.pack, struct.select
ref, deref = refs.ref, refs.deref
73
+
74
+
75
def lift(expr: ExprLike) -> Expr:
    """Convert an ``ExprLike`` value into an ``Expr``.

    The following rules are applied in order:

    * an ``Expr`` is returned unchanged and a native expression is wrapped in ``Expr``;
    * a ``dict`` is assumed to be a struct expression: dot-separated field names are nested with
      ``arrow_.nest_structs`` and every value is lifted recursively before being packed;
    * a Python ``list`` is converted with ``pa.array`` and lifted again;
    * a ``pa.Table`` is turned into a struct array and a ``pa.ChunkedArray`` has its chunks combined;
    * a struct array / struct scalar is passed through ``arrow_.nest_structs`` and the result is lifted
      again (a warning is emitted when the struct array has nulls or the struct scalar is not valid);
    * any other ``pa.Array`` becomes an array literal;
    * everything else is treated as a scalar.

    Args:
        expr: the value to convert.

    Returns:
        The resulting ``Expr``.
    """
    if isinstance(expr, Expr):
        return expr
    if isinstance(expr, NativeExpr):
        return Expr(expr)

    if isinstance(expr, dict):
        # NOTE: we assume this is a struct expression. We could be smarter and be context aware to determine if
        # this is in fact a struct scalar, but the user can always create one of those manually.

        # First we un-nest any dot-separated field names
        nested: dict = arrow_.nest_structs(expr)

        return pack({k: lift(v) for k, v in nested.items()})

    if isinstance(expr, builtins.list):
        return lift(pa.array(expr))

    # Unpack tables and chunked arrays
    if isinstance(expr, pa.Table):
        expr = expr.to_struct_array()
    if isinstance(expr, pa.ChunkedArray):
        expr = expr.combine_chunks()

    # If the value is struct-like, we un-nest any dot-separated field names
    if isinstance(expr, pa.StructArray | pa.StructScalar):
        # TODO(marko): Figure out what to do with nullable struct arrays when unpacking them.
        #   We need to merge struct validity into the child validity?
        if isinstance(expr, pa.StructArray) and expr.null_count != 0:
            warnings.warn("found a struct array with nulls", stacklevel=2)
        # ``Scalar.is_valid`` is a property in pyarrow (unlike the ``Array.is_valid()`` method), so it must
        # be read, not called; calling it raises ``TypeError: 'bool' object is not callable``.
        if isinstance(expr, pa.StructScalar) and not expr.is_valid:
            warnings.warn("found a struct scalar with nulls", stacklevel=2)
        return lift(arrow_.nest_structs(expr))

    if isinstance(expr, pa.Array):
        return Expr(_lib.expr.array_lit(expr))

    # Otherwise, assume it's a scalar.
    return scalar(expr)
117
+
118
+
119
def key(name: builtins.str) -> Expr:
    """Create a variable expression referencing a key column.

    Args:
        name: variable name
    """
    # NOTE(review): the native ``keyed`` function is called with the name only here, while ``keyed`` in this
    # module also passes a dtype -- confirm the native function accepts both forms.
    native = _lib.expr.keyed(name)
    return Expr(native)
126
+
127
+
128
def keyed(name: builtins.str, dtype: pa.DataType) -> Expr:
    """Create a variable expression referencing a column in the key table.

    Key table is optionally given to `Scan#to_record_batches` function when reading only specific keys
    or doing cell pushdown.

    Args:
        name: variable name
        dtype: must match dtype of the column in the key table.

    Returns:
        An ``Expr`` wrapping the native keyed expression.
    """
    native = _lib.expr.keyed(name, dtype)
    return Expr(native)
139
+
140
+
141
def scalar(value: Any) -> Expr:
    """Create a scalar expression.

    Args:
        value: a ``pa.Scalar``, or any value that ``pa.scalar`` can convert into one.
    """
    arrow_scalar = value if isinstance(value, pa.Scalar) else pa.scalar(value)
    # TODO(marko): Use Vortex scalar instead of passing as array.
    # The scalar is handed to the native layer as a one-element array of the scalar's type.
    single = pa.array([arrow_scalar.as_py()], type=arrow_scalar.type)
    return Expr(_lib.expr.scalar(single))
147
+
148
+
149
def cast(expr: ExprLike, dtype: pa.DataType) -> Expr:
    """Cast an expression into another PyArrow DataType.

    Args:
        expr: the expression (or liftable value) to cast.
        dtype: the target PyArrow data type.
    """
    native = lift(expr).__expr__
    return Expr(_lib.expr.cast(native, dtype))
153
+
154
+
155
def and_(expr: ExprLike, *exprs: ExprLike) -> Expr:
    """Create a conjunction of one or more expressions.

    Args:
        expr: the first operand.
        *exprs: further operands, combined left to right with ``&``.
    """
    remaining = [lift(e) for e in exprs]
    combined = lift(expr)
    for term in remaining:
        combined = combined & term
    return combined
159
+
160
+
161
def or_(expr: ExprLike, *exprs: ExprLike) -> Expr:
    """Create a disjunction of one or more expressions.

    Args:
        expr: the first operand.
        *exprs: further operands, combined left to right with ``|``.
    """
    remaining = [lift(e) for e in exprs]
    combined = lift(expr)
    for term in remaining:
        combined = combined | term
    return combined
164
+
165
+
166
def eq(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create an equality comparison (``lhs == rhs``); ``lhs`` is lifted first."""
    left = lift(lhs)
    return left == rhs
169
+
170
+
171
def neq(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a not-equal comparison (``lhs != rhs``); ``lhs`` is lifted first."""
    left = lift(lhs)
    return left != rhs
174
+
175
+
176
def xor(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a XOR comparison (``lhs ^ rhs``); ``lhs`` is lifted first.

    NOTE: this delegates to ``Expr.__xor__``, which currently raises ``NotImplementedError``.
    """
    left = lift(lhs)
    return left ^ rhs
179
+
180
+
181
def lt(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a less-than comparison (``lhs < rhs``); ``lhs`` is lifted first."""
    left = lift(lhs)
    return left < rhs
184
+
185
+
186
def lte(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a less-than-or-equal comparison (``lhs <= rhs``); ``lhs`` is lifted first."""
    left = lift(lhs)
    return left <= rhs
189
+
190
+
191
def gt(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a greater-than comparison (``lhs > rhs``); ``lhs`` is lifted first."""
    left = lift(lhs)
    return left > rhs
194
+
195
+
196
def gte(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a greater-than-or-equal comparison (``lhs >= rhs``); ``lhs`` is lifted first."""
    left = lift(lhs)
    return left >= rhs
199
+
200
+
201
def negate(expr: ExprLike) -> Expr:
    """Apply unary minus to the given expression (``-expr``) after lifting it."""
    operand = lift(expr)
    return -operand
204
+
205
+
206
def not_(expr: ExprLike) -> Expr:
    """Negate the given expression using the native ``not_`` operation."""
    operand = lift(expr).__expr__
    return Expr(_lib.expr.not_(operand))
210
+
211
+
212
def is_null(expr: ExprLike) -> Expr:
    """Check if the given expression is null."""
    operand = lift(expr).__expr__
    return Expr(_lib.expr.is_null(operand))
216
+
217
+
218
def is_not_null(expr: ExprLike) -> Expr:
    """Check if the given expression is not null (the negation of ``is_null``)."""
    null_check = is_null(expr)
    return not_(null_check)
221
+
222
+
223
def add(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Add two expressions (``lhs + rhs``); ``lhs`` is lifted first."""
    left = lift(lhs)
    return left + rhs
226
+
227
+
228
def subtract(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Subtract two expressions (``lhs - rhs``); ``lhs`` is lifted first."""
    left = lift(lhs)
    return left - rhs
231
+
232
+
233
def multiply(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Multiply two expressions (``lhs * rhs``); ``lhs`` is lifted first."""
    left = lift(lhs)
    return left * rhs
236
+
237
+
238
def divide(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Divide two expressions (``lhs / rhs``); ``lhs`` is lifted first."""
    left = lift(lhs)
    return left / rhs
241
+
242
+
243
def modulo(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Modulo two expressions (``lhs % rhs``); ``lhs`` is lifted first."""
    left = lift(lhs)
    return left % rhs
@@ -0,0 +1,149 @@
1
+ import datetime
2
+ from typing import TypeAlias
3
+
4
+ import pyarrow as pa
5
+
6
+ from spiral import _lib
7
+
8
+ NativeExpr: TypeAlias = _lib.expr.Expr
9
+
10
+
11
class Expr:
    """Base class for Spiral expressions. All expressions support comparison and basic arithmetic operations.

    The comparison, logical and arithmetic operators do not compute a value: each one lifts its right-hand
    operand and returns a new ``Expr`` wrapping the corresponding native expression. Because ``__eq__`` is
    overridden without defining ``__hash__``, instances are not hashable.
    """

    def __init__(self, native: NativeExpr) -> None:
        """Wrap a native expression.

        Raises:
            TypeError: if ``native`` is not a ``NativeExpr``.
        """
        if not isinstance(native, NativeExpr):
            raise TypeError(f"Expected a native expression, got {type(native)}")
        self._native = native

    @property
    def __expr__(self) -> NativeExpr:
        """The wrapped native expression."""
        return self._native

    def __str__(self):
        return str(self.__expr__)

    def __repr__(self):
        return repr(self.__expr__)

    def __getitem__(self, item: str | int) -> "Expr":
        """
        Get an item from a struct or list.

        Args:
            item: The key or index to get.
                If item is a string, it is assumed to be a field in a struct. Dot-separated string is supported
                to access nested fields. If item is an integer, it is assumed to be an index in a list.
        """
        # Imported lazily: the expressions package imports this module.
        from spiral import expressions as se

        expr = self

        if isinstance(item, int):
            # Assume list and get an element.
            expr = se.list_.element_at(expr, item)
        else:
            # Walk into the struct.
            for part in item.split("."):
                expr = se.getitem(expr, part)

        return expr

    def __eq__(self, other: "ExprLike") -> "Expr":
        return self._binary("eq", other)

    def __ne__(self, other: "ExprLike") -> "Expr":
        return self._binary("neq", other)

    def __lt__(self, other: "ExprLike") -> "Expr":
        return self._binary("lt", other)

    def __le__(self, other: "ExprLike") -> "Expr":
        return self._binary("lte", other)

    def __gt__(self, other: "ExprLike") -> "Expr":
        return self._binary("gt", other)

    def __ge__(self, other: "ExprLike") -> "Expr":
        return self._binary("gte", other)

    def __and__(self, other: "ExprLike") -> "Expr":
        return self._binary("and", other)

    def __or__(self, other: "ExprLike") -> "Expr":
        return self._binary("or", other)

    def __xor__(self, other: "ExprLike") -> "Expr":
        # XOR is not supported yet.
        raise NotImplementedError

    def __add__(self, other: "ExprLike") -> "Expr":
        return self._binary("add", other)

    def __sub__(self, other: "ExprLike") -> "Expr":
        return self._binary("sub", other)

    def __mul__(self, other: "ExprLike") -> "Expr":
        return self._binary("mul", other)

    def __truediv__(self, other: "ExprLike") -> "Expr":
        return self._binary("div", other)

    def __mod__(self, other: "ExprLike") -> "Expr":
        return self._binary("mod", other)

    def __neg__(self):
        return Expr(_lib.expr.unary("neg", self.__expr__))

    def in_(self, other: "ExprLike") -> "Expr":
        """Build a native list ``contains`` expression with ``other`` as the list and ``self`` as the value."""
        from spiral import expressions as se

        other = se.lift(other)
        return Expr(_lib.expr.list.contains(other.__expr__, self.__expr__))

    def contains(self, other: "ExprLike") -> "Expr":
        """Inverse of ``in_``: equivalent to ``lift(other).in_(self)``."""
        from spiral import expressions as se

        return se.lift(other).in_(self)

    def cast(self, dtype: pa.DataType) -> "Expr":
        """Cast the expression result to a different data type."""
        return Expr(_lib.expr.cast(self.__expr__, dtype))

    def select(self, *paths: str, exclude: list[str] | None = None) -> "Expr":
        """Select fields from a struct-like expression.

        Args:
            *paths: Field names to select. If a path contains a dot, it is assumed to be a nested struct field.
                Several nested paths under the same parent (e.g. ``"a.b"`` and ``"a.c"``) are combined into a
                single nested selection. A bare field name (``"a"``) selects the whole field and takes
                precedence over nested selections of the same parent.
            exclude: List of field names to exclude from result.
        """
        from spiral import expressions as se

        # If any of the paths contain nested fields, then we re-pack nested select statements.
        if any("." in p for p in paths):
            # Map each top-level field to the nested paths requested under it, or to None when the whole
            # field is requested. Grouping prevents a later "a.c" from overwriting an earlier "a.b".
            nested: dict[str, list[str] | None] = {}
            for p in paths:
                parent, dot, child = p.partition(".")
                if not dot:
                    nested[parent] = None
                elif parent not in nested:
                    nested[parent] = [child]
                elif nested[parent] is not None:
                    nested[parent].append(child)

            fields = {
                parent: self[parent] if children is None else self[parent].select(*children)
                for parent, children in nested.items()
            }
            packed = se.pack(fields)
            if exclude:
                # NOTE(review): with no paths this call takes the ``if not paths`` early return below, so
                # ``exclude`` currently has no effect here -- confirm the intended behaviour.
                packed = packed.select(exclude=exclude)
            return packed

        if not paths:
            return self

        return se.select(self, names=list(paths), exclude=exclude)

    def _binary(self, op: str, rhs: "ExprLike") -> "Expr":
        """Create a binary expression ``op(self, rhs)``; ``rhs`` is lifted to an ``Expr`` first."""
        from spiral import expressions as se

        rhs = se.lift(rhs)
        return Expr(_lib.expr.binary(op, self.__expr__, rhs.__expr__))
146
+
147
+
148
# Plain Python values accepted in place of an expression.
ScalarLike: TypeAlias = (
    bool | int | float | str | list | datetime.datetime | None
)
# Anything the expression helpers annotate as expression-like input.
ExprLike: TypeAlias = Expr | dict | ScalarLike
@@ -0,0 +1,86 @@
1
+ import hishel
2
+ import httpx
3
+ import pyarrow as pa
4
+
5
+ from spiral.expressions.base import Expr, ExprLike
6
+ from spiral.expressions.struct import pack
7
+ from spiral.expressions.udf import UDF
8
+ from spiral.settings import APP_DIR
9
+
10
+
11
def get(url: ExprLike, headers: ExprLike = None, force_cache: bool = False) -> Expr:
    """Submit a GET request to either a scalar or vector of URLs.

    Args:
        url: the URL(s) to request; packed as the ``"url"`` field.
        headers: optional request headers; packed as the ``"headers"`` field only when not ``None``.
        force_cache: forwarded to ``HttpGet``.
    """
    request_fields = {"url": url} if headers is None else {"url": url, "headers": headers}
    udf = HttpGet(force_cache)
    return udf(pack(request_fields))
17
+
18
+
19
class HttpGet(UDF):
    """UDF registered as ``"http.get"`` that performs the packed HTTP requests."""

    # Result type: response body, status code and response headers.
    RES_DTYPE: pa.DataType = pa.struct(
        [
            pa.field("bytes", pa.large_binary()),
            pa.field("status", pa.int32()),
            pa.field("headers", pa.map_(pa.string(), pa.string())),
        ]
    )

    def __init__(self, force_cache: bool = False):
        super().__init__("http.get")
        self._force_cache = force_cache

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        """Return the fixed response struct type, regardless of the input types."""
        return HttpGet.RES_DTYPE

    def invoke(self, *input_args: pa.Array) -> pa.Array:
        """Run the requests described by the single input array.

        Raises:
            ValueError: if not exactly one argument is given.
        """
        if len(input_args) != 1:
            raise ValueError(f"Expected 1 argument, got {len(input_args)}")
        (requests,) = input_args
        responses = _http_request(requests, self._force_cache)
        # Always hand back a contiguous array.
        return responses.combine_chunks() if isinstance(responses, pa.ChunkedArray) else responses
42
+
43
+
44
def _http_request(arg: pa.Array, force_cache: bool) -> pa.Array:
    """Issue one HTTP request per row of ``arg`` and collect the responses.

    Each row must provide ``"url"``; ``"method"`` (default ``"GET"``) and ``"headers"`` (default ``{}``)
    are used when present.

    Raises:
        TypeError: if ``arg`` is not a struct array.
    """
    client = _HttpClient()

    if not isinstance(arg, pa.StructArray):
        raise TypeError(f"Unsupported argument: {arg} ({type(arg)})")

    # We assume a vector of requests, but with potentially many arguments
    responses = []
    for request in arg.to_pylist():
        response = client.request(
            request.get("method", "GET").upper(),
            request["url"],
            headers=request.get("headers", {}),
            extensions={"force_cache": force_cache},
        )
        responses.append(_response_dict(response))
    return pa.array(responses, type=HttpGet.RES_DTYPE)
65
+
66
+
67
def _response_dict(response: httpx.Response) -> dict:
    """Convert a successful response into a dict with ``bytes``, ``status`` and ``headers`` keys.

    Raises:
        ValueError: if the response status code is not 200.
    """
    status = response.status_code
    if status != 200:
        raise ValueError(f"Request failed with status {status}")
    return dict(
        bytes=response.read(),
        status=status,
        headers=dict(response.headers),
    )
75
+
76
+
77
class _HttpClient(hishel.CacheClient):
    """Singleton caching HTTP client.

    ``_HttpClient()`` always returns the same instance. The cache uses file storage located at
    ``APP_DIR / "http.cache"`` with ``ttl=3600``.
    """

    _instance: "_HttpClient | None" = None
    # Becomes True on the instance once the underlying client has been set up.
    _spiral_initialized: bool = False

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # Python calls ``__init__`` on every ``_HttpClient()`` even though ``__new__`` returns the cached
        # instance. Without this guard each call would rebuild the storage and client state and abandon the
        # previous one without closing it.
        if self._spiral_initialized:
            return
        super().__init__(storage=hishel.FileStorage(base_path=APP_DIR / "http.cache", ttl=3600))
        self._spiral_initialized = True
@@ -0,0 +1,100 @@
1
+ import tarfile
2
+ from io import BytesIO
3
+
4
+ import pyarrow as pa
5
+
6
+ from spiral.expressions.base import Expr, ExprLike
7
+ from spiral.expressions.struct import pack
8
+ from spiral.expressions.udf import UDF
9
+
10
+
11
def read_file(path: ExprLike) -> Expr:
    """
    Read file path(s) from disk into a struct with a single field "bytes" containing the file contents.

    Args:
        path: Expression evaluating to an array of strings representing local disk paths.

    Returns:
        The expression produced by applying ``FileRead`` to the packed ``{"path": path}`` struct.
    """
    udf = FileRead()
    return udf(pack({"path": path}))
20
+
21
+
22
class FileRead(UDF):
    """UDF registered as ``"file.read"`` that reads each row's ``path`` from local disk."""

    # Result type: a struct holding the raw file contents.
    RES_DTYPE: pa.DataType = pa.struct(
        [
            pa.field("bytes", pa.large_binary()),
        ]
    )

    def __init__(self):
        super().__init__("file.read")

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        """Return the fixed result struct type, regardless of the input types."""
        return FileRead.RES_DTYPE

    def invoke(self, *input_args: pa.Array) -> pa.Array:
        """Read every file named by the ``path`` field of the single input array.

        Raises:
            ValueError: if not exactly one argument is given.
        """
        if len(input_args) != 1:
            raise ValueError(f"Expected 1 argument, got {len(input_args)}")
        (rows,) = input_args

        contents = []
        for row in rows:
            file_path = row["path"].as_py()
            with open(file_path, "rb") as handle:
                contents.append({"bytes": handle.read()})

        return pa.array(contents, type=FileRead.RES_DTYPE)
46
+
47
+
48
def read_tar(path: ExprLike = None, bytes_: ExprLike = None) -> "Expr":
    """Untar a vector of paths / byte arrays representing tarballs.

    Args:
        path: optional expression of tarball paths; packed as the ``"path"`` field.
        bytes_: optional expression of tarball contents; packed as the ``"bytes"`` field.

    Raises:
        ValueError: if neither ``path`` nor ``bytes_`` is provided.
    """
    if path is None and bytes_ is None:
        raise ValueError("Expected either path or bytes_ to be provided")
    candidates = {"path": path, "bytes": bytes_}
    to_pack = {field: value for field, value in candidates.items() if value is not None}
    return TarRead()(pack(to_pack))
58
+
59
+
60
class TarRead(UDF):
    """UDF registered as ``"tar.read"`` that lists the files contained in each tarball."""

    # Result type: one list per tarball of (member name, member contents) structs.
    RES_DTYPE = pa.list_(
        pa.struct(
            [
                pa.field("name", pa.string()),
                pa.field("bytes", pa.large_binary()),
            ]
        )
    )

    def __init__(self):
        super().__init__("tar.read")

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        """Return the fixed result list type, regardless of the input types."""
        return TarRead.RES_DTYPE

    def invoke(self, *input_args: pa.Array) -> pa.Array:
        """Read every tarball described by the single input array.

        Each row is opened from its ``path`` field if present, otherwise from its ``bytes`` field.
        Directories and members that carry no readable data are skipped.

        Raises:
            ValueError: if not exactly one argument is given, or a row has neither ``path`` nor ``bytes``.
        """
        if len(input_args) != 1:
            raise ValueError(f"Expected 1 argument, got {len(input_args)}")
        arg = input_args[0]

        res = []
        for req in arg:
            if "path" in req:
                kwargs = {"name": req["path"].as_py()}
            elif "bytes" in req:
                kwargs = {"fileobj": BytesIO(req["bytes"].as_py())}
            else:
                raise ValueError("Expected path or bytes_ to be provided")

            files = []
            with tarfile.open(**kwargs) as archive:
                for member in archive.getmembers():
                    if member.type == tarfile.DIRTYPE:
                        continue
                    # TODO(ngates): skip other types too maybe? Why are we even skipping directories?
                    reader = archive.extractfile(member)
                    if reader is None:
                        # ``extractfile`` returns None for members that are neither regular files nor
                        # links (e.g. FIFOs, device nodes); they have no contents to read.
                        continue
                    with reader:
                        files.append({"name": member.name, "bytes": reader.read()})
            res.append(files)

        return pa.array(res, type=TarRead.RES_DTYPE)
@@ -0,0 +1,68 @@
1
+ from spiral.expressions.base import Expr, ExprLike
2
+
3
+
4
def in_(expr: ExprLike, values: ExprLike) -> Expr:
    """Check if a value is in a list.

    e.g. `se.list.in_(Array[2, 4], Array[[1, 2], [1, 2]]) -> Array[True, False]`

    Args:
        expr: The value to check.
        values: The list array expression to check against.
    """
    # Imported lazily: the expressions package imports this module.
    from spiral.expressions import lift

    needle = lift(expr)
    return needle.in_(values)
16
+
17
+
18
def element_at(expr: ExprLike, index: ExprLike) -> Expr:
    """Get the element at the given index.

    e.g. `se.list.element_at([1, 2, 3], 1) -> 2`

    Args:
        expr: The list array expression.
        index: The index to get.
    """
    # Imported lazily: the expressions package imports this module.
    from spiral import _lib
    from spiral.expressions import lift

    list_expr = lift(expr).__expr__
    index_expr = lift(index).__expr__
    return Expr(_lib.expr.list.element_at(list_expr, index_expr))
33
+
34
+
35
def of(*expr: ExprLike) -> Expr:
    """Create an array or scalar list from a series of expressions.

    All values must be of the same type. The expressions must all also have the same length
    (1 for scalars).

    e.g. `se.list.of(1+3, 2, 3) -> [4, 2, 3]`

    NOTE: placeholder -- not implemented yet; calling it currently returns ``None``.
    """
41
+
42
+
43
def zip(*lists: ExprLike) -> Expr:
    """Merge the given lists, with duplicates.

    e.g. `se.list.merge([1, 2], [3, 4]) -> [(1, 2), (3, 4)]`

    NOTE(review): the example above is kept from the original comment; it names ``merge`` rather than
    ``zip`` -- confirm the intended semantics before implementing.

    NOTE: placeholder -- not implemented yet; calling it currently returns ``None``.
    """
48
+
49
+
50
def concat(*lists: ExprLike) -> Expr:
    """Concatenate the given lists. The types of all the lists must be the same.

    e.g. `se.list.concat([1, 2], [3, 4]) -> [1, 2, 3, 4]`

    NOTE: placeholder -- not implemented yet; calling it currently returns ``None``.
    """
55
+
56
+
57
def slice_(expr: ExprLike, start: int | None = None, stop: int | None = None) -> Expr:
    """Slice a list.

    e.g. `se.list.slice_([0, 1, 2], slice(0,2)) -> [0, 1]`

    NOTE(review): the example above is kept from the original comment; it passes a ``slice`` object,
    whereas the signature takes separate ``start`` / ``stop`` arguments -- confirm the intended call form.

    NOTE: placeholder -- not implemented yet; calling it currently returns ``None``.
    """
62
+
63
+
64
def length(expr: ExprLike) -> Expr:
    """Get the length of a list.

    e.g. `se.list.length([1, 2, 3]) -> 3`

    NOTE: placeholder -- not implemented yet; calling it currently returns ``None``.
    """