pyspiral 0.4.0__pp310-pypy310_pp73-macosx_10_12_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. pyspiral-0.4.0.dist-info/METADATA +46 -0
  2. pyspiral-0.4.0.dist-info/RECORD +98 -0
  3. pyspiral-0.4.0.dist-info/WHEEL +4 -0
  4. pyspiral-0.4.0.dist-info/entry_points.txt +2 -0
  5. spiral/__init__.py +10 -0
  6. spiral/_lib.pypy310-pp73-darwin.so +0 -0
  7. spiral/adbc.py +393 -0
  8. spiral/api/__init__.py +64 -0
  9. spiral/api/admin.py +15 -0
  10. spiral/api/client.py +160 -0
  11. spiral/api/filesystems.py +153 -0
  12. spiral/api/organizations.py +77 -0
  13. spiral/api/projects.py +197 -0
  14. spiral/api/telemetry.py +19 -0
  15. spiral/api/types.py +20 -0
  16. spiral/api/workloads.py +52 -0
  17. spiral/arrow_.py +221 -0
  18. spiral/cli/__init__.py +79 -0
  19. spiral/cli/__main__.py +4 -0
  20. spiral/cli/admin.py +16 -0
  21. spiral/cli/app.py +65 -0
  22. spiral/cli/console.py +95 -0
  23. spiral/cli/fs.py +112 -0
  24. spiral/cli/iceberg/__init__.py +7 -0
  25. spiral/cli/iceberg/namespaces.py +47 -0
  26. spiral/cli/iceberg/tables.py +60 -0
  27. spiral/cli/indexes/__init__.py +19 -0
  28. spiral/cli/login.py +22 -0
  29. spiral/cli/orgs.py +90 -0
  30. spiral/cli/printer.py +53 -0
  31. spiral/cli/projects.py +136 -0
  32. spiral/cli/state.py +5 -0
  33. spiral/cli/tables/__init__.py +121 -0
  34. spiral/cli/telemetry.py +18 -0
  35. spiral/cli/types.py +51 -0
  36. spiral/cli/workloads.py +59 -0
  37. spiral/client.py +79 -0
  38. spiral/core/__init__.pyi +0 -0
  39. spiral/core/client/__init__.pyi +117 -0
  40. spiral/core/index/__init__.pyi +15 -0
  41. spiral/core/table/__init__.pyi +108 -0
  42. spiral/core/table/manifests/__init__.pyi +35 -0
  43. spiral/core/table/metastore/__init__.pyi +62 -0
  44. spiral/core/table/spec/__init__.pyi +214 -0
  45. spiral/datetime_.py +27 -0
  46. spiral/expressions/__init__.py +245 -0
  47. spiral/expressions/base.py +149 -0
  48. spiral/expressions/http.py +86 -0
  49. spiral/expressions/io.py +100 -0
  50. spiral/expressions/list_.py +68 -0
  51. spiral/expressions/mp4.py +62 -0
  52. spiral/expressions/png.py +18 -0
  53. spiral/expressions/qoi.py +18 -0
  54. spiral/expressions/refs.py +58 -0
  55. spiral/expressions/str_.py +39 -0
  56. spiral/expressions/struct.py +59 -0
  57. spiral/expressions/text.py +62 -0
  58. spiral/expressions/tiff.py +223 -0
  59. spiral/expressions/udf.py +46 -0
  60. spiral/grpc_.py +32 -0
  61. spiral/iceberg/__init__.py +3 -0
  62. spiral/iceberg/client.py +33 -0
  63. spiral/indexes/__init__.py +5 -0
  64. spiral/indexes/client.py +137 -0
  65. spiral/indexes/index.py +34 -0
  66. spiral/indexes/scan.py +22 -0
  67. spiral/project.py +46 -0
  68. spiral/protogen/_/__init__.py +0 -0
  69. spiral/protogen/_/arrow/__init__.py +0 -0
  70. spiral/protogen/_/arrow/flight/__init__.py +0 -0
  71. spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
  72. spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +1990 -0
  73. spiral/protogen/_/scandal/__init__.py +178 -0
  74. spiral/protogen/_/spiral/__init__.py +0 -0
  75. spiral/protogen/_/spiral/table/__init__.py +22 -0
  76. spiral/protogen/_/substrait/__init__.py +3399 -0
  77. spiral/protogen/_/substrait/extensions/__init__.py +115 -0
  78. spiral/protogen/__init__.py +0 -0
  79. spiral/protogen/substrait/__init__.py +3399 -0
  80. spiral/protogen/substrait/extensions/__init__.py +115 -0
  81. spiral/protogen/util.py +41 -0
  82. spiral/py.typed +0 -0
  83. spiral/server.py +17 -0
  84. spiral/settings.py +101 -0
  85. spiral/substrait_.py +279 -0
  86. spiral/tables/__init__.py +12 -0
  87. spiral/tables/client.py +130 -0
  88. spiral/tables/dataset.py +250 -0
  89. spiral/tables/debug/__init__.py +0 -0
  90. spiral/tables/debug/manifests.py +70 -0
  91. spiral/tables/debug/metrics.py +56 -0
  92. spiral/tables/debug/scan.py +248 -0
  93. spiral/tables/maintenance.py +12 -0
  94. spiral/tables/scan.py +193 -0
  95. spiral/tables/snapshot.py +78 -0
  96. spiral/tables/table.py +157 -0
  97. spiral/tables/transaction.py +52 -0
  98. spiral/types_.py +6 -0
@@ -0,0 +1,62 @@
1
+ import pyarrow as pa
2
+
3
+ from spiral.expressions.base import Expr, ExprLike
4
+
5
# Arrow struct type with the fields `pixels` (large binary) and `height`, `width`,
# `frames` (all uint32) — the same field names the `read` docstring lists for a decoded video.
# NOTE(review): nothing in the visible part of this module references this constant;
# confirm whether it is still needed or whether the native reader defines the type itself.
_MP4_RES_DTYPE: pa.DataType = pa.struct(
    [
        pa.field("pixels", pa.large_binary()),
        pa.field("height", pa.uint32()),
        pa.field("width", pa.uint32()),
        pa.field("frames", pa.uint32()),
    ]
)
13
+
14
+
15
+ # TODO(marko): Support optional range and crop.
16
+ # IMPORTANT: Frames is currently broken and defaults to full.
17
def read(expr: ExprLike | str, frames: ExprLike | str, crop: ExprLike | str):
    """
    Build an expression that decodes a referenced `MP4` cell. Requires `ffmpeg`.

    Args:
        expr: The referenced `Mp4` bytes.
            A str is treated as the name of a `se.keyed` expression of reference type.
        frames: The range of frames to read. Each element must be a list of two uint32
            (frame start, frame end), or null / empty list to read all frames.
            A str is treated as the name of a `se.keyed` expression.
        crop: The crop of the frames to read. Each element must be a list of four uint32
            (x, y, width, height), or null / empty list to read full frames.
            A str is treated as the name of a `se.keyed` expression.

    Returns:
        An array where each element is a decoded cropped video with fields:
            pixels: RGB8 bytes, frames * width * height * 3.
            width: Width of the image with type `pa.uint32()`.
            height: Height of the image with type `pa.uint32()`.
            frames: Number of frames with type `pa.uint32()`.
    """
    from spiral import _lib
    from spiral.expressions import keyed, lift

    # String arguments name keyed inputs; give each one the Arrow type it is expected to carry.
    if isinstance(expr, str):
        ref_dtype = pa.struct(
            [("__ref__", pa.struct([("id", pa.string()), ("begin", pa.uint64()), ("end", pa.uint64())]))]
        )
        expr = keyed(expr, ref_dtype)
    if isinstance(frames, str):
        frames = keyed(frames, pa.list_(pa.uint32()))
    if isinstance(crop, str):
        crop = keyed(crop, pa.list_(pa.uint32()))

    source, frame_range, crop_box = lift(expr), lift(frames), lift(crop)

    decoded = _lib.expr.video.read(
        source.__expr__,
        frame_range.__expr__,
        crop_box.__expr__,
        format="mp4",
    )
    return Expr(decoded)
@@ -0,0 +1,18 @@
1
+ from spiral.expressions.base import Expr, ExprLike
2
+
3
+
4
def encode(expr: ExprLike) -> Expr:
    """Build an expression that encodes image structs as PNG.

    Args:
        expr: The expression to encode.
            Expects a struct with `pixels`, `width`, `height`, `channels`, `channel_bit_depth` fields.

    Returns:
        The encoded PNG images.
    """
    from spiral import _lib
    from spiral.expressions import lift

    image = lift(expr)
    encoded = _lib.expr.img.encode(image.__expr__, format="png")
    return Expr(encoded)
@@ -0,0 +1,18 @@
1
+ from spiral.expressions.base import Expr, ExprLike
2
+
3
+
4
def encode(expr: ExprLike) -> Expr:
    """Build an expression that encodes image structs as QOI.

    Args:
        expr: The expression to encode.
            Expects a struct with `pixels`, `width`, `height`, `channels`, `channel_bit_depth` fields.

    Returns:
        The encoded QOI images.
    """
    from spiral import _lib
    from spiral.expressions import lift

    image = lift(expr)
    encoded = _lib.expr.img.encode(image.__expr__, format="qoi")
    return Expr(encoded)
@@ -0,0 +1,58 @@
1
+ import pyarrow as pa
2
+
3
+ from spiral.expressions.base import Expr, ExprLike
4
+
5
+
6
def ref(expr: ExprLike, field: str | None = None) -> Expr:
    """Store binary values as references. This expression can only be used on write.

    Large cell values (for example bytes columns) that are not used in filter expressions are
    often better stored as references: it enables more efficient scan pruning, and many of
    Spiral's cell pushdown expressions operate on references.

    Args:
        expr: The expression to store as a reference.
        field: If `expr` evaluates to a struct, the name of the struct field to reference.
            If `None`, `expr` must evaluate to a type that supports referencing.
    """
    from spiral import _lib
    from spiral.expressions import lift

    value = lift(expr)
    referenced = _lib.expr.refs.ref(value.__expr__, field)
    return Expr(referenced)
23
+
24
+
25
def deref(expr: ExprLike | str, field: str | None = None) -> Expr:
    """De-reference referenced values.

    See `ref` for background on Spiral's reference values. This expression turns a referenced
    column back into its original form, e.g. binary.

    Args:
        expr: The expression to de-reference. A str is treated as the name of a `se.keyed` expression.
        field: If `expr` evaluates to a struct, the name of the struct field to de-reference.
            If `None`, `expr` must evaluate to a reference type.
    """
    from spiral import _lib
    from spiral.expressions import keyed, lift

    if isinstance(expr, str):
        # A bare name is bound as a keyed input carrying the reference struct layout.
        ref_dtype = pa.struct(
            [("__ref__", pa.struct([("id", pa.string()), ("begin", pa.uint64()), ("end", pa.uint64())]))]
        )
        expr = keyed(expr, ref_dtype)

    reference = lift(expr)
    resolved = _lib.expr.refs.deref(reference.__expr__, field=field)
    return Expr(resolved)
47
+
48
+
49
def nbytes(expr: ExprLike) -> Expr:
    """Return the number of bytes in a reference.

    The size is expressed as `end - begin` of the `__ref__` struct.

    Args:
        expr: The ref expression to get the number of bytes from.
    """
    from spiral.expressions import lift

    reference = lift(expr)
    end_offset = reference["__ref__"]["end"]
    begin_offset = reference["__ref__"]["begin"]
    return end_offset - begin_offset
@@ -0,0 +1,39 @@
1
+ import pyarrow as pa
2
+ import pyarrow.compute as pc
3
+ import re2 as re
4
+
5
+ from spiral import _lib
6
+ from spiral.expressions.base import Expr, ExprLike
7
+
8
+ # TODO(ngates): we can add a symmetric "ascii" expression namespace in the future if
9
+ # the performance is required.
10
+
11
+
12
def substr(expr: ExprLike = None, *, begin: int = 0, end: int | None = None) -> Expr:
    """Slice a string.

    Args:
        expr: The string expression to slice.
        begin: The starting index of the slice.
        end: The ending index of the slice.
    """
    from spiral import expressions as se

    text = se.lift(expr)
    sliced = _lib.spql.str.substr(text.__expr__, begin=begin, end=end)
    return Expr(sliced)
24
+
25
+
26
def extract_regex(pattern: str, *, strings: ExprLike) -> Expr:
    """Extract the first occurrence of a regex pattern from a string.

    Not implemented yet: calling this always raises `NotImplementedError`.
    """
    raise NotImplementedError
29
+
30
+
31
def _extract_regex(arg: pa.Array | pa.Scalar, pattern: str) -> pa.Array | pa.Scalar:
    """Apply `pc.extract_regex` to a string input and cast to a struct of the pattern's named groups.

    Raises:
        TypeError: If `arg` is not of Arrow string type.
    """
    # The result type has one string field per named group of the pattern.
    compiled = re.compile(pattern)
    group_fields = [pa.field(name, type=pa.string()) for name in compiled.groupindex.keys()]
    dtype = pa.struct(group_fields)

    if not pa.types.is_string(arg.type):
        raise TypeError("Input argument does not have the expected type")

    extracted = pc.extract_regex(arg, pattern=pattern)
    return extracted.cast(dtype)
@@ -0,0 +1,59 @@
1
+ from spiral import _lib
2
+ from spiral.expressions.base import Expr, ExprLike
3
+
4
+
5
def getitem(expr: ExprLike, field: str) -> Expr:
    """Get field from a struct.

    Args:
        expr: The struct expression to get the field from.
        field: The field to get. A dot-separated string is supported to access nested fields.
    """
    from spiral import expressions as se

    struct_expr = se.lift(expr)
    selected = _lib.expr.struct.getitem(struct_expr.__expr__, field)
    return Expr(selected)
16
+
17
+
18
def pack(fields: dict[str, ExprLike], *, nullable: bool = False) -> Expr:
    """Assemble a new struct from the given named fields.

    Args:
        fields: A dictionary of field names to expressions. The field names will be used as the struct field names.
        nullable: Forwarded unchanged to the native `struct.pack` call.
    """
    from spiral import expressions as se

    names = list(fields)
    children = [se.lift(value).__expr__ for value in fields.values()]
    return Expr(_lib.expr.struct.pack(names, children, nullable))
29
+
30
+
31
def merge(*structs: "ExprLike") -> Expr:
    """Merge fields from the given structs into a single struct.

    Args:
        *structs: Each expression must evaluate to a struct.

    Returns:
        A single struct containing all the fields from the input structs.
        If a field is present in multiple structs, the value from the last struct is used.
    """
    from spiral import expressions as se

    if len(structs) != 1:
        lifted = [se.lift(struct).__expr__ for struct in structs]
        return Expr(_lib.expr.struct.merge(lifted))

    # A single input needs no merge node; it is only lifted.
    return se.lift(structs[0])
46
+
47
+
48
def select(expr: ExprLike, names: list[str] = None, exclude: list[str] = None) -> Expr:
    """Select fields from a struct.

    Args:
        expr: The struct-like expression to select fields from.
        names: Field names to select. If a path contains a dot, it is assumed to be a nested struct field.
        exclude: List of field names to exclude from result. Exactly one of `names` or `exclude` must be provided.
    """
    from spiral import expressions as se

    struct_expr = se.lift(expr)
    selection = _lib.expr.struct.select(struct_expr.__expr__, names, exclude)
    return Expr(selection)
@@ -0,0 +1,62 @@
1
+ from spiral.expressions.base import Expr, ExprLike
2
+
3
+
4
def field(expr: ExprLike, field_name: str | None = None, tokenizer: str | None = None) -> Expr:
    """Configure a column for text indexing.

    Args:
        expr: An input column. The expression must either evaluate to a UTF-8,
            or, if a `field_name` is provided, to a struct with a field of that name.
        field_name: If provided, the expression must evaluate to a struct with a field of that name.
            The given field will be indexed.
        tokenizer: If provided, the text will be tokenized using the given tokenizer.
            Applied both with and without `field_name`.

    Returns:
        An expression that can be used to construct a text index.
    """
    from spiral import _lib
    from spiral.expressions import getitem, lift, merge, pack

    expr = lift(expr)
    if field_name is None:
        return Expr(_lib.expr.text.field(expr.__expr__, tokenizer))

    # Forward the tokenizer here as well (it used to be dropped on this path), and wrap the
    # native node in `Expr` so `pack` receives the same kind of object as everywhere else.
    child = Expr(_lib.expr.text.field(getitem(expr, field_name).__expr__, tokenizer))

    # Replace the named field of the original struct with its text-index configuration.
    return merge(
        expr,
        pack({field_name: child}),
    )
29
+
30
+
31
def find(expr: ExprLike, term: str) -> Expr:
    """Search for a term in the text.

    Args:
        expr: An index field.
        term: The term to search for.

    Returns:
        An expression that can be used in ranking for text search.
    """
    from spiral import _lib
    from spiral.expressions import lift

    index_field = lift(expr)
    match = _lib.expr.text.find(index_field.__expr__, term)
    return Expr(match)
46
+
47
+
48
def boost(expr: ExprLike, factor: float) -> Expr:
    """Boost the relevance of a ranking expression.

    Args:
        expr: Rank by expression.
        factor: The factor by which to boost the relevance.

    Returns:
        An expression that can be used in ranking for text search.
    """
    from spiral import _lib
    from spiral.expressions import lift

    ranking = lift(expr)
    boosted = _lib.expr.text.boost(ranking.__expr__, factor)
    return Expr(boosted)
@@ -0,0 +1,223 @@
1
+ import numpy as np
2
+ import pyarrow as pa
3
+
4
+ from spiral.expressions.base import Expr, ExprLike
5
+ from spiral.expressions.udf import RefUDF
6
+
7
# Arrow struct type of a decoded TIFF cell. It is the declared `return_type` of both
# `TiffReadUDF` and `TiffSelectUDF`, and the type of the array built by `_return_result`.
_TIFF_RES_DTYPE: pa.DataType = pa.struct(
    [
        pa.field("pixels", pa.large_binary()),
        pa.field("height", pa.uint32()),
        pa.field("width", pa.uint32()),
        pa.field("channels", pa.uint8()),
        pa.field("channel_bit_depth", pa.uint8()),
    ]
)
16
+
17
+
18
def read(
    expr: ExprLike,
    indexes: ExprLike | int | None = None,
    window: ExprLike | tuple[tuple[int, int], tuple[int, int]] | None = None,
    boundless: ExprLike | bool | None = None,
) -> Expr:
    """
    Read referenced cell in a `TIFF` format. Requires `rasterio` to be installed.

    Args:
        expr: The referenced `TIFF` bytes.
        indexes: The band indexes to read. Defaults to all.
        window: The window to read. In format (row_range_tuple, col_range_tuple). Defaults to full window.
        boundless: If `True`, windows that extend beyond the dataset's extent
            are permitted and partially or completely filled arrays will be returned as appropriate.

    Returns:
        An array where each element is a decoded image with fields:
            pixels: bytes of shape (channels, height, width).
            width: Width of the image with type `pa.uint32()`.
            height: Height of the image with type `pa.uint32()`.
            channels: Number of channels of the image with type `pa.uint8()`.
                If `indexes` is not None, this is the length of `indexes` or 1 if `indexes` is an int.
            channel_bit_depth: Bit depth of the channel with type `pa.uint8()`.

    Raises:
        ImportError: If `rasterio` is not installed.
    """
    # Fail early, when the expression is built, rather than when the UDF is invoked.
    try:
        import rasterio  # noqa: F401
    except ImportError as err:
        # Chain explicitly so the underlying import failure stays visible as the cause.
        raise ImportError("`rasterio` is required for tiff.read") from err

    return TiffReadUDF()(expr, indexes, window, boundless)
49
+
50
+
51
def select(
    expr: ExprLike,
    shape: ExprLike | dict,
    indexes: ExprLike | int | None = None,
) -> Expr:
    """
    Select the shape out of the referenced cell in a `TIFF` format. Requires `rasterio` to be installed.

    Args:
        expr: The referenced `TIFF` bytes.
        shape: [GeoJSON-like](https://geojson.org/) shape.
        indexes: The band indexes to read. Defaults to all.

    Returns:
        An array where each element is a decoded image with fields:
            pixels: bytes of shape (channels, height, width).
            width: Width of the image with type `pa.uint32()`.
            height: Height of the image with type `pa.uint32()`.
            channels: Number of channels of the image with type `pa.uint8()`.
                If `indexes` is not None, this is the length of `indexes` or 1 if `indexes` is an int.
            channel_bit_depth: Bit depth of the channel with type `pa.uint8()`.

    Raises:
        ImportError: If `rasterio` is not installed.
    """
    # Fail early, when the expression is built, rather than when the UDF is invoked.
    try:
        import rasterio  # noqa: F401
    except ImportError as err:
        # Chain explicitly so the underlying import failure stays visible as the cause.
        raise ImportError("`rasterio` is required for tiff.select") from err

    return TiffSelectUDF()(expr, shape, indexes)
79
+
80
+
81
class TiffReadUDF(RefUDF):
    """Ref-scoped UDF registered as "tiff.read": reads bands/a window of one TIFF cell with rasterio."""

    def __init__(self):
        super().__init__("tiff.read")

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        """Return the fixed result struct type, independent of the input types."""
        return _TIFF_RES_DTYPE

    def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
        """Decode the TIFF behind `fp` and return a one-element result array.

        Args:
            fp: File object for the referenced cell; handed to rasterio through `_VsiOpener`.
            *input_args: Exactly four arrays (expr, indexes, window, boundless). Only the first
                element of `indexes`, `window` and `boundless` is read.

        Raises:
            ImportError: If `rasterio` is not installed.
            ValueError: If the argument count, `indexes`, `boundless` or `window` is invalid.
        """
        try:
            import rasterio
        except ImportError as err:
            raise ImportError("`rasterio` is required for tiff.read") from err

        from rasterio.windows import Window

        if len(input_args) != 4:
            raise ValueError("tiff.read expects exactly 4 arguments: expr, indexes, window, boundless")

        # The ref cell itself arrives as `fp`; the first array argument is not needed here.
        _, indexes, window, boundless = input_args

        indexes = indexes[0].as_py()
        if indexes is not None and not isinstance(indexes, int) and not isinstance(indexes, list):
            raise ValueError(f"tiff.read expects indexes to be None or an int or a list, got {indexes}")

        boundless = boundless[0].as_py()
        if boundless is not None and not isinstance(boundless, bool):
            raise ValueError(f"tiff.read expects boundless to be None or a bool, got {boundless}")
        boundless = bool(boundless)

        window = window[0].as_py()
        if window is not None:
            if len(window) != 2:
                raise ValueError(f"tiff.read window invalid, got {window}")
            window = Window.from_slices(slice(*window[0]), slice(*window[1]), boundless=boundless)

        opener = _VsiOpener(fp)
        with rasterio.open("ref", opener=opener) as src:
            src: rasterio.DatasetReader
            # TODO(marko): We know the size and dtype so we should be able to preallocate the result and read into it.
            #   This matters more if we want to rewrite this function to work with multiple inputs at once, in which
            #   case we should first consider using Rust GDAL bindings - I believe rasterio uses GDAL under the hood.
            # `boundless` must reach `read` too: on the window alone it only permits out-of-extent
            # offsets, while the filling of out-of-extent pixels is done by `read(boundless=True)`.
            # It only matters when a window is given, so the full-dataset read path is unchanged.
            result: np.ndarray = src.read(
                indexes=indexes,
                window=window,
                boundless=boundless and window is not None,
            )
        return _return_result(result, indexes)
123
+
124
+
125
class TiffSelectUDF(RefUDF):
    """Ref-scoped UDF registered as "tiff.select": reads the part of one TIFF cell covered by a shape."""

    def __init__(self):
        super().__init__("tiff.select")

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        """Return the fixed result struct type, independent of the input types."""
        return _TIFF_RES_DTYPE

    def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
        """Decode the region of the TIFF behind `fp` that the given shape covers.

        Args:
            fp: File object for the referenced cell; handed to rasterio through `_VsiOpener`.
            *input_args: Exactly three arrays (expr, shape, indexes). Only the first element of
                `shape` and `indexes` is read.

        Raises:
            ImportError: If `rasterio` is not installed.
            ValueError: If the argument count, `shape` or `indexes` is invalid.
        """
        try:
            import rasterio
        except ImportError as err:
            raise ImportError("`rasterio` is required for tiff.select") from err

        from rasterio.mask import raster_geometry_mask

        if len(input_args) != 3:
            raise ValueError("tiff.select expects exactly 3 arguments: expr, shape, indexes")

        # The ref cell itself arrives as `fp`; the first array argument is not needed here.
        _, shape, indexes = input_args

        shape = shape[0].as_py()
        if shape is None:
            raise ValueError("tiff.select expects shape to be a GeoJSON-like shape")

        indexes = indexes[0].as_py()
        if indexes is not None and not isinstance(indexes, int) and not isinstance(indexes, list):
            raise ValueError(f"tiff.select expects indexes to be None or an int or a list, got {indexes}")

        opener = _VsiOpener(fp)
        with rasterio.open("ref", opener=opener) as src:
            src: rasterio.DatasetReader

            shape_mask, _, window = raster_geometry_mask(src, [shape], crop=True)

            # The leading dimension of `out_shape` must match the bands actually requested,
            # not the dataset's total band count.
            if indexes is None:
                band_count = src.count
            elif isinstance(indexes, int):
                band_count = 1
            else:
                band_count = len(indexes)
            out_shape = (band_count,) + shape_mask.shape

            result: np.ndarray = src.read(window=window, indexes=indexes, out_shape=out_shape, masked=True)

        if result.ndim == 2:
            # A single band number yields a 2D (height, width) array; restore the band axis.
            result = result[np.newaxis, ...]
        return _return_result(result, indexes)
162
+
163
+
164
def _return_result(result: np.ndarray, indexes) -> pa.Array:
    """Wrap a decoded raster in a one-element Arrow array of type `_TIFF_RES_DTYPE`.

    Args:
        result: Pixel data shaped (channels, height, width), or (height, width) for a
            single-band read.
        indexes: The band selection that produced `result` (None, an int, or a list); used
            only to sanity-check the channel count.

    Returns:
        A `pa.Array` with a single struct element holding the raw bytes and their dimensions.

    Raises:
        ValueError: If the dtype of `result` is neither uint8 nor uint16.
    """
    if result.ndim == 2:
        # rasterio returns a 2D array when `indexes` is a single band number. Add the band
        # axis back so shape[0] is the channel count rather than the height.
        result = result[np.newaxis, ...]

    channels = result.shape[0]
    if indexes is None:
        pass
    elif isinstance(indexes, int):
        assert channels == 1, f"Expected 1 channel, got {channels}"
    else:
        assert channels == len(indexes), f"Expected {len(indexes)} channels, got {channels}"

    if result.dtype == np.uint8:
        channel_bit_depth = 8
    elif result.dtype == np.uint16:
        channel_bit_depth = 16
    else:
        raise ValueError(f"Unsupported bit width: {result.dtype}")

    return pa.array(
        [
            {
                "pixels": result.tobytes(),
                "height": result.shape[1],
                "width": result.shape[2],
                "channels": channels,
                "channel_bit_depth": channel_bit_depth,
            }
        ],
        type=_TIFF_RES_DTYPE,
    )
192
+
193
+
194
+ class _VsiOpener:
195
+ """
196
+ VSI file opener which returns a constant file-like on open.
197
+
198
+ Must match https://rasterio.readthedocs.io/en/stable/topics/vsi.html#python-file-and-filesystem-openers spec but
199
+ only `open` is needed when going through rasterio.
200
+ """
201
+
202
+ def __init__(self, file_like):
203
+ self._file_like = file_like
204
+
205
+ def open(self, _path, mode):
206
+ if mode not in {"r", "rb"}:
207
+ raise ValueError(f"Unsupported mode: {mode}")
208
+ return self._file_like
209
+
210
+ def isdir(self, _):
211
+ return False
212
+
213
+ def isfile(self, _):
214
+ return False
215
+
216
+ def mtime(self, _):
217
+ return 0
218
+
219
+ def size(self, _):
220
+ return self._file_like.size()
221
+
222
+ def modified(self, _):
223
+ raise NotImplementedError
@@ -0,0 +1,46 @@
1
+ import abc
2
+
3
+ import pyarrow as pa
4
+
5
+ from spiral import _lib
6
+ from spiral.expressions.base import Expr
7
+
8
+
9
class BaseUDF(abc.ABC):
    """Common base for UDF wrappers: holds the native UDF handle and builds call expressions.

    Deriving from `abc.ABC` is what makes the `@abc.abstractmethod` markers effective, so a
    subclass that does not implement every abstract method cannot be instantiated.
    """

    def __init__(self, udf):
        # `udf` is called with the list of lifted argument expressions in `__call__`.
        self._udf = udf

    def __call__(self, *args) -> Expr:
        """Create an expression that calls this UDF with the given arguments."""
        from spiral import expressions as se

        args = [se.lift(arg).__expr__ for arg in args]
        return Expr(self._udf(args))

    @abc.abstractmethod
    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        """Return the Arrow type of the UDF result for the given input types."""
        ...
22
+
23
+
24
class UDF(BaseUDF):
    """A User-Defined Function (UDF)."""

    def __init__(self, name: str):
        # Register this instance's bound methods as the native UDF's callbacks.
        native_udf = _lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke)
        super().__init__(native_udf)

    @abc.abstractmethod
    def invoke(self, *input_args: pa.Array) -> pa.Array:
        """Compute the UDF result for the given input arrays."""
        ...
32
+
33
+
34
class RefUDF(BaseUDF):
    """A UDF over a single ref cell, and therefore can access the file object."""

    def __init__(self, name: str):
        # Same registration as a plain UDF, but created with scope="ref".
        native_udf = _lib.expr.udf.create(
            name,
            return_type=self.return_type,
            invoke=self.invoke,
            scope="ref",
        )
        super().__init__(native_udf)

    @abc.abstractmethod
    def invoke(self, fp: _lib.FileObject, *input_args: pa.Array) -> pa.Array:
        """Invoke the UDF with the given arguments.

        NOTE: The first argument is always the ref cell. All array input args will be sliced to the appropriate row.
        """
        ...
spiral/grpc_.py ADDED
@@ -0,0 +1,32 @@
1
+ from collections.abc import AsyncIterator, Awaitable, Callable
2
+ from typing import TypeVar
3
+
4
+ R = TypeVar("R")
5
+ T = TypeVar("T")
6
+
7
+
8
async def paged(stub_fn: Callable[[R], Awaitable[T]], request: R, page_size: int = None) -> AsyncIterator[T]:
    """Page through a gRPC paged API, yielding each response page.

    Assumes fields exist as per https://cloud.google.com/apis/design/design_patterns#list_pagination

    The `request` object is mutated in place: `page_size` and `page_token` are set before every call.
    """
    token: str | None = None
    has_more = True
    while has_more:
        request.page_size = page_size
        request.page_token = token
        page = await stub_fn(request)

        # An empty or missing next-page token marks the last page; it is still yielded.
        token = page.next_page_token
        has_more = bool(token)
        yield page
25
+
26
+
27
async def paged_items(
    stub_fn: Callable[[R], Awaitable[T]], request: R, collection_name: str, page_size: int = None
) -> AsyncIterator:
    """Yield the individual items of a paged gRPC API.

    Iterates the pages produced by `paged` and yields each element of the page attribute
    named `collection_name`.
    """
    pages = paged(stub_fn, request, page_size=page_size)
    async for response in pages:
        collection = getattr(response, collection_name)
        for entry in collection:
            yield entry
@@ -0,0 +1,3 @@
1
+ from spiral.iceberg.client import Iceberg
2
+
3
+ __all__ = ["Iceberg"]
@@ -0,0 +1,33 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ if TYPE_CHECKING:
4
+ from pyiceberg.catalog import Catalog
5
+
6
+ from spiral.client import Spiral
7
+
8
+
9
class Iceberg:
    """
    Apache Iceberg is a powerful open-source table format designed for high-performance data lakes.
    Iceberg brings reliability, scalability, and advanced features like time travel, schema evolution,
    and ACID transactions to your warehouse.
    """

    def __init__(self, spiral: "Spiral", *, project_id: str | None = None):
        self._spiral = spiral
        self._project_id = project_id

        self._api = self._spiral.config.api

    def catalog(self) -> "Catalog":
        """Open the Iceberg catalog.

        Loads a REST catalog named "default" whose URI is the configured SpiralDB URI with
        "/iceberg" appended, authenticated with the current token.
        """
        from pyiceberg.catalog import load_catalog

        config = self._spiral.config
        properties = {
            "type": "rest",
            "uri": config.spiraldb.uri + "/iceberg",
            "token": config.authn.token().expose_secret(),
        }
        return load_catalog("default", **properties)
@@ -0,0 +1,5 @@
1
+ from spiral.indexes.client import Indexes
2
+ from spiral.indexes.index import TextIndex
3
+ from spiral.indexes.scan import SearchScan
4
+
5
+ __all__ = ["Indexes", "SearchScan", "TextIndex"]