pyspiral 0.2.5__cp310-abi3-macosx_11_0_arm64.whl → 0.4.0__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/METADATA +12 -14
  2. pyspiral-0.4.0.dist-info/RECORD +98 -0
  3. {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/WHEEL +1 -1
  4. spiral/__init__.py +6 -7
  5. spiral/_lib.abi3.so +0 -0
  6. spiral/adbc.py +21 -14
  7. spiral/api/__init__.py +15 -172
  8. spiral/api/admin.py +12 -26
  9. spiral/api/client.py +160 -0
  10. spiral/api/filesystems.py +100 -72
  11. spiral/api/organizations.py +45 -58
  12. spiral/api/projects.py +171 -134
  13. spiral/api/telemetry.py +19 -0
  14. spiral/api/types.py +20 -0
  15. spiral/api/workloads.py +32 -25
  16. spiral/{arrow.py → arrow_.py} +12 -0
  17. spiral/cli/__init__.py +2 -5
  18. spiral/cli/admin.py +7 -12
  19. spiral/cli/app.py +23 -6
  20. spiral/cli/console.py +1 -1
  21. spiral/cli/fs.py +83 -18
  22. spiral/cli/iceberg/__init__.py +7 -0
  23. spiral/cli/iceberg/namespaces.py +47 -0
  24. spiral/cli/iceberg/tables.py +60 -0
  25. spiral/cli/indexes/__init__.py +19 -0
  26. spiral/cli/login.py +14 -5
  27. spiral/cli/orgs.py +90 -0
  28. spiral/cli/printer.py +9 -1
  29. spiral/cli/projects.py +136 -0
  30. spiral/cli/state.py +2 -0
  31. spiral/cli/tables/__init__.py +121 -0
  32. spiral/cli/telemetry.py +18 -0
  33. spiral/cli/types.py +8 -10
  34. spiral/cli/{workload.py → workloads.py} +11 -11
  35. spiral/{catalog.py → client.py} +22 -21
  36. spiral/core/client/__init__.pyi +117 -0
  37. spiral/core/index/__init__.pyi +15 -0
  38. spiral/core/table/__init__.pyi +108 -0
  39. spiral/core/{manifests → table/manifests}/__init__.pyi +5 -23
  40. spiral/core/table/metastore/__init__.pyi +62 -0
  41. spiral/core/{spec → table/spec}/__init__.pyi +49 -92
  42. spiral/datetime_.py +27 -0
  43. spiral/expressions/__init__.py +40 -17
  44. spiral/expressions/base.py +5 -5
  45. spiral/expressions/list_.py +1 -1
  46. spiral/expressions/mp4.py +62 -0
  47. spiral/expressions/png.py +18 -0
  48. spiral/expressions/qoi.py +18 -0
  49. spiral/expressions/refs.py +23 -9
  50. spiral/expressions/struct.py +7 -5
  51. spiral/expressions/text.py +62 -0
  52. spiral/expressions/tiff.py +88 -88
  53. spiral/expressions/udf.py +3 -3
  54. spiral/iceberg/__init__.py +3 -0
  55. spiral/iceberg/client.py +33 -0
  56. spiral/indexes/__init__.py +5 -0
  57. spiral/indexes/client.py +137 -0
  58. spiral/indexes/index.py +34 -0
  59. spiral/indexes/scan.py +22 -0
  60. spiral/project.py +19 -110
  61. spiral/{proto → protogen}/_/scandal/__init__.py +32 -77
  62. spiral/protogen/_/spiral/table/__init__.py +22 -0
  63. spiral/protogen/substrait/__init__.py +3399 -0
  64. spiral/protogen/substrait/extensions/__init__.py +115 -0
  65. spiral/server.py +17 -0
  66. spiral/settings.py +31 -87
  67. spiral/substrait_.py +10 -6
  68. spiral/tables/__init__.py +12 -0
  69. spiral/tables/client.py +130 -0
  70. spiral/{dataset.py → tables/dataset.py} +36 -25
  71. spiral/tables/debug/manifests.py +70 -0
  72. spiral/tables/debug/metrics.py +56 -0
  73. spiral/{debug.py → tables/debug/scan.py} +6 -9
  74. spiral/tables/maintenance.py +12 -0
  75. spiral/tables/scan.py +193 -0
  76. spiral/tables/snapshot.py +78 -0
  77. spiral/tables/table.py +157 -0
  78. spiral/tables/transaction.py +52 -0
  79. pyspiral-0.2.5.dist-info/RECORD +0 -81
  80. spiral/api/tables.py +0 -94
  81. spiral/api/tokens.py +0 -56
  82. spiral/authn/authn.py +0 -89
  83. spiral/authn/device.py +0 -206
  84. spiral/authn/github_.py +0 -33
  85. spiral/authn/modal_.py +0 -18
  86. spiral/cli/org.py +0 -90
  87. spiral/cli/project.py +0 -107
  88. spiral/cli/table.py +0 -20
  89. spiral/cli/token.py +0 -27
  90. spiral/config.py +0 -26
  91. spiral/core/core/__init__.pyi +0 -53
  92. spiral/core/metastore/__init__.pyi +0 -91
  93. spiral/proto/_/spfs/__init__.py +0 -36
  94. spiral/proto/_/spiral/table/__init__.py +0 -225
  95. spiral/proto/_/spiraldb/metastore/__init__.py +0 -499
  96. spiral/proto/__init__.py +0 -0
  97. spiral/proto/scandal/__init__.py +0 -45
  98. spiral/proto/spiral/__init__.py +0 -0
  99. spiral/proto/spiral/table/__init__.py +0 -96
  100. spiral/scan_.py +0 -168
  101. spiral/table.py +0 -157
  102. {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/entry_points.txt +0 -0
  103. /spiral/{authn/__init__.py → core/__init__.pyi} +0 -0
  104. /spiral/{core → protogen/_}/__init__.py +0 -0
  105. /spiral/{proto/_ → protogen/_/arrow}/__init__.py +0 -0
  106. /spiral/{proto/_/arrow → protogen/_/arrow/flight}/__init__.py +0 -0
  107. /spiral/{proto/_/arrow/flight → protogen/_/arrow/flight/protocol}/__init__.py +0 -0
  108. /spiral/{proto → protogen}/_/arrow/flight/protocol/sql/__init__.py +0 -0
  109. /spiral/{proto/_/arrow/flight/protocol → protogen/_/spiral}/__init__.py +0 -0
  110. /spiral/{proto → protogen/_}/substrait/__init__.py +0 -0
  111. /spiral/{proto → protogen/_}/substrait/extensions/__init__.py +0 -0
  112. /spiral/{proto/_/spiral → protogen}/__init__.py +0 -0
  113. /spiral/{proto → protogen}/util.py +0 -0
  114. /spiral/{proto/_/spiraldb → tables/debug}/__init__.py +0 -0
@@ -0,0 +1,62 @@
1
+ from spiral.expressions.base import Expr, ExprLike
2
+
3
+
4
def field(expr: ExprLike, field_name: str | None = None, tokenizer: str | None = None) -> Expr:
    """Configure a column for text indexing.

    Args:
        expr: An input column. The expression must either evaluate to a UTF-8 string,
            or, if a `field_name` is provided, to a struct with a field of that name.
        field_name: If provided, the expression must evaluate to a struct with a field of that name.
            The given field will be indexed.
        tokenizer: If provided, the text will be tokenized using the given tokenizer.

    Returns:
        An expression that can be used to construct a text index.
    """
    from spiral import _lib
    from spiral.expressions import getitem, lift, merge, pack

    expr = lift(expr)
    if field_name is None:
        return Expr(_lib.expr.text.field(expr.__expr__, tokenizer))

    # Index only the named struct field. The tokenizer is forwarded here too, so that
    # it is honoured for struct inputs exactly as it is for plain string columns.
    # The native expression is wrapped in `Expr` (as in the branch above) so that
    # `pack` receives a regular expression object.
    child = Expr(_lib.expr.text.field(getitem(expr, field_name).__expr__, tokenizer))

    # Overlay the indexed field on top of the original struct expression.
    return merge(
        expr,
        pack({field_name: child}),
    )
29
+
30
+
31
def find(expr: ExprLike, term: str) -> Expr:
    """Build a ranking expression that searches an index field for a term.

    Args:
        expr: An index field.
        term: The term to search for.

    Returns:
        An expression that can be used in ranking for text search.
    """
    from spiral import _lib
    from spiral.expressions import lift

    index_field = lift(expr)
    native = _lib.expr.text.find(index_field.__expr__, term)
    return Expr(native)
46
+
47
+
48
def boost(expr: ExprLike, factor: float) -> Expr:
    """Scale the relevance contributed by a ranking expression.

    Args:
        expr: Rank by expression.
        factor: The factor by which to boost the relevance.

    Returns:
        An expression that can be used in ranking for text search.
    """
    from spiral import _lib
    from spiral.expressions import lift

    ranking = lift(expr)
    native = _lib.expr.text.boost(ranking.__expr__, factor)
    return Expr(native)
@@ -1,42 +1,44 @@
1
1
  import numpy as np
2
2
  import pyarrow as pa
3
3
 
4
- from spiral.expressions.base import ExprLike
4
+ from spiral.expressions.base import Expr, ExprLike
5
5
  from spiral.expressions.udf import RefUDF
6
6
 
7
+ _TIFF_RES_DTYPE: pa.DataType = pa.struct(
8
+ [
9
+ pa.field("pixels", pa.large_binary()),
10
+ pa.field("height", pa.uint32()),
11
+ pa.field("width", pa.uint32()),
12
+ pa.field("channels", pa.uint8()),
13
+ pa.field("channel_bit_depth", pa.uint8()),
14
+ ]
15
+ )
16
+
7
17
 
8
18
  def read(
9
19
  expr: ExprLike,
10
- indexes: ExprLike | int | list[int] | None = None,
20
+ indexes: ExprLike | int | None = None,
11
21
  window: ExprLike | tuple[tuple[int, int], tuple[int, int]] | None = None,
12
22
  boundless: ExprLike | bool | None = None,
13
- ):
23
+ ) -> Expr:
14
24
  """
15
25
  Read referenced cell in a `TIFF` format. Requires `rasterio` to be installed.
16
26
 
17
27
  Args:
18
28
  expr: The referenced `TIFF` bytes.
19
- indexes: The band indexes to read. Defaults to first band. The first dimension of the result's `shape` field
20
- is either 1 or the number of indexes.
29
+ indexes: The band indexes to read. Defaults to all.
21
30
  window: The window to read. In format (row_range_tuple, col_range_tuple). Defaults to full window.
22
31
  boundless: If `True`, windows that extend beyond the dataset's extent
23
32
  are permitted and partially or completely filled arrays will be returned as appropriate.
24
33
 
25
34
  Returns:
26
- An array where each element is a NumPy array represented as a struct with fields:
27
- bytes: Array bytes with type `pa.large_binary()`.
28
- shape: Array shape with type `pa.list_(pa.uint32(), 3)`.
29
- dtype: String representation of NumPy dtype with type `pa.string()`.
30
-
31
- Example:
32
- A way to get the i-th element in the result as NumPy array:
33
-
34
- ```
35
- array: np.ndarray = np.frombuffer(
36
- result["bytes"][i].as_py(),
37
- dtype=np.dtype(result["dtype"][i].as_py()),
38
- ).reshape(tuple(result["shape"][i].as_py()))
39
- ```
35
+ An array where each element is a decoded image with fields:
36
+ pixels: bytes of shape (channels, height, width).
37
+ width: Width of the image with type `pa.uint32()`.
38
+ height: Height of the image with type `pa.uint32()`.
39
+ channels: Number of channels of the image with type `pa.uint8()`.
40
+ If `indexes` is not None, this is the length of `indexes` or 1 if `indexes` is an int.
41
+ channel_bit_depth: Bit depth of the channel with type `pa.uint8()`.
40
42
  """
41
43
  try:
42
44
  import rasterio # noqa: F401
@@ -46,55 +48,42 @@ def read(
46
48
  return TiffReadUDF()(expr, indexes, window, boundless)
47
49
 
48
50
 
49
- def crop(
51
+ def select(
50
52
  expr: ExprLike,
51
- shape: ExprLike,
52
- ):
53
+ shape: ExprLike | dict,
54
+ indexes: ExprLike | int | None = None,
55
+ ) -> Expr:
53
56
  """
54
- Crop shapes out of the referenced cell in a `TIFF` format. Requires `rasterio` to be installed.
57
+ Select the shape out of the referenced cell in a `TIFF` format. Requires `rasterio` to be installed.
55
58
 
56
59
  Args:
57
60
  expr: The referenced `TIFF` bytes.
58
61
  shape: [GeoJSON-like](https://geojson.org/) shape.
62
+ indexes: The band indexes to read. Defaults to all.
59
63
 
60
64
  Returns:
61
- An array where each element is a NumPy array represented as a struct with fields:
62
- bytes: Array bytes with type `pa.large_binary()`.
63
- shape: Array shape with type `pa.list_(pa.uint32(), 3)`.
64
- dtype: String representation of NumPy dtype with type `pa.string()`.
65
-
66
- Example:
67
- A way to get the i-th element in the result as NumPy array:
68
-
69
- ```
70
- array: np.ndarray = np.frombuffer(
71
- result["bytes"][i].as_py(),
72
- dtype=np.dtype(result["dtype"][i].as_py()),
73
- ).reshape(tuple(result["shape"][i].as_py()))
74
- ```
65
+ An array where each element is a decoded image with fields:
66
+ pixels: bytes of shape (len(indexes) or 1, height, width).
67
+ width: Width of the image with type `pa.uint32()`.
68
+ height: Height of the image with type `pa.uint32()`.
69
+ channels: Number of channels of the image with type `pa.uint8()`.
70
+ If `indexes` is not None, this is the length of `indexes` or 1 if `indexes` is an int.
71
+ channel_bit_depth: Bit depth of the channel with type `pa.uint8()`.
75
72
  """
76
73
  try:
77
74
  import rasterio # noqa: F401
78
75
  except ImportError:
79
- raise ImportError("`rasterio` is required for tiff.crop")
76
+ raise ImportError("`rasterio` is required for tiff.select")
80
77
 
81
- return TiffCropUDF()(expr, shape)
78
+ return TiffSelectUDF()(expr, shape, indexes)
82
79
 
83
80
 
84
81
  class TiffReadUDF(RefUDF):
85
- RES_DTYPE: pa.DataType = pa.struct(
86
- [
87
- pa.field("bytes", pa.large_binary()),
88
- pa.field("shape", pa.list_(pa.uint32(), 3)),
89
- pa.field("dtype", pa.string()),
90
- ]
91
- )
92
-
93
82
  def __init__(self):
94
83
  super().__init__("tiff.read")
95
84
 
96
85
  def return_type(self, *input_types: pa.DataType) -> pa.DataType:
97
- return TiffReadUDF.RES_DTYPE
86
+ return _TIFF_RES_DTYPE
98
87
 
99
88
  def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
100
89
  try:
@@ -130,65 +119,76 @@ class TiffReadUDF(RefUDF):
130
119
  # This matters more if we want to rewrite this function to work with multiple inputs at once, in which
131
120
  # case we should first consider using Rust GDAL bindings - I believe rasterio uses GDAL under the hood.
132
121
  result: np.ndarray = src.read(indexes=indexes, window=window)
133
- return pa.array(
134
- [
135
- {
136
- "bytes": result.tobytes(),
137
- "shape": list(result.shape),
138
- "dtype": str(result.dtype),
139
- }
140
- ],
141
- type=TiffReadUDF.RES_DTYPE,
142
- )
143
-
144
-
145
- class TiffCropUDF(RefUDF):
146
- RES_DTYPE: pa.DataType = pa.struct(
147
- [
148
- pa.field("bytes", pa.large_binary()),
149
- pa.field("shape", pa.list_(pa.uint32()), 3),
150
- pa.field("dtype", pa.string()),
151
- ]
152
- )
122
+ return _return_result(result, indexes)
123
+
153
124
 
125
class TiffSelectUDF(RefUDF):
    """Ref-scoped UDF that reads the window of a TIFF covered by a GeoJSON-like shape."""

    def __init__(self):
        super().__init__("tiff.select")

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        """Return the shared decoded-image struct type, regardless of the input types."""
        return _TIFF_RES_DTYPE

    def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
        """Read the bounding window of `shape` from the referenced TIFF.

        Args:
            fp: File object for the referenced cell.
            *input_args: Exactly three arrays: the ref cell (unused here), the shape,
                and the band indexes. Only the first element of each is read.

        Returns:
            A single-element array of type `_TIFF_RES_DTYPE`.

        Raises:
            ImportError: If `rasterio` is not installed.
            ValueError: If the argument count is wrong, the shape is null, or
                `indexes` is neither None, an int, nor a list.
        """
        try:
            import rasterio
        except ImportError:
            raise ImportError("`rasterio` is required for tiff.select")

        from rasterio.mask import raster_geometry_mask

        if len(input_args) != 3:
            raise ValueError("tiff.select expects exactly 3 arguments: expr, shape, indexes")

        _, shape, indexes = input_args

        shape = shape[0].as_py()
        if shape is None:
            raise ValueError("tiff.select expects shape to be a GeoJSON-like shape")

        indexes = indexes[0].as_py()
        if indexes is not None and not isinstance(indexes, (int, list)):
            raise ValueError(f"tiff.select expects indexes to be None or an int or a list, got {indexes}")

        # rasterio returns a 2-D array for a bare int index; always request a list so the
        # result is (bands, height, width), which is the layout `_return_result` expects.
        band_indexes = [indexes] if isinstance(indexes, int) else indexes

        opener = _VsiOpener(fp)
        with rasterio.open("ref", opener=opener) as src:
            src: rasterio.DatasetReader

            # NOTE(review): only the cropped window is used; `shape_mask` itself is not
            # applied to the pixels - confirm this is intended.
            shape_mask, _, window = raster_geometry_mask(src, [shape], crop=True)

            # The band dimension of the output buffer must match the bands actually
            # requested, not the total number of bands in the dataset.
            band_count = src.count if band_indexes is None else len(band_indexes)
            out_shape = (band_count,) + shape_mask.shape

            result: np.ndarray = src.read(window=window, indexes=band_indexes, out_shape=out_shape, masked=True)
            return _return_result(result, indexes)
162
+
163
+
164
def _return_result(result: np.ndarray, indexes) -> pa.Array:
    """Pack a decoded raster into a single-element array of type `_TIFF_RES_DTYPE`.

    Args:
        result: Raster data laid out as (channels, height, width), or as
            (height, width) when a single band was read with an int index.
        indexes: The band selection used for the read: None, an int, or a list.
            Only used to sanity-check the number of channels.

    Returns:
        A one-element struct array with `pixels`, `height`, `width`, `channels`
        and `channel_bit_depth` fields.

    Raises:
        ValueError: If the array is not 2-D/3-D or its dtype is not uint8/uint16.
    """
    # rasterio yields a 2-D array when `indexes` is a single band number; promote it
    # to (1, height, width) so the shape unpacking below is uniform.
    if result.ndim == 2:
        result = result[np.newaxis, ...]
    if result.ndim != 3:
        raise ValueError(f"Expected a 2D or 3D raster, got shape {result.shape}")

    channels, height, width = result.shape
    if indexes is None:
        pass
    elif isinstance(indexes, int):
        assert channels == 1, f"Expected 1 channel, got {channels}"
    else:
        assert channels == len(indexes), f"Expected {len(indexes)} channels, got {channels}"

    if result.dtype == np.uint8:
        channel_bit_depth = 8
    elif result.dtype == np.uint16:
        channel_bit_depth = 16
    else:
        raise ValueError(f"Unsupported bit width: {result.dtype}")

    return pa.array(
        [
            {
                "pixels": result.tobytes(),
                "height": height,
                "width": width,
                "channels": channels,
                "channel_bit_depth": channel_bit_depth,
            }
        ],
        type=_TIFF_RES_DTYPE,
    )
192
192
 
193
193
 
194
194
  class _VsiOpener:
spiral/expressions/udf.py CHANGED
@@ -25,7 +25,7 @@ class UDF(BaseUDF):
25
25
  """A User-Defined Function (UDF)."""
26
26
 
27
27
  def __init__(self, name: str):
28
- super().__init__(_lib.spql.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke))
28
+ super().__init__(_lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke))
29
29
 
30
30
  @abc.abstractmethod
31
31
  def invoke(self, *input_args: pa.Array) -> pa.Array: ...
@@ -35,10 +35,10 @@ class RefUDF(BaseUDF):
35
35
  """A UDF over a single ref cell, and therefore can access the file object."""
36
36
 
37
37
  def __init__(self, name: str):
38
- super().__init__(_lib.spql.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke, scope="ref"))
38
+ super().__init__(_lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke, scope="ref"))
39
39
 
40
40
  @abc.abstractmethod
41
- def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
41
+ def invoke(self, fp: _lib.FileObject, *input_args: pa.Array) -> pa.Array:
42
42
  """Invoke the UDF with the given arguments.
43
43
 
44
44
  NOTE: The first argument is always the ref cell. All array input args will be sliced to the appropriate row.
@@ -0,0 +1,3 @@
1
+ from spiral.iceberg.client import Iceberg
2
+
3
+ __all__ = ["Iceberg"]
@@ -0,0 +1,33 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ if TYPE_CHECKING:
4
+ from pyiceberg.catalog import Catalog
5
+
6
+ from spiral.client import Spiral
7
+
8
+
9
class Iceberg:
    """
    Apache Iceberg is a powerful open-source table format designed for high-performance data lakes.
    Iceberg brings reliability, scalability, and advanced features like time travel, schema evolution,
    and ACID transactions to your warehouse.
    """

    def __init__(self, spiral: "Spiral", *, project_id: str | None = None):
        # Keep the client and the optional project scope; the API handle is taken
        # from the client's configuration.
        self._spiral = spiral
        self._project_id = project_id

        self._api = spiral.config.api

    def catalog(self) -> "Catalog":
        """Open the Iceberg catalog."""
        from pyiceberg.catalog import load_catalog

        config = self._spiral.config
        # REST catalog served under the `/iceberg` path of the configured URI,
        # authenticated with the current token.
        properties = {
            "type": "rest",
            "uri": config.spiraldb.uri + "/iceberg",
            "token": config.authn.token().expose_secret(),
        }
        return load_catalog("default", **properties)
@@ -0,0 +1,5 @@
1
+ from spiral.indexes.client import Indexes
2
+ from spiral.indexes.index import TextIndex
3
+ from spiral.indexes.scan import SearchScan
4
+
5
+ __all__ = ["Indexes", "SearchScan", "TextIndex"]
@@ -0,0 +1,137 @@
1
+ import datetime
2
+
3
+ from spiral.api import SpiralAPI
4
+ from spiral.api.projects import TextIndexResource
5
+ from spiral.core.client import Spiral as CoreSpiral
6
+ from spiral.expressions.base import ExprLike
7
+ from spiral.indexes.index import TextIndex
8
+ from spiral.indexes.scan import SearchScan
9
+ from spiral.types_ import Uri
10
+
11
+
12
class Indexes:
    """Client for creating, looking up, listing and searching text indexes."""

    def __init__(self, api: SpiralAPI, spiral: CoreSpiral, *, project_id: str | None = None):
        self._api = api
        self._spiral = spiral
        # Default project used when an identifier carries no project prefix.
        self._project_id = project_id

    def index(self, identifier: str) -> TextIndex:
        """Returns the index with the given identifier.

        Raises:
            ValueError: If no project can be resolved for the identifier, the
                identifier is malformed, or no index with that name exists.
        """
        project_id, index_name = self._parse_identifier(identifier)
        if project_id is None:
            raise ValueError("Must provide a fully qualified index identifier.")

        matches = list(self._api.project.list_text_indexes(project_id, name=index_name))
        if not matches:
            raise ValueError(f"Index not found: {project_id}.{index_name}")
        resource = matches[0]

        return TextIndex(self, self._spiral.get_text_index(resource.id), index_name)

    def list_indexes(self) -> list[TextIndexResource]:
        """List the text indexes of the client's project.

        Raises:
            ValueError: If the client was created without a project ID.
        """
        project_id = self._project_id
        if project_id is None:
            raise ValueError("Must provide a project ID to list indexes.")
        return list(self._api.project.list_text_indexes(project_id))

    def create_text_index(
        self,
        identifier: str,
        # At least one projection is required. All projections must reference the same table!
        # NOTE(marko): Indexes are currently independent of tables.
        #   That will likely change with the new root resource such as documents.
        *projections: ExprLike,
        where: ExprLike | None = None,
        root_uri: Uri | None = None,
        exist_ok: bool = False,
    ) -> TextIndex:
        """Creates a text index over the table projection.

        See `se.text.field` for how to create and configure indexable fields.

        Args:
            identifier: The index identifier, in the form `project.index` or `index`.
            projections: At least one projection expression is required.
                All projections must reference the same table.
            where: An optional filter expression to apply to the index.
            root_uri: The root URI for the index.
            exist_ok: If True, do not raise an error if the index already exists.

        Raises:
            ValueError: If no project can be resolved for the identifier, the
                identifier is malformed, or no projection is given.
        """
        from spiral import expressions as se

        project_id, index_name = self._parse_identifier(identifier)
        if project_id is None:
            raise ValueError("Must provide a fully qualified index identifier.")

        if not projections:
            raise ValueError("At least one projection is required.")
        projection = se.merge(*projections)

        # Test against None explicitly rather than relying on the truthiness of an
        # expression object.
        where_expr = se.lift(where).__expr__ if where is not None else None

        core_index = self._spiral.create_text_index(
            project_id,
            index_name,
            projection.__expr__,
            where_expr,
            root_uri=root_uri,
            # TODO(marko): Validate that if an index exists, it's the same?
            exist_ok=exist_ok,
        )

        return TextIndex(self, core_index, index_name)

    def _parse_identifier(self, identifier: str) -> tuple[str | None, str]:
        """Split `project.index` or `index` into `(project_id, index_name)`.

        A bare index name falls back to the client's project ID, which may be None.

        Raises:
            ValueError: If the identifier has more than two dot-separated parts.
        """
        parts = identifier.split(".")
        if len(parts) == 1:
            return self._project_id, parts[0]
        elif len(parts) == 2:
            return parts[0], parts[1]
        else:
            raise ValueError(f"Invalid index identifier: {identifier}")

    def search(
        self,
        *rank_by: ExprLike,
        where: ExprLike | None = None,
        top_k: int = 10,
        # Do not refresh the index if freshness does not exceed the freshness window.
        # NOTE(marko): The current implementation fails the query if the index is stale.
        freshness_window: datetime.timedelta | None = None,
    ) -> SearchScan:
        """Queries the index with the given rank by and where clauses.

        Rank by expressions are combined for scoring.
        See `se.text.find` and `se.text.boost` for scoring expressions.
        The `where` expression is used to filter the results.
        It must return a boolean value and use only conjunctions (ANDs). Expressions in where statement
        are considered either a `must` or `must_not` clause in search terminology.

        Args:
            rank_by: At least one rank by expression is required.
                These expressions are used to score the results.
            where: An optional filter expression to apply to the index.
                It must return a boolean value and use only conjunctions (ANDs).
            top_k: The number of top results to return.
            freshness_window: If provided, the index will not be refreshed if its freshness does not exceed this window.

        Raises:
            ValueError: If no rank by expression is given.
        """
        from spiral import expressions as se

        if not rank_by:
            raise ValueError("At least one rank by expression is required.")
        rank_expr = se.or_(*rank_by)

        # Test against None explicitly rather than relying on the truthiness of an
        # expression object.
        where_expr = se.lift(where).__expr__ if where is not None else None

        if freshness_window is None:
            freshness_window = datetime.timedelta(seconds=0)
        # The core API takes whole seconds; fractional seconds are truncated.
        freshness_window_s = int(freshness_window.total_seconds())

        return SearchScan(
            self._spiral.open_search_scan(
                rank_expr.__expr__,
                top_k=top_k,
                freshness_window_s=freshness_window_s,
                filter=where_expr,
            )
        )
@@ -0,0 +1,34 @@
1
+ import datetime
2
+ from typing import TYPE_CHECKING
3
+
4
+ from spiral.core.index import TextIndex as CoreTextIndex
5
+ from spiral.expressions import Expr
6
+
7
+ if TYPE_CHECKING:
8
+ from spiral.indexes import Indexes
9
+
10
+
11
class TextIndex(Expr):
    """A text index handle that can also be used as an expression."""

    def __init__(self, indexes: "Indexes", index: CoreTextIndex, name: str):
        super().__init__(index.__expr__)

        self._indexes = indexes
        self._index = index
        self._name = name

    @property
    def client(self) -> "Indexes":
        """The `Indexes` client this index was obtained from."""
        return self._indexes

    @property
    def index_id(self) -> str:
        """The ID of the underlying core index."""
        return self._index.id

    @property
    def name(self) -> str:
        """The index name this handle was created with."""
        return self._name

    def status(self) -> tuple[str, datetime.timedelta | None]:
        """Fetch the status of the index. If status is ready, returns the staleness of the index.

        Returns:
            A `(status, staleness)` pair; `staleness` is None when the core status
            reports no staleness.
        """
        status = self._index.status()
        staleness_s = status.staleness_s
        staleness = datetime.timedelta(seconds=staleness_s) if staleness_s is not None else None
        return status.status, staleness
spiral/indexes/scan.py ADDED
@@ -0,0 +1,22 @@
1
+ import pyarrow as pa
2
+
3
+ from spiral.core.index import SearchScan as CoreSearchScan
4
+ from spiral.settings import CI, DEV
5
+
6
+
7
class SearchScan:
    """Python-side wrapper around a core search scan."""

    def __init__(self, scan: CoreSearchScan):
        self._scan = scan

    def to_record_batches(self) -> pa.RecordBatchReader:
        """Read all results as a record batch reader."""
        return self._scan.to_record_batches()

    def to_table(self) -> pa.Table:
        """Read all results as a table."""
        single_batch = DEV and not CI
        if not single_batch:
            return self.to_record_batches().read_all()

        # NOTE: Evaluates fully on the Rust side, which improves debuggability.
        batch = self._scan.to_record_batch()
        return pa.Table.from_batches([batch])