pyspiral 0.4.0__pp310-pypy310_pp73-macosx_10_12_x86_64.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- pyspiral-0.4.0.dist-info/METADATA +46 -0
- pyspiral-0.4.0.dist-info/RECORD +98 -0
- pyspiral-0.4.0.dist-info/WHEEL +4 -0
- pyspiral-0.4.0.dist-info/entry_points.txt +2 -0
- spiral/__init__.py +10 -0
- spiral/_lib.pypy310-pp73-darwin.so +0 -0
- spiral/adbc.py +393 -0
- spiral/api/__init__.py +64 -0
- spiral/api/admin.py +15 -0
- spiral/api/client.py +160 -0
- spiral/api/filesystems.py +153 -0
- spiral/api/organizations.py +77 -0
- spiral/api/projects.py +197 -0
- spiral/api/telemetry.py +19 -0
- spiral/api/types.py +20 -0
- spiral/api/workloads.py +52 -0
- spiral/arrow_.py +221 -0
- spiral/cli/__init__.py +79 -0
- spiral/cli/__main__.py +4 -0
- spiral/cli/admin.py +16 -0
- spiral/cli/app.py +65 -0
- spiral/cli/console.py +95 -0
- spiral/cli/fs.py +112 -0
- spiral/cli/iceberg/__init__.py +7 -0
- spiral/cli/iceberg/namespaces.py +47 -0
- spiral/cli/iceberg/tables.py +60 -0
- spiral/cli/indexes/__init__.py +19 -0
- spiral/cli/login.py +22 -0
- spiral/cli/orgs.py +90 -0
- spiral/cli/printer.py +53 -0
- spiral/cli/projects.py +136 -0
- spiral/cli/state.py +5 -0
- spiral/cli/tables/__init__.py +121 -0
- spiral/cli/telemetry.py +18 -0
- spiral/cli/types.py +51 -0
- spiral/cli/workloads.py +59 -0
- spiral/client.py +79 -0
- spiral/core/__init__.pyi +0 -0
- spiral/core/client/__init__.pyi +117 -0
- spiral/core/index/__init__.pyi +15 -0
- spiral/core/table/__init__.pyi +108 -0
- spiral/core/table/manifests/__init__.pyi +35 -0
- spiral/core/table/metastore/__init__.pyi +62 -0
- spiral/core/table/spec/__init__.pyi +214 -0
- spiral/datetime_.py +27 -0
- spiral/expressions/__init__.py +245 -0
- spiral/expressions/base.py +149 -0
- spiral/expressions/http.py +86 -0
- spiral/expressions/io.py +100 -0
- spiral/expressions/list_.py +68 -0
- spiral/expressions/mp4.py +62 -0
- spiral/expressions/png.py +18 -0
- spiral/expressions/qoi.py +18 -0
- spiral/expressions/refs.py +58 -0
- spiral/expressions/str_.py +39 -0
- spiral/expressions/struct.py +59 -0
- spiral/expressions/text.py +62 -0
- spiral/expressions/tiff.py +223 -0
- spiral/expressions/udf.py +46 -0
- spiral/grpc_.py +32 -0
- spiral/iceberg/__init__.py +3 -0
- spiral/iceberg/client.py +33 -0
- spiral/indexes/__init__.py +5 -0
- spiral/indexes/client.py +137 -0
- spiral/indexes/index.py +34 -0
- spiral/indexes/scan.py +22 -0
- spiral/project.py +46 -0
- spiral/protogen/_/__init__.py +0 -0
- spiral/protogen/_/arrow/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +1990 -0
- spiral/protogen/_/scandal/__init__.py +178 -0
- spiral/protogen/_/spiral/__init__.py +0 -0
- spiral/protogen/_/spiral/table/__init__.py +22 -0
- spiral/protogen/_/substrait/__init__.py +3399 -0
- spiral/protogen/_/substrait/extensions/__init__.py +115 -0
- spiral/protogen/__init__.py +0 -0
- spiral/protogen/substrait/__init__.py +3399 -0
- spiral/protogen/substrait/extensions/__init__.py +115 -0
- spiral/protogen/util.py +41 -0
- spiral/py.typed +0 -0
- spiral/server.py +17 -0
- spiral/settings.py +101 -0
- spiral/substrait_.py +279 -0
- spiral/tables/__init__.py +12 -0
- spiral/tables/client.py +130 -0
- spiral/tables/dataset.py +250 -0
- spiral/tables/debug/__init__.py +0 -0
- spiral/tables/debug/manifests.py +70 -0
- spiral/tables/debug/metrics.py +56 -0
- spiral/tables/debug/scan.py +248 -0
- spiral/tables/maintenance.py +12 -0
- spiral/tables/scan.py +193 -0
- spiral/tables/snapshot.py +78 -0
- spiral/tables/table.py +157 -0
- spiral/tables/transaction.py +52 -0
- spiral/types_.py +6 -0
spiral/tables/debug/scan.py
ADDED
@@ -0,0 +1,248 @@
+from datetime import datetime
+
+from spiral.core.table import TableScan
+from spiral.core.table.manifests import FragmentFile, FragmentManifest
+from spiral.core.table.spec import Key
+from spiral.types_ import Timestamp
+
+
+def show_scan(scan: TableScan):
+    """Displays a scan in a way that is useful for debugging."""
+    table_ids = scan.table_ids()
+    if len(table_ids) > 1:
+        raise NotImplementedError("Scanning multiple tables is not supported.")
+    table_id = table_ids[0]
+    column_groups = scan.column_groups()
+
+    splits = scan.split()
+    key_space_scan = scan.key_space_scan(table_id)
+
+    # Collect all key bounds from all manifests. This makes sure all visualizations are aligned.
+    key_points = set()
+    key_space_manifest = key_space_scan.manifest
+    for i in range(len(key_space_manifest)):
+        fragment_file = key_space_manifest[i]
+        key_points.add(fragment_file.key_extent.min)
+        key_points.add(fragment_file.key_extent.max)
+    for cg in column_groups:
+        cg_scan = scan.column_group_scan(cg)
+        cg_manifest = cg_scan.manifest
+        for i in range(len(cg_manifest)):
+            fragment_file = cg_manifest[i]
+            key_points.add(fragment_file.key_extent.min)
+            key_points.add(fragment_file.key_extent.max)
+
+    # Make sure split points exist in all key points.
+    for s in splits[:-1]:  # Don't take the last end.
+        key_points.add(s.end)
+    key_points = list(sorted(key_points))
+
+    show_manifest(key_space_manifest, scope="Key space", key_points=key_points, splits=splits)
+    for cg in scan.column_groups():
+        cg_scan = scan.column_group_scan(cg)
+        # Skip table id from the start of the column group.
+        show_manifest(cg_scan.manifest, scope=".".join(cg.path[1:]), key_points=key_points, splits=splits)
+
+
+def show_manifest(manifest: FragmentManifest, scope: str | None = None, key_points: list[Key] | None = None, splits: list | None = None):
+    try:
+        import matplotlib.patches as patches
+        import matplotlib.pyplot as plt
+    except ImportError:
+        raise ImportError("matplotlib is required for debug")
+
+    total_fragments = len(manifest)
+
+    size_points = set()
+    for i in range(total_fragments):
+        manifest_file: FragmentFile = manifest[i]
+        size_points.add(manifest_file.size_bytes)
+    size_points = list(sorted(size_points))
+
+    if key_points is None:
+        key_points = set()
+
+        for i in range(total_fragments):
+            manifest_file: FragmentFile = manifest[i]
+
+            key_points.add(manifest_file.key_extent.min)
+            key_points.add(manifest_file.key_extent.max)
+
+        if splits is not None:
+            for split in splits[:-1]:
+                key_points.add(split.end)
+
+        key_points = list(sorted(key_points))
+
+    # Create figure and axis with specified size
+    fig, ax = plt.subplots(figsize=(12, 8))
+
+    # Plot each rectangle
+    for i in range(total_fragments):
+        manifest_file: FragmentFile = manifest[i]
+
+        left = key_points.index(manifest_file.key_extent.min)
+        right = key_points.index(manifest_file.key_extent.max)
+        height = size_points.index(manifest_file.size_bytes) + 1
+
+        color = _get_fragment_color(manifest_file, i, total_fragments)
+
+        # Create rectangle patch
+        rect = patches.Rectangle(
+            (left, 0),  # (x, y)
+            right - left,  # width
+            height,  # height
+            facecolor=color,  # fill color
+            edgecolor="black",  # border color
+            alpha=0.5,  # transparency
+            linewidth=1,  # border width
+            label=manifest_file.id,  # label for legend
+        )
+
+        ax.add_patch(rect)
+
+    # Set axis limits with some padding
+    ax.set_xlim(-0.5, len(key_points) - 1 + 0.5)
+    ax.set_ylim(-0.5, len(size_points) + 0.5)
+
+    # Create split markers on x-axis
+    if splits is not None:
+        split_positions = [key_points.index(split.end) for split in splits[:-1]]
+
+        # Add split markers at the bottom
+        for pos in split_positions:
+            ax.annotate("▲", xy=(pos, 0), ha="center", va="top", color="red", annotation_clip=False)
+
+    # Add grid
+    ax.grid(True, linestyle="--", alpha=0.7, zorder=0)
+
+    # Add labels and title
+    ax.set_title("Fragment Distribution" if scope is None else f"{scope} Fragment Distribution")
+    ax.set_xlabel("Key Index")
+    ax.set_ylabel("Size Index")
+
+    # Add legend
+    ax.legend(bbox_to_anchor=(1, 1), loc="upper left", fontsize="small")
+
+    # Adjust layout to prevent label cutoff
+    plt.tight_layout()
+
+    plot = FragmentManifestPlot(fig, ax, manifest)
+    fig.canvas.mpl_connect("motion_notify_event", plot.hover)
+
+    plt.show()
+
+
+def _get_fragment_color(manifest_file: FragmentFile, color_index, total_colors):
+    import matplotlib.cm as cm
+
+    if manifest_file.compacted_at is not None:
+        # Use a shade of gray for compacted fragments
+        # Vary the shade based on the index to distinguish different compacted fragments
+        gray_value = 0.3 + (0.5 * (color_index / total_colors))
+        return (gray_value, gray_value, gray_value)
+    else:
+        # Use viridis colormap for non-compacted fragments
+        return cm.viridis(color_index / total_colors)
+
+
+def _get_fragment_legend(manifest_file: FragmentFile):
+    return "\n".join(
+        [
+            f"id: {manifest_file.id}",
+            f"size: {manifest_file.size_bytes:,} bytes",
+            f"key_span: {manifest_file.key_span}",
+            f"key_min: {manifest_file.key_extent.min}",
+            f"key_max: {manifest_file.key_extent.max}",
+            f"format: {manifest_file.format}",
+            f"level: {manifest_file.level}",
+            f"committed_at: {_format_timestamp(manifest_file.committed_at)}",
+            f"compacted_at: {_format_timestamp(manifest_file.compacted_at)}",
+            f"ks_id: {manifest_file.ks_id}",
+        ]
+    )
+
+
+def _format_timestamp(ts: Timestamp | None) -> str:
+    # Format timestamp or show None
+    if ts is None:
+        return "None"
+    try:
+        return datetime.fromtimestamp(ts / 1e6).strftime("%Y-%m-%d %H:%M:%S")
+    except ValueError:
+        return str(ts)
+
+
+class FragmentManifestPlot:
+    def __init__(self, fig, ax, manifest: FragmentManifest):
+        self.fig = fig
+        self.ax = ax
+        self.manifest = manifest
+
+        # Position the annotation in the bottom right corner
+        self.annotation = ax.annotate(
+            "",
+            xy=(0.98, 0.02),  # Position in axes coordinates
+            xycoords="axes fraction",
+            bbox=dict(boxstyle="round,pad=0.5", fc="white", ec="gray", alpha=0.8),
+            ha="right",  # Right-align text
+            va="bottom",  # Bottom-align text
+            visible=False,
+        )
+        self.highlighted_rect = None
+        self.highlighted_legend = None
+
+    def hover(self, event):
+        if event.inaxes != self.ax:
+            # Check if we're hovering over the legend
+            legend = self.ax.get_legend()
+            if legend and legend.contains(event)[0]:
+                # Find which legend item we're hovering over
+                for i, legend_text in enumerate(legend.get_texts()):
+                    if legend_text.contains(event)[0]:
+                        manifest_file = self.manifest[i]
+                        self._show_legend(manifest_file, i, legend_text)
+                        return
+            self._hide_legend()
+            return
+
+        # Check rectangles in the main plot
+        for i, rect in enumerate(self.ax.patches):
+            if rect.contains(event)[0]:
+                manifest_file = self.manifest[i]
+                self._show_legend(manifest_file, i, rect)
+                return
+
+        self._hide_legend()
+
+    def _show_legend(self, manifest_file, index, highlight_obj):
+        import matplotlib.patches as patches
+
+        # Update tooltip text
+        self.annotation.set_text(_get_fragment_legend(manifest_file))
+        self.annotation.set_visible(True)
+
+        # Handle highlighting
+        if isinstance(highlight_obj, patches.Rectangle):
+            # Highlighting rectangle in main plot
+            if self.highlighted_rect and self.highlighted_rect != highlight_obj:
+                self.highlighted_rect.set_alpha(0.5)
+            highlight_obj.set_alpha(0.8)
+            self.highlighted_rect = highlight_obj
+        else:
+            # Highlighting legend text
+            if self.highlighted_rect:
+                self.highlighted_rect.set_alpha(0.5)
+            # Find and highlight corresponding rectangle
+            rect = self.ax.patches[index]
+            rect.set_alpha(0.8)
+            self.highlighted_rect = rect
+
+        self.fig.canvas.draw_idle()
+
+    def _hide_legend(self):
+        if self.annotation.get_visible():
+            self.annotation.set_visible(False)
+            if self.highlighted_rect:
+                self.highlighted_rect.set_alpha(0.5)
+            self.fig.canvas.draw_idle()
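The visualizer above is what `Scan._debug()` in spiral/tables/scan.py (later in this diff) delegates to. A minimal usage sketch, assuming a `Table` handle named `table` and matplotlib installed; the handle itself is illustrative:

# Hypothetical: visualize the fragment layout of a scan while debugging.
scan = table.scan()   # spiral.tables.scan.Scan
scan._debug()         # calls show_scan(scan._scan) under the hood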
spiral/tables/maintenance.py
ADDED
@@ -0,0 +1,12 @@
+from spiral.core.table import TableMaintenance
+
+
+class Maintenance:
+    """Spiral table maintenance."""
+
+    def __init__(self, maintenance: TableMaintenance):
+        self._maintenance = maintenance
+
+    def flush_wal(self):
+        """Flush the write-ahead log."""
+        self._maintenance.flush_wal()
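This wrapper is obtained via `Table.maintenance()` in spiral/tables/table.py below. A one-line usage sketch, assuming `table` is a `spiral.tables.table.Table`:

# Hypothetical: flush the write-ahead log for a table.
table.maintenance().flush_wal()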
spiral/tables/scan.py
ADDED
@@ -0,0 +1,193 @@
+from collections.abc import Iterator
+from typing import TYPE_CHECKING, Any
+
+import pyarrow as pa
+from datasets import DatasetInfo, Features
+
+from spiral.core.table import KeyRange, TableScan
+from spiral.core.table.spec import Schema
+from spiral.settings import CI, DEV
+
+if TYPE_CHECKING:
+    import dask.dataframe as dd
+    import pandas as pd
+    import polars as pl
+    from datasets import iterable_dataset
+
+
+class Scan:
+    """Scan object."""
+
+    def __init__(
+        self,
+        scan: TableScan,
+    ):
+        # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
+        # when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
+        # and will re-resolve the asof? Or should we just expose a scan that fixes the asof at construction time?
+        self._scan = scan
+
+    @property
+    def metrics(self) -> dict[str, Any]:
+        """Returns metrics about the scan."""
+        return self._scan.metrics()
+
+    @property
+    def schema(self) -> Schema:
+        """Returns the schema of the scan."""
+        return self._scan.schema()
+
+    def is_empty(self) -> bool:
+        """Check if the Spiral is empty for the given key range.
+
+        **IMPORTANT**: False negatives are possible, but false positives are not,
+        i.e. is_empty can return False and the scan can still return zero rows.
+        """
+        return self._scan.is_empty()
+
+    def to_record_batches(
+        self,
+        key_table: pa.Table | pa.RecordBatchReader | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+    ) -> pa.RecordBatchReader:
+        """Read as a stream of RecordBatches.
+
+        Args:
+            key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                If None, the scan will be executed without a key table.
+            batch_size: the maximum number of rows per returned batch.
+                IMPORTANT: This is currently only respected when the key_table is used. If the key table is a
+                RecordBatchReader, the batch_size argument must be None, and the existing batching is respected.
+            batch_readahead: the number of batches to prefetch in the background.
+        """
+        if isinstance(key_table, pa.RecordBatchReader):
+            if batch_size is not None:
+                raise ValueError(
+                    "batch_size must be None when key_table is a RecordBatchReader, the existing batching is respected."
+                )
+        elif isinstance(key_table, pa.Table):
+            key_table = key_table.to_reader(max_chunksize=batch_size)
+
+        return self._scan.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
+
+    def to_table(
+        self,
+        key_table: pa.Table | pa.RecordBatchReader | None = None,
+    ) -> pa.Table:
+        """Read into a single PyArrow Table.
+
+        Args:
+            key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                If None, the scan will be executed without a key table.
+        """
+        # NOTE: Evaluates fully on the Rust side, which improves debuggability.
+        if DEV and not CI and key_table is None:
+            rb = self._scan.to_record_batch()
+            return pa.Table.from_batches([rb])
+
+        return self.to_record_batches(key_table=key_table).read_all()
+
+    def to_dask(self) -> "dd.DataFrame":
+        """Read into a Dask DataFrame.
+
+        Requires the `dask` package to be installed.
+        """
+        import dask.dataframe as dd
+        import pandas as pd
+
+        def _read_key_range(key_range: KeyRange) -> pd.DataFrame:
+            # TODO(ngates): we need a way to preserve the existing asofs? Should we copy CoreScan instead of Query?
+            raise NotImplementedError()
+
+        # Fetch a set of partition ranges
+        return dd.from_map(_read_key_range, self._split())
+
+    def to_pandas(self) -> "pd.DataFrame":
+        """Read into a Pandas DataFrame.
+
+        Requires the `pandas` package to be installed.
+        """
+        return self.to_table().to_pandas()
+
+    def to_polars(self) -> "pl.DataFrame":
+        """Read into a Polars DataFrame.
+
+        Requires the `polars` package to be installed.
+        """
+        import polars as pl
+
+        # TODO(marko): This should support lazy dataframe.
+        return pl.from_arrow(self.to_record_batches())
+
+    def to_pytorch(
+        self,
+        batch_readahead: int | None = None,
+        shuffle_batch_size: int | None = None,
+        shuffle_pool_num_rows: int | None = None,
+    ) -> "iterable_dataset.IterableDataset":
+        """Returns an iterable dataset that can be used to build a PyTorch DataLoader.
+
+        Args:
+            batch_readahead: Number of batches to prefetch in the background.
+            shuffle_batch_size: Read granularity (in rows) for a shuffled scan. If both this and
+                shuffle_pool_num_rows are None, shuffling is disabled.
+            shuffle_pool_num_rows: Pool size for shuffling batches.
+        """
+        from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset
+
+        def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
+            if shuffle_batch_size is None and shuffle_pool_num_rows is None:
+                stream = self.to_record_batches(
+                    batch_readahead=batch_readahead,
+                )
+            else:
+                stream = self._scan.to_shuffled_record_batches(
+                    batch_readahead, shuffle_batch_size, shuffle_pool_num_rows
+                )
+
+            # This key is unused when training with IterableDataset.
+            # Default implementation returns shard id, e.g. parquet row group id.
+            for i, rb in enumerate(stream):
+                yield i, pa.Table.from_batches([rb], stream.schema)
+
+        def _hf_compatible_schema(schema: pa.Schema) -> pa.Schema:
+            """
+            Replace string-view columns in the schema with strings. We use this converted schema
+            as Features in the returned Dataset.
+            Remove this method once we have https://github.com/huggingface/datasets/pull/7718
+            """
+            new_fields = [
+                pa.field(field.name, pa.string(), nullable=field.nullable, metadata=field.metadata)
+                if field.type == pa.string_view()
+                else field
+                for field in schema
+            ]
+            return pa.schema(new_fields)
+
+        # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
+        ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
+        info = DatasetInfo(features=Features.from_arrow_schema(_hf_compatible_schema(self.schema.to_arrow())))
+        return IterableDataset(ex_iterable=ex_iterable, info=info)
+
+    def _split(self) -> list[KeyRange]:
+        # Splits the scan into a set of key ranges.
+        return self._scan.split()
+
+    def _debug(self):
+        # Visualizes the scan, mainly for debugging purposes.
+        from spiral.tables.debug.scan import show_scan
+
+        show_scan(self._scan)
+
+    def _dump_manifests(self):
+        # Print manifests in a human-readable format.
+        from spiral.tables.debug.manifests import display_manifests
+
+        display_manifests(self._scan)
+
+    def _dump_metrics(self):
+        # Print metrics in a human-readable format.
+        from spiral.tables.debug.metrics import display_metrics
+
+        display_metrics(self.metrics)
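The batching contract of `to_record_batches` above is easy to misuse: `batch_size` only takes effect when `key_table` is a `pa.Table` (it becomes the reader's `max_chunksize`), and it must stay None when `key_table` is already a `pa.RecordBatchReader`. A sketch under those rules, assuming `scan` is a `Scan` and a key column named "id" (both illustrative):

import pyarrow as pa

# Plain streaming read; batch_size would be ignored on this path.
for batch in scan.to_record_batches(batch_readahead=4):
    ...

# Keyed "take": batch_size is respected because key_table is a pa.Table.
keys = pa.table({"id": pa.array([1, 2, 3])})
reader = scan.to_record_batches(key_table=keys, batch_size=2)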
spiral/tables/snapshot.py
ADDED
@@ -0,0 +1,78 @@
+from typing import TYPE_CHECKING
+
+from spiral.core.table import TableSnapshot
+from spiral.expressions import ExprLike
+from spiral.tables.scan import Scan
+from spiral.types_ import Timestamp
+
+if TYPE_CHECKING:
+    import duckdb
+    import polars as pl
+    import pyarrow.dataset
+
+    from spiral.tables import Tables
+    from spiral.tables.table import Table
+
+
+class Snapshot:
+    """Spiral table snapshot.
+
+    A snapshot represents a point-in-time view of a table.
+    """
+
+    def __init__(self, tables: "Tables", snapshot: TableSnapshot):
+        self._tables = tables
+        self._snapshot = snapshot
+
+    @property
+    def asof(self) -> Timestamp:
+        """Returns the asof timestamp of the snapshot."""
+        return self._snapshot.asof
+
+    @property
+    def client(self) -> "Tables":
+        """Returns the client used by the snapshot."""
+        return self._tables
+
+    @property
+    def table(self) -> "Table":
+        """Returns the table associated with the snapshot."""
+        from spiral.tables.table import Table
+
+        return Table(self._tables, self._snapshot.table)
+
+    def to_dataset(self) -> "pyarrow.dataset.Dataset":
+        """Returns a PyArrow Dataset representing the table."""
+        from .dataset import TableDataset
+
+        return TableDataset(self)
+
+    def to_polars(self) -> "pl.LazyFrame":
+        """Returns a Polars LazyFrame for the Spiral table."""
+        import polars as pl
+
+        return pl.scan_pyarrow_dataset(self.to_dataset())
+
+    def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
+        """Returns a DuckDB relation for the Spiral table."""
+        import duckdb
+
+        return duckdb.from_arrow(self.to_dataset())
+
+    def scan(
+        self,
+        *projections: ExprLike,
+        where: ExprLike | None = None,
+        exclude_keys: bool = False,
+    ) -> Scan:
+        """Reads the snapshot. If projections are not provided, the entire table is read."""
+        if not projections:
+            # Use table as the default projection.
+            projections = [self._snapshot.table.__expr__]
+
+        return self._tables.scan(
+            *projections,
+            where=where,
+            asof=self._snapshot.asof,
+            exclude_keys=exclude_keys,
+        )
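Because `Snapshot.scan` pins `asof` to the snapshot's timestamp, reads through a snapshot stay stable while the table keeps receiving writes. A sketch, assuming `table` is a `spiral.tables.table.Table`:

# Hypothetical: point-in-time reads through one snapshot.
snap = table.snapshot()
lf = snap.to_polars()         # Polars LazyFrame over the PyArrow dataset
rel = snap.to_duckdb()        # DuckDB relation over the same dataset
tbl = snap.scan().to_table()  # full scan fixed at snap.asof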
spiral/tables/table.py
ADDED
@@ -0,0 +1,157 @@
+from datetime import datetime
+from typing import TYPE_CHECKING
+
+from spiral.core.table import Table as CoreTable
+from spiral.core.table.spec import Schema
+from spiral.expressions.base import Expr, ExprLike
+from spiral.settings import settings
+from spiral.tables.maintenance import Maintenance
+from spiral.tables.scan import Scan
+from spiral.tables.snapshot import Snapshot
+from spiral.tables.transaction import Transaction
+
+if TYPE_CHECKING:
+    from spiral.tables import Tables
+
+
+class Table(Expr):
+    """API for interacting with a SpiralDB Table.
+
+    Different catalog implementations should ultimately construct a Table object.
+    """
+
+    # TODO(marko): Make identifier required.
+    def __init__(self, tables: "Tables", table: CoreTable, *, identifier: str | None = None):
+        super().__init__(table.__expr__)
+
+        self._tables = tables
+        self._table = table
+        self._identifier = identifier
+        self._key_schema = self._table.key_schema
+        self._key_columns = set(self._key_schema.names)
+
+    @property
+    def client(self) -> "Tables":
+        """Returns the client used by the table."""
+        return self._tables
+
+    @property
+    def table_id(self) -> str:
+        return self._table.id
+
+    @property
+    def identifier(self) -> str:
+        """Returns the fully qualified identifier of the table."""
+        return self._identifier or self._table.id
+
+    @property
+    def dataset(self) -> str | None:
+        """Returns the dataset of the table."""
+        if self._identifier is None:
+            return None
+        _, dataset, _ = self._identifier.split(".")
+        return dataset
+
+    @property
+    def name(self) -> str | None:
+        """Returns the name of the table."""
+        if self._identifier is None:
+            return None
+        _, _, name = self._identifier.split(".")
+        return name
+
+    @property
+    def last_modified_at(self) -> int:
+        return self._table.get_wal(asof=None).last_modified_at
+
+    def __str__(self):
+        return self.identifier
+
+    def __repr__(self):
+        return f'Table("{self.identifier}")'
+
+    def __getitem__(self, item: str) -> Expr:
+        from spiral import expressions as se
+
+        if item in self._key_columns:
+            return se.key(name=item)
+
+        return super().__getitem__(item)
+
+    def select(self, *paths: str, exclude: list[str] | None = None) -> "Expr":
+        # Override an expression select in the root column group to split between keys and columns.
+        if exclude is not None:
+            if set(exclude) & self._key_columns:
+                raise ValueError(
+                    "Cannot use 'exclude' arg with key columns. Use 'exclude_keys' and an explicit select of keys."
+                )
+
+        key_paths = set(paths) & self._key_columns
+        other_paths = set(paths) - key_paths
+        if not key_paths:
+            return super().select(*paths, exclude=exclude)
+
+        from spiral import expressions as se
+
+        return se.merge(se.pack({key: se.key(key) for key in key_paths}), super().select(*other_paths, exclude=exclude))
+
+    @property
+    def key_schema(self) -> Schema:
+        """Returns the key schema of the table."""
+        return self._key_schema
+
+    @property
+    def schema(self) -> Schema:
+        """Returns the FULL schema of the table.
+
+        NOTE: This can be expensive for large tables.
+        """
+        return self._table.get_schema(asof=None)
+
+    def scan(
+        self,
+        *projections: ExprLike,
+        where: ExprLike | None = None,
+        asof: datetime | int | None = None,
+        exclude_keys: bool = False,
+    ) -> Scan:
+        """Reads the table. If projections are not provided, the entire table is read."""
+        if not projections:
+            projections = [self]
+
+        return self._tables.scan(*projections, where=where, asof=asof, exclude_keys=exclude_keys)
+
+    def write(
+        self,
+        expr: ExprLike,
+        *,
+        partition_size_bytes: int | None = None,
+    ) -> None:
+        """Write to the table inside a single transaction.
+
+        :param expr: The expression to write. Must evaluate to a struct array.
+        :param partition_size_bytes: The maximum partition size in bytes.
+        """
+        with self.txn() as txn:
+            txn.write(
+                expr,
+                partition_size_bytes=partition_size_bytes,
+            )
+
+    def snapshot(self, asof: datetime | int | None = None) -> Snapshot:
+        """Returns a snapshot of the table at the given timestamp."""
+        if isinstance(asof, datetime):
+            asof = int(asof.timestamp() * 1_000_000)
+        return Snapshot(self._tables, self._table.get_snapshot(asof=asof))
+
+    def txn(self) -> Transaction:
+        """Begins a new transaction. The transaction must be committed for writes to become visible.
+
+        IMPORTANT: While a transaction can be used to atomically write data to the table,
+        the primary key columns must be unique within the transaction.
+        """
+        return Transaction(self._tables._spiral.open_transaction(self._table, settings().file_format))
+
+    def maintenance(self) -> Maintenance:
+        """Access maintenance operations for a table."""
+        return Maintenance(self._tables._spiral.open_maintenance(self._table, settings().file_format))