pyspiral-0.4.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. pyspiral-0.4.0.dist-info/METADATA +46 -0
  2. pyspiral-0.4.0.dist-info/RECORD +98 -0
  3. pyspiral-0.4.0.dist-info/WHEEL +4 -0
  4. pyspiral-0.4.0.dist-info/entry_points.txt +2 -0
  5. spiral/__init__.py +10 -0
  6. spiral/_lib.pypy310-pp73-darwin.so +0 -0
  7. spiral/adbc.py +393 -0
  8. spiral/api/__init__.py +64 -0
  9. spiral/api/admin.py +15 -0
  10. spiral/api/client.py +160 -0
  11. spiral/api/filesystems.py +153 -0
  12. spiral/api/organizations.py +77 -0
  13. spiral/api/projects.py +197 -0
  14. spiral/api/telemetry.py +19 -0
  15. spiral/api/types.py +20 -0
  16. spiral/api/workloads.py +52 -0
  17. spiral/arrow_.py +221 -0
  18. spiral/cli/__init__.py +79 -0
  19. spiral/cli/__main__.py +4 -0
  20. spiral/cli/admin.py +16 -0
  21. spiral/cli/app.py +65 -0
  22. spiral/cli/console.py +95 -0
  23. spiral/cli/fs.py +112 -0
  24. spiral/cli/iceberg/__init__.py +7 -0
  25. spiral/cli/iceberg/namespaces.py +47 -0
  26. spiral/cli/iceberg/tables.py +60 -0
  27. spiral/cli/indexes/__init__.py +19 -0
  28. spiral/cli/login.py +22 -0
  29. spiral/cli/orgs.py +90 -0
  30. spiral/cli/printer.py +53 -0
  31. spiral/cli/projects.py +136 -0
  32. spiral/cli/state.py +5 -0
  33. spiral/cli/tables/__init__.py +121 -0
  34. spiral/cli/telemetry.py +18 -0
  35. spiral/cli/types.py +51 -0
  36. spiral/cli/workloads.py +59 -0
  37. spiral/client.py +79 -0
  38. spiral/core/__init__.pyi +0 -0
  39. spiral/core/client/__init__.pyi +117 -0
  40. spiral/core/index/__init__.pyi +15 -0
  41. spiral/core/table/__init__.pyi +108 -0
  42. spiral/core/table/manifests/__init__.pyi +35 -0
  43. spiral/core/table/metastore/__init__.pyi +62 -0
  44. spiral/core/table/spec/__init__.pyi +214 -0
  45. spiral/datetime_.py +27 -0
  46. spiral/expressions/__init__.py +245 -0
  47. spiral/expressions/base.py +149 -0
  48. spiral/expressions/http.py +86 -0
  49. spiral/expressions/io.py +100 -0
  50. spiral/expressions/list_.py +68 -0
  51. spiral/expressions/mp4.py +62 -0
  52. spiral/expressions/png.py +18 -0
  53. spiral/expressions/qoi.py +18 -0
  54. spiral/expressions/refs.py +58 -0
  55. spiral/expressions/str_.py +39 -0
  56. spiral/expressions/struct.py +59 -0
  57. spiral/expressions/text.py +62 -0
  58. spiral/expressions/tiff.py +223 -0
  59. spiral/expressions/udf.py +46 -0
  60. spiral/grpc_.py +32 -0
  61. spiral/iceberg/__init__.py +3 -0
  62. spiral/iceberg/client.py +33 -0
  63. spiral/indexes/__init__.py +5 -0
  64. spiral/indexes/client.py +137 -0
  65. spiral/indexes/index.py +34 -0
  66. spiral/indexes/scan.py +22 -0
  67. spiral/project.py +46 -0
  68. spiral/protogen/_/__init__.py +0 -0
  69. spiral/protogen/_/arrow/__init__.py +0 -0
  70. spiral/protogen/_/arrow/flight/__init__.py +0 -0
  71. spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
  72. spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +1990 -0
  73. spiral/protogen/_/scandal/__init__.py +178 -0
  74. spiral/protogen/_/spiral/__init__.py +0 -0
  75. spiral/protogen/_/spiral/table/__init__.py +22 -0
  76. spiral/protogen/_/substrait/__init__.py +3399 -0
  77. spiral/protogen/_/substrait/extensions/__init__.py +115 -0
  78. spiral/protogen/__init__.py +0 -0
  79. spiral/protogen/substrait/__init__.py +3399 -0
  80. spiral/protogen/substrait/extensions/__init__.py +115 -0
  81. spiral/protogen/util.py +41 -0
  82. spiral/py.typed +0 -0
  83. spiral/server.py +17 -0
  84. spiral/settings.py +101 -0
  85. spiral/substrait_.py +279 -0
  86. spiral/tables/__init__.py +12 -0
  87. spiral/tables/client.py +130 -0
  88. spiral/tables/dataset.py +250 -0
  89. spiral/tables/debug/__init__.py +0 -0
  90. spiral/tables/debug/manifests.py +70 -0
  91. spiral/tables/debug/metrics.py +56 -0
  92. spiral/tables/debug/scan.py +248 -0
  93. spiral/tables/maintenance.py +12 -0
  94. spiral/tables/scan.py +193 -0
  95. spiral/tables/snapshot.py +78 -0
  96. spiral/tables/table.py +157 -0
  97. spiral/tables/transaction.py +52 -0
  98. spiral/types_.py +6 -0
spiral/tables/debug/scan.py ADDED
@@ -0,0 +1,248 @@
+ from datetime import datetime
+
+ from spiral.core.table import TableScan
+ from spiral.core.table.manifests import FragmentFile, FragmentManifest
+ from spiral.core.table.spec import Key
+ from spiral.types_ import Timestamp
+
+
+ def show_scan(scan: TableScan):
+     """Displays a scan in a way that is useful for debugging."""
+     table_ids = scan.table_ids()
+     if len(table_ids) > 1:
+         raise NotImplementedError("Multiple table scan is not supported.")
+     table_id = table_ids[0]
+     column_groups = scan.column_groups()
+
+     splits = scan.split()
+     key_space_scan = scan.key_space_scan(table_id)
+
+     # Collect all key bounds from all manifests. This makes sure all visualizations are aligned.
+     key_points = set()
+     key_space_manifest = key_space_scan.manifest
+     for i in range(len(key_space_manifest)):
+         fragment_file = key_space_manifest[i]
+         key_points.add(fragment_file.key_extent.min)
+         key_points.add(fragment_file.key_extent.max)
+     for cg in column_groups:
+         cg_scan = scan.column_group_scan(cg)
+         cg_manifest = cg_scan.manifest
+         for i in range(len(cg_manifest)):
+             fragment_file = cg_manifest[i]
+             key_points.add(fragment_file.key_extent.min)
+             key_points.add(fragment_file.key_extent.max)
+
+     # Make sure split points exist in all key points.
+     for s in splits[:-1]:  # Don't take the last end.
+         key_points.add(s.end)
+     key_points = list(sorted(key_points))
+
+     show_manifest(key_space_manifest, scope="Key space", key_points=key_points, splits=splits)
+     for cg in scan.column_groups():
+         cg_scan = scan.column_group_scan(cg)
+         # Skip table id from the start of the column group.
+         show_manifest(cg_scan.manifest, scope=".".join(cg.path[1:]), key_points=key_points, splits=splits)
+
+
+ def show_manifest(
+     manifest: FragmentManifest,
+     scope: str | None = None,
+     key_points: list[Key] | None = None,
+     splits: list | None = None,
+ ):
+     try:
+         import matplotlib.patches as patches
+         import matplotlib.pyplot as plt
+     except ImportError:
+         raise ImportError("matplotlib is required for debug")
+
+     total_fragments = len(manifest)
+
+     size_points = set()
+     for i in range(total_fragments):
+         manifest_file: FragmentFile = manifest[i]
+         size_points.add(manifest_file.size_bytes)
+     size_points = list(sorted(size_points))
+
+     if key_points is None:
+         key_points = set()
+
+         for i in range(total_fragments):
+             manifest_file: FragmentFile = manifest[i]
+
+             key_points.add(manifest_file.key_extent.min)
+             key_points.add(manifest_file.key_extent.max)
+
+         if splits is not None:
+             for split in splits[:-1]:
+                 key_points.add(split.end)
+
+         key_points = list(sorted(key_points))
+
+     # Create figure and axis with specified size
+     fig, ax = plt.subplots(figsize=(12, 8))
+
+     # Plot each rectangle
+     for i in range(total_fragments):
+         manifest_file: FragmentFile = manifest[i]
+
+         left = key_points.index(manifest_file.key_extent.min)
+         right = key_points.index(manifest_file.key_extent.max)
+         height = size_points.index(manifest_file.size_bytes) + 1
+
+         color = _get_fragment_color(manifest_file, i, total_fragments)
+
+         # Create rectangle patch
+         rect = patches.Rectangle(
+             (left, 0),  # (x, y)
+             right - left,  # width
+             height,  # height
+             facecolor=color,  # fill color
+             edgecolor="black",  # border color
+             alpha=0.5,  # transparency
+             linewidth=1,  # border width
+             label=manifest_file.id,  # label for legend
+         )
+
+         ax.add_patch(rect)
+
+     # Set axis limits with some padding
+     ax.set_xlim(-0.5, len(key_points) - 1 + 0.5)
+     ax.set_ylim(-0.5, len(size_points) + 0.5)
+
+     # Create split markers on x-axis
+     if splits is not None:
+         split_positions = [key_points.index(split.end) for split in splits[:-1]]
+
+         # Add split markers at the bottom
+         for pos in split_positions:
+             ax.annotate("▲", xy=(pos, 0), ha="center", va="top", color="red", annotation_clip=False)
+
+     # Add grid
+     ax.grid(True, linestyle="--", alpha=0.7, zorder=0)
+
+     # Add labels and title
+     ax.set_title("Fragment Distribution" if scope is None else f"{scope} Fragment Distribution")
+     ax.set_xlabel("Key Index")
+     ax.set_ylabel("Size Index")
+
+     # Add legend
+     ax.legend(bbox_to_anchor=(1, 1), loc="upper left", fontsize="small")
+
+     # Adjust layout to prevent label cutoff
+     plt.tight_layout()
+
+     plot = FragmentManifestPlot(fig, ax, manifest)
+     fig.canvas.mpl_connect("motion_notify_event", plot.hover)
+
+     plt.show()
+
+
+ def _get_fragment_color(manifest_file: FragmentFile, color_index, total_colors):
+     import matplotlib.cm as cm
+
+     if manifest_file.compacted_at is not None:
+         # Use a shade of gray for compacted fragments.
+         # Vary the shade based on the index to distinguish different compacted fragments.
+         gray_value = 0.3 + (0.5 * (color_index / total_colors))
+         return (gray_value, gray_value, gray_value)
+     else:
+         # Use viridis colormap for non-compacted fragments.
+         return cm.viridis(color_index / total_colors)
+
+
+ def _get_fragment_legend(manifest_file: FragmentFile):
+     return "\n".join(
+         [
+             f"id: {manifest_file.id}",
+             f"size: {manifest_file.size_bytes:,} bytes",
+             f"key_span: {manifest_file.key_span}",
+             f"key_min: {manifest_file.key_extent.min}",
+             f"key_max: {manifest_file.key_extent.max}",
+             f"format: {manifest_file.format}",
+             f"level: {manifest_file.level}",
+             f"committed_at: {_format_timestamp(manifest_file.committed_at)}",
+             f"compacted_at: {_format_timestamp(manifest_file.compacted_at)}",
+             f"ks_id: {manifest_file.ks_id}",
+         ]
+     )
+
+
+ def _format_timestamp(ts: Timestamp | None) -> str:
+     # Format timestamp or show None.
+     if ts is None:
+         return "None"
+     try:
+         return datetime.fromtimestamp(ts / 1e6).strftime("%Y-%m-%d %H:%M:%S")
+     except ValueError:
+         return str(ts)
+
+
+ class FragmentManifestPlot:
+     def __init__(self, fig, ax, manifest: FragmentManifest):
+         self.fig = fig
+         self.ax = ax
+         self.manifest = manifest
+
+         # Position the annotation in the bottom right corner.
+         self.annotation = ax.annotate(
+             "",
+             xy=(0.98, 0.02),  # Position in axes coordinates
+             xycoords="axes fraction",
+             bbox=dict(boxstyle="round,pad=0.5", fc="white", ec="gray", alpha=0.8),
+             ha="right",  # Right-align text
+             va="bottom",  # Bottom-align text
+             visible=False,
+         )
+         self.highlighted_rect = None
+         self.highlighted_legend = None
+
+     def hover(self, event):
+         if event.inaxes != self.ax:
+             # Check if we're hovering over the legend.
+             legend = self.ax.get_legend()
+             if legend and legend.contains(event)[0]:
+                 # Find which legend item we're hovering over.
+                 for i, legend_text in enumerate(legend.get_texts()):
+                     if legend_text.contains(event)[0]:
+                         manifest_file = self.manifest[i]
+                         self._show_legend(manifest_file, i, legend_text)
+                         return
+             self._hide_legend()
+             return
+
+         # Check rectangles in the main plot.
+         for i, rect in enumerate(self.ax.patches):
+             if rect.contains(event)[0]:
+                 manifest_file = self.manifest[i]
+                 self._show_legend(manifest_file, i, rect)
+                 return
+
+         self._hide_legend()
+
+     def _show_legend(self, manifest_file, index, highlight_obj):
+         import matplotlib.patches as patches
+
+         # Update tooltip text.
+         self.annotation.set_text(_get_fragment_legend(manifest_file))
+         self.annotation.set_visible(True)
+
+         # Handle highlighting.
+         if isinstance(highlight_obj, patches.Rectangle):
+             # Highlighting rectangle in main plot.
+             if self.highlighted_rect and self.highlighted_rect != highlight_obj:
+                 self.highlighted_rect.set_alpha(0.5)
+             highlight_obj.set_alpha(0.8)
+             self.highlighted_rect = highlight_obj
+         else:
+             # Highlighting legend text.
+             if self.highlighted_rect:
+                 self.highlighted_rect.set_alpha(0.5)
+             # Find and highlight corresponding rectangle.
+             rect = self.ax.patches[index]
+             rect.set_alpha(0.8)
+             self.highlighted_rect = rect
+
+         self.fig.canvas.draw_idle()
+
+     def _hide_legend(self):
+         if self.annotation.get_visible():
+             self.annotation.set_visible(False)
+             if self.highlighted_rect:
+                 self.highlighted_rect.set_alpha(0.5)
+             self.fig.canvas.draw_idle()
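
For orientation, show_manifest plots fragments on an ordinal grid rather than on raw keys or byte sizes: a fragment's x extent is the index range of its key bounds within the sorted key_points, and its bar height is the rank of its size within the sorted size_points. A minimal sketch of that mapping, using hypothetical (key_min, key_max, size_bytes) tuples in place of FragmentFile objects:

# Sketch of the ordinal mapping used by show_manifest; the fragment tuples are made up.
fragments = [("a", "f", 1024), ("c", "k", 4096), ("f", "p", 2048)]

key_points = sorted({p for kmin, kmax, _ in fragments for p in (kmin, kmax)})
size_points = sorted({size for _, _, size in fragments})

for kmin, kmax, size in fragments:
    left = key_points.index(kmin)          # x position of the rectangle
    width = key_points.index(kmax) - left  # x extent spans the fragment's key bounds
    height = size_points.index(size) + 1   # taller bars = larger fragments
    print(f"rect at x={left}, width={width}, height={height}")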
spiral/tables/maintenance.py ADDED
@@ -0,0 +1,12 @@
+ from spiral.core.table import TableMaintenance
+
+
+ class Maintenance:
+     """Spiral table maintenance."""
+
+     def __init__(self, maintenance: TableMaintenance):
+         self._maintenance = maintenance
+
+     def flush_wal(self):
+         """Flush the write-ahead log."""
+         self._maintenance.flush_wal()
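
A Maintenance handle is obtained from Table.maintenance() (defined in spiral/tables/table.py below). A minimal usage sketch, assuming `table` is an already-open Table:

# Hypothetical usage: flush the table's write-ahead log.
maintenance = table.maintenance()
maintenance.flush_wal()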
spiral/tables/scan.py ADDED
@@ -0,0 +1,193 @@
+ from collections.abc import Iterator
+ from typing import TYPE_CHECKING, Any
+
+ import pyarrow as pa
+ from datasets import DatasetInfo, Features
+
+ from spiral.core.table import KeyRange, TableScan
+ from spiral.core.table.spec import Schema
+ from spiral.settings import CI, DEV
+
+ if TYPE_CHECKING:
+     import dask.dataframe as dd
+     import pandas as pd
+     import polars as pl
+     from datasets import iterable_dataset
+
+
+ class Scan:
+     """Scan object."""
+
+     def __init__(
+         self,
+         scan: TableScan,
+     ):
+         # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
+         # when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
+         # and will re-resolve the asof? Or should we just expose a scan that fixes the asof at construction time?
+         self._scan = scan
+
+     @property
+     def metrics(self) -> dict[str, Any]:
+         """Returns metrics about the scan."""
+         return self._scan.metrics()
+
+     @property
+     def schema(self) -> Schema:
+         """Returns the schema of the scan."""
+         return self._scan.schema()
+
+     def is_empty(self) -> bool:
+         """Check if the Spiral is empty for the given key range.
+
+         **IMPORTANT**: False negatives are possible, but false positives are not,
+         i.e. is_empty can return False while the scan still returns zero rows.
+         """
+         return self._scan.is_empty()
+
+     def to_record_batches(
+         self,
+         key_table: pa.Table | pa.RecordBatchReader | None = None,
+         batch_size: int | None = None,
+         batch_readahead: int | None = None,
+     ) -> pa.RecordBatchReader:
+         """Read as a stream of RecordBatches.
+
+         Args:
+             key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                 If None, the scan is executed without a key table.
+             batch_size: the maximum number of rows per returned batch.
+                 IMPORTANT: This is currently only respected when a key_table is used. If the key table is a
+                 RecordBatchReader, the batch_size argument must be None, and the existing batching is respected.
+             batch_readahead: the number of batches to prefetch in the background.
+         """
+         if isinstance(key_table, pa.RecordBatchReader):
+             if batch_size is not None:
+                 raise ValueError(
+                     "batch_size must be None when key_table is a RecordBatchReader, the existing batching is respected."
+                 )
+         elif isinstance(key_table, pa.Table):
+             key_table = key_table.to_reader(max_chunksize=batch_size)
+
+         return self._scan.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
+
+     def to_table(
+         self,
+         key_table: pa.Table | pa.RecordBatchReader | None = None,
+     ) -> pa.Table:
+         """Read into a single PyArrow Table.
+
+         Args:
+             key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                 If None, the scan is executed without a key table.
+         """
+         # NOTE: Evaluates fully on the Rust side, which improves debuggability.
+         if DEV and not CI and key_table is None:
+             rb = self._scan.to_record_batch()
+             return pa.Table.from_batches([rb])
+
+         return self.to_record_batches(key_table=key_table).read_all()
+
+     def to_dask(self) -> "dd.DataFrame":
+         """Read into a Dask DataFrame.
+
+         Requires the `dask` package to be installed.
+         """
+         import dask.dataframe as dd
+         import pandas as pd
+
+         def _read_key_range(key_range: KeyRange) -> pd.DataFrame:
+             # TODO(ngates): we need a way to preserve the existing asofs? Should we copy CoreScan instead of Query?
+             raise NotImplementedError()
+
+         # Fetch a set of partition ranges
+         return dd.from_map(_read_key_range, self._split())
+
+     def to_pandas(self) -> "pd.DataFrame":
+         """Read into a Pandas DataFrame.
+
+         Requires the `pandas` package to be installed.
+         """
+         return self.to_table().to_pandas()
+
+     def to_polars(self) -> "pl.DataFrame":
+         """Read into a Polars DataFrame.
+
+         Requires the `polars` package to be installed.
+         """
+         import polars as pl
+
+         # TODO(marko): This should support lazy dataframes.
+         return pl.from_arrow(self.to_record_batches())
+
+     def to_pytorch(
+         self,
+         batch_readahead: int | None = None,
+         shuffle_batch_size: int | None = None,
+         shuffle_pool_num_rows: int | None = None,
+     ) -> "iterable_dataset.IterableDataset":
+         """Returns an iterable dataset that can be used to build a PyTorch DataLoader.
+
+         Args:
+             batch_readahead: Number of batches to prefetch in the background.
+             shuffle_batch_size: Read granularity, in rows, for a shuffled scan. If both this and
+                 shuffle_pool_num_rows are None, shuffling is disabled.
+             shuffle_pool_num_rows: Pool size, in rows, for shuffling batches.
+         """
+         from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset
+
+         def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
+             if shuffle_batch_size is None and shuffle_pool_num_rows is None:
+                 stream = self.to_record_batches(
+                     batch_readahead=batch_readahead,
+                 )
+             else:
+                 stream = self._scan.to_shuffled_record_batches(
+                     batch_readahead, shuffle_batch_size, shuffle_pool_num_rows
+                 )
+
+             # This key is unused when training with IterableDataset.
+             # The default implementation returns a shard id, e.g. a Parquet row group id.
+             for i, rb in enumerate(stream):
+                 yield i, pa.Table.from_batches([rb], stream.schema)
+
+         def _hf_compatible_schema(schema: pa.Schema) -> pa.Schema:
+             """Replace string-view columns in the schema with strings. We use this converted
+             schema as Features in the returned Dataset.
+
+             Remove this method once we have https://github.com/huggingface/datasets/pull/7718
+             """
+             new_fields = [
+                 pa.field(field.name, pa.string(), nullable=field.nullable, metadata=field.metadata)
+                 if field.type == pa.string_view()
+                 else field
+                 for field in schema
+             ]
+             return pa.schema(new_fields)
+
+         # NOTE: generate_tables_fn type annotations are wrong, the return type must be an iterable of tuples.
+         ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
+         info = DatasetInfo(features=Features.from_arrow_schema(_hf_compatible_schema(self.schema.to_arrow())))
+         return IterableDataset(ex_iterable=ex_iterable, info=info)
+
+     def _split(self) -> list[KeyRange]:
+         # Splits the scan into a set of key ranges.
+         return self._scan.split()
+
+     def _debug(self):
+         # Visualizes the scan, mainly for debugging purposes.
+         from spiral.tables.debug.scan import show_scan
+
+         show_scan(self._scan)
+
+     def _dump_manifests(self):
+         # Print manifests in a human-readable format.
+         from spiral.tables.debug.manifests import display_manifests
+
+         display_manifests(self._scan)
+
+     def _dump_metrics(self):
+         # Print metrics in a human-readable format.
+         from spiral.tables.debug.metrics import display_metrics
+
+         display_metrics(self.metrics)
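
Scan is a thin adapter from the core Rust-backed TableScan to the PyArrow ecosystem. A usage sketch, assuming `scan` came from Table.scan(...) and `keys` is a hypothetical PyArrow table of keys to take:

# Stream results in batches of up to 1,000 rows, taking only the given keys.
# Note batch_size is only honored when a key table is provided (see docstring above).
reader = scan.to_record_batches(key_table=keys, batch_size=1000)
for batch in reader:
    ...  # each item is a pyarrow.RecordBatch

# Or materialize eagerly / hand off to another framework.
df = scan.to_pandas()
ds = scan.to_pytorch(shuffle_batch_size=1024, shuffle_pool_num_rows=10_000)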
spiral/tables/snapshot.py ADDED
@@ -0,0 +1,78 @@
+ from typing import TYPE_CHECKING
+
+ from spiral.core.table import TableSnapshot
+ from spiral.expressions import ExprLike
+ from spiral.tables.scan import Scan
+ from spiral.types_ import Timestamp
+
+ if TYPE_CHECKING:
+     import duckdb
+     import polars as pl
+     import pyarrow.dataset
+
+     from spiral.tables import Tables
+     from spiral.tables.table import Table
+
+
+ class Snapshot:
+     """Spiral table snapshot.
+
+     A snapshot represents a point-in-time view of a table.
+     """
+
+     def __init__(self, tables: "Tables", snapshot: TableSnapshot):
+         self._tables = tables
+         self._snapshot = snapshot
+
+     @property
+     def asof(self) -> Timestamp:
+         """Returns the asof timestamp of the snapshot."""
+         return self._snapshot.asof
+
+     @property
+     def client(self) -> "Tables":
+         """Returns the client used by the snapshot."""
+         return self._tables
+
+     @property
+     def table(self) -> "Table":
+         """Returns the table associated with the snapshot."""
+         from spiral.tables.table import Table
+
+         return Table(self._tables, self._snapshot.table)
+
+     def to_dataset(self) -> "pyarrow.dataset.Dataset":
+         """Returns a PyArrow Dataset representing the table."""
+         from .dataset import TableDataset
+
+         return TableDataset(self)
+
+     def to_polars(self) -> "pl.LazyFrame":
+         """Returns a Polars LazyFrame for the Spiral table."""
+         import polars as pl
+
+         return pl.scan_pyarrow_dataset(self.to_dataset())
+
+     def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
+         """Returns a DuckDB relation for the Spiral table."""
+         import duckdb
+
+         return duckdb.from_arrow(self.to_dataset())
+
+     def scan(
+         self,
+         *projections: ExprLike,
+         where: ExprLike | None = None,
+         exclude_keys: bool = False,
+     ) -> Scan:
+         """Reads the snapshot. If projections are not provided, the entire table is read."""
+         if not projections:
+             # Use the table as the default projection.
+             projections = [self._snapshot.table.__expr__]
+
+         return self._tables.scan(
+             *projections,
+             where=where,
+             asof=self._snapshot.asof,
+             exclude_keys=exclude_keys,
+         )
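
Because to_dataset() returns a standard PyArrow Dataset, a snapshot can be handed to any engine that scans that interface; both wrappers below resolve against the snapshot's fixed asof. A sketch, assuming `snapshot` was returned by Table.snapshot():

lazy_frame = snapshot.to_polars()  # polars.LazyFrame over the point-in-time view
relation = snapshot.to_duckdb()    # duckdb.DuckDBPyRelation over the same view
print(snapshot.asof)               # microsecond timestamp the view is pinned to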
spiral/tables/table.py ADDED
@@ -0,0 +1,157 @@
+ from datetime import datetime
+ from typing import TYPE_CHECKING
+
+ from spiral.core.table import Table as CoreTable
+ from spiral.core.table.spec import Schema
+ from spiral.expressions.base import Expr, ExprLike
+ from spiral.settings import settings
+ from spiral.tables.maintenance import Maintenance
+ from spiral.tables.scan import Scan
+ from spiral.tables.snapshot import Snapshot
+ from spiral.tables.transaction import Transaction
+
+ if TYPE_CHECKING:
+     from spiral.tables import Tables
+
+
+ class Table(Expr):
+     """API for interacting with a SpiralDB Table.
+
+     Different catalog implementations should ultimately construct a Table object.
+     """
+
+     # TODO(marko): Make identifier required.
+     def __init__(self, tables: "Tables", table: CoreTable, *, identifier: str | None = None):
+         super().__init__(table.__expr__)
+
+         self._tables = tables
+         self._table = table
+         self._identifier = identifier
+         self._key_schema = self._table.key_schema
+         self._key_columns = set(self._key_schema.names)
+
+     @property
+     def client(self) -> "Tables":
+         """Returns the client used by the table."""
+         return self._tables
+
+     @property
+     def table_id(self) -> str:
+         return self._table.id
+
+     @property
+     def identifier(self) -> str:
+         """Returns the fully qualified identifier of the table."""
+         return self._identifier or self._table.id
+
+     @property
+     def dataset(self) -> str | None:
+         """Returns the dataset of the table."""
+         if self._identifier is None:
+             return None
+         _, dataset, _ = self._identifier.split(".")
+         return dataset
+
+     @property
+     def name(self) -> str | None:
+         """Returns the name of the table."""
+         if self._identifier is None:
+             return None
+         _, _, name = self._identifier.split(".")
+         return name
+
+     @property
+     def last_modified_at(self) -> int:
+         return self._table.get_wal(asof=None).last_modified_at
+
+     def __str__(self):
+         return self.identifier
+
+     def __repr__(self):
+         return f'Table("{self.identifier}")'
+
+     def __getitem__(self, item: str) -> Expr:
+         from spiral import expressions as se
+
+         if item in self._key_columns:
+             return se.key(name=item)
+
+         return super().__getitem__(item)
+
+     def select(self, *paths: str, exclude: list[str] | None = None) -> "Expr":
+         # Override the expression select in the root column group to split between keys and columns.
+         if exclude is not None:
+             if set(exclude) & self._key_columns:
+                 raise ValueError(
+                     "Cannot use 'exclude' arg with key columns. Use 'exclude_keys' and an explicit select of keys."
+                 )
+
+         key_paths = set(paths) & self._key_columns
+         other_paths = set(paths) - key_paths
+         if not key_paths:
+             return super().select(*paths, exclude=exclude)
+
+         from spiral import expressions as se
+
+         return se.merge(
+             se.pack({key: se.key(key) for key in key_paths}),
+             super().select(*other_paths, exclude=exclude),
+         )
+
+     @property
+     def key_schema(self) -> Schema:
+         """Returns the key schema of the table."""
+         return self._key_schema
+
+     @property
+     def schema(self) -> Schema:
+         """Returns the FULL schema of the table.
+
+         NOTE: This can be expensive for large tables.
+         """
+         return self._table.get_schema(asof=None)
+
+     def scan(
+         self,
+         *projections: ExprLike,
+         where: ExprLike | None = None,
+         asof: datetime | int | None = None,
+         exclude_keys: bool = False,
+     ) -> Scan:
+         """Reads the table. If projections are not provided, the entire table is read."""
+         if not projections:
+             projections = [self]
+
+         return self._tables.scan(*projections, where=where, asof=asof, exclude_keys=exclude_keys)
+
+     def write(
+         self,
+         expr: ExprLike,
+         *,
+         partition_size_bytes: int | None = None,
+     ) -> None:
+         """Write an item to the table inside a single transaction.
+
+         :param expr: The expression to write. Must evaluate to a struct array.
+         :param partition_size_bytes: The maximum partition size in bytes.
+         """
+         with self.txn() as txn:
+             txn.write(
+                 expr,
+                 partition_size_bytes=partition_size_bytes,
+             )
+
+     def snapshot(self, asof: datetime | int | None = None) -> Snapshot:
+         """Returns a snapshot of the table at the given timestamp."""
+         if isinstance(asof, datetime):
+             asof = int(asof.timestamp() * 1_000_000)
+         return Snapshot(self._tables, self._table.get_snapshot(asof=asof))
+
+     def txn(self) -> Transaction:
+         """Begins a new transaction. The transaction must be committed for writes to become visible.
+
+         IMPORTANT: While a transaction can be used to atomically write data to the table,
+         the primary key columns must be unique within the transaction.
+         """
+         return Transaction(self._tables._spiral.open_transaction(self._table, settings().file_format))
+
+     def maintenance(self) -> Maintenance:
+         """Access maintenance operations for a table."""
+         return Maintenance(self._tables._spiral.open_maintenance(self._table, settings().file_format))
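
To tie the pieces together, an end-to-end sketch of the Table API above. It assumes `table` is an open Table whose key column is `id` (hypothetical names and values), and that a PyArrow table is an acceptable ExprLike for write(), which requires the expression to evaluate to a struct array:

from datetime import datetime

import pyarrow as pa

# Single-transaction write; primary keys must be unique within it.
table.write(pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]}))

# Full scan, or a projection; key columns passed to select() are routed through se.key().
everything = table.scan().to_table()
projected = table.scan(table.select("id", "value")).to_table()

# Point-in-time view; datetime asofs are converted to microsecond timestamps.
snap = table.snapshot(asof=datetime.now())
print(snap.asof)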