pyspiral 0.2.5__cp310-abi3-macosx_11_0_arm64.whl → 0.4.0__cp310-abi3-macosx_11_0_arm64.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/METADATA +12 -14
- pyspiral-0.4.0.dist-info/RECORD +98 -0
- {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/WHEEL +1 -1
- spiral/__init__.py +6 -7
- spiral/_lib.abi3.so +0 -0
- spiral/adbc.py +21 -14
- spiral/api/__init__.py +15 -172
- spiral/api/admin.py +12 -26
- spiral/api/client.py +160 -0
- spiral/api/filesystems.py +100 -72
- spiral/api/organizations.py +45 -58
- spiral/api/projects.py +171 -134
- spiral/api/telemetry.py +19 -0
- spiral/api/types.py +20 -0
- spiral/api/workloads.py +32 -25
- spiral/{arrow.py → arrow_.py} +12 -0
- spiral/cli/__init__.py +2 -5
- spiral/cli/admin.py +7 -12
- spiral/cli/app.py +23 -6
- spiral/cli/console.py +1 -1
- spiral/cli/fs.py +83 -18
- spiral/cli/iceberg/__init__.py +7 -0
- spiral/cli/iceberg/namespaces.py +47 -0
- spiral/cli/iceberg/tables.py +60 -0
- spiral/cli/indexes/__init__.py +19 -0
- spiral/cli/login.py +14 -5
- spiral/cli/orgs.py +90 -0
- spiral/cli/printer.py +9 -1
- spiral/cli/projects.py +136 -0
- spiral/cli/state.py +2 -0
- spiral/cli/tables/__init__.py +121 -0
- spiral/cli/telemetry.py +18 -0
- spiral/cli/types.py +8 -10
- spiral/cli/{workload.py → workloads.py} +11 -11
- spiral/{catalog.py → client.py} +22 -21
- spiral/core/client/__init__.pyi +117 -0
- spiral/core/index/__init__.pyi +15 -0
- spiral/core/table/__init__.pyi +108 -0
- spiral/core/{manifests → table/manifests}/__init__.pyi +5 -23
- spiral/core/table/metastore/__init__.pyi +62 -0
- spiral/core/{spec → table/spec}/__init__.pyi +49 -92
- spiral/datetime_.py +27 -0
- spiral/expressions/__init__.py +40 -17
- spiral/expressions/base.py +5 -5
- spiral/expressions/list_.py +1 -1
- spiral/expressions/mp4.py +62 -0
- spiral/expressions/png.py +18 -0
- spiral/expressions/qoi.py +18 -0
- spiral/expressions/refs.py +23 -9
- spiral/expressions/struct.py +7 -5
- spiral/expressions/text.py +62 -0
- spiral/expressions/tiff.py +88 -88
- spiral/expressions/udf.py +3 -3
- spiral/iceberg/__init__.py +3 -0
- spiral/iceberg/client.py +33 -0
- spiral/indexes/__init__.py +5 -0
- spiral/indexes/client.py +137 -0
- spiral/indexes/index.py +34 -0
- spiral/indexes/scan.py +22 -0
- spiral/project.py +19 -110
- spiral/{proto → protogen}/_/scandal/__init__.py +32 -77
- spiral/protogen/_/spiral/table/__init__.py +22 -0
- spiral/protogen/substrait/__init__.py +3399 -0
- spiral/protogen/substrait/extensions/__init__.py +115 -0
- spiral/server.py +17 -0
- spiral/settings.py +31 -87
- spiral/substrait_.py +10 -6
- spiral/tables/__init__.py +12 -0
- spiral/tables/client.py +130 -0
- spiral/{dataset.py → tables/dataset.py} +36 -25
- spiral/tables/debug/manifests.py +70 -0
- spiral/tables/debug/metrics.py +56 -0
- spiral/{debug.py → tables/debug/scan.py} +6 -9
- spiral/tables/maintenance.py +12 -0
- spiral/tables/scan.py +193 -0
- spiral/tables/snapshot.py +78 -0
- spiral/tables/table.py +157 -0
- spiral/tables/transaction.py +52 -0
- pyspiral-0.2.5.dist-info/RECORD +0 -81
- spiral/api/tables.py +0 -94
- spiral/api/tokens.py +0 -56
- spiral/authn/authn.py +0 -89
- spiral/authn/device.py +0 -206
- spiral/authn/github_.py +0 -33
- spiral/authn/modal_.py +0 -18
- spiral/cli/org.py +0 -90
- spiral/cli/project.py +0 -107
- spiral/cli/table.py +0 -20
- spiral/cli/token.py +0 -27
- spiral/config.py +0 -26
- spiral/core/core/__init__.pyi +0 -53
- spiral/core/metastore/__init__.pyi +0 -91
- spiral/proto/_/spfs/__init__.py +0 -36
- spiral/proto/_/spiral/table/__init__.py +0 -225
- spiral/proto/_/spiraldb/metastore/__init__.py +0 -499
- spiral/proto/__init__.py +0 -0
- spiral/proto/scandal/__init__.py +0 -45
- spiral/proto/spiral/__init__.py +0 -0
- spiral/proto/spiral/table/__init__.py +0 -96
- spiral/scan_.py +0 -168
- spiral/table.py +0 -157
- {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/entry_points.txt +0 -0
- /spiral/{authn/__init__.py → core/__init__.pyi} +0 -0
- /spiral/{core → protogen/_}/__init__.py +0 -0
- /spiral/{proto/_ → protogen/_/arrow}/__init__.py +0 -0
- /spiral/{proto/_/arrow → protogen/_/arrow/flight}/__init__.py +0 -0
- /spiral/{proto/_/arrow/flight → protogen/_/arrow/flight/protocol}/__init__.py +0 -0
- /spiral/{proto → protogen}/_/arrow/flight/protocol/sql/__init__.py +0 -0
- /spiral/{proto/_/arrow/flight/protocol → protogen/_/spiral}/__init__.py +0 -0
- /spiral/{proto → protogen/_}/substrait/__init__.py +0 -0
- /spiral/{proto → protogen/_}/substrait/extensions/__init__.py +0 -0
- /spiral/{proto/_/spiral → protogen}/__init__.py +0 -0
- /spiral/{proto → protogen}/util.py +0 -0
- /spiral/{proto/_/spiraldb → tables/debug}/__init__.py +0 -0
spiral/tables/debug/manifests.py
ADDED
@@ -0,0 +1,70 @@
+from spiral import datetime_
+from spiral.core.table import TableScan
+from spiral.core.table.manifests import FragmentManifest
+from spiral.tables.debug.metrics import _format_bytes
+
+
+def display_manifests(scan: TableScan):
+    """Display all manifests in a scan."""
+    if len(scan.table_ids()) != 1:
+        raise NotImplementedError("Multiple table scans are not supported.")
+    table_id = scan.table_ids()[0]
+
+    key_space_manifest: FragmentManifest = scan.key_space_scan(table_id).manifest
+    _table_of_fragments(
+        key_space_manifest,
+        title="Key Space manifest",
+    )
+
+    for column_group in scan.column_groups():
+        column_group_manifest: FragmentManifest = scan.column_group_scan(column_group).manifest
+        _table_of_fragments(
+            column_group_manifest,
+            title=f"Column Group manifest for {str(column_group)}",
+        )
+
+
+def _table_of_fragments(manifest: FragmentManifest, title: str):
+    """Display fragments in a formatted table."""
+    # Calculate summary statistics
+    total_size = sum(fragment.size_bytes for fragment in manifest)
+    total_metadata_size = sum(len(fragment.format_metadata or b"") for fragment in manifest)
+    fragment_count = len(manifest)
+    avg_size = total_size / fragment_count if fragment_count > 0 else 0
+
+    # Print title and summary
+    print(f"\n\n{title}")
+    print(
+        f"{fragment_count} fragments, "
+        f"total: {_format_bytes(total_size)}, "
+        f"avg: {_format_bytes(int(avg_size))}, "
+        f"metadata: {_format_bytes(total_metadata_size)}"
+    )
+    print("=" * 120)
+
+    # Print header
+    print(
+        f"{'ID':<30} {'Size (Metadata)':<20} {'Format':<10} {'Key Span':<10} "
+        f"{'Level':<5} {'Committed At':<20} {'Compacted At':<20}"
+    )
+    print("=" * 120)
+
+    # Print each fragment
+    for fragment in manifest:
+        committed_str = str(datetime_.from_timestamp_micros(fragment.committed_at)) if fragment.committed_at else "N/A"
+        compacted_str = str(datetime_.from_timestamp_micros(fragment.compacted_at)) if fragment.compacted_at else "N/A"
+
+        size_with_metadata = (
+            f"{_format_bytes(fragment.size_bytes)} ({_format_bytes(len(fragment.format_metadata or b''))})"
+        )
+        key_span = f"{fragment.key_span.begin}..{fragment.key_span.end}"
+
+        print(
+            f"{fragment.id:<30} "
+            f"{size_with_metadata:<20} "
+            f"{str(fragment.format):<10} "
+            f"{key_span:<10} "
+            f"{str(fragment.level):<5} "
+            f"{committed_str:<20} "
+            f"{compacted_str:<20}"
+        )
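These helpers print a plain-text summary of every fragment manifest touched by a scan. A rough usage sketch, assuming the Scan wrapper added later in this diff; the `scan` variable is hypothetical:

from spiral.tables.debug.manifests import display_manifests

# `scan` is assumed to be a spiral.tables.scan.Scan obtained elsewhere;
# display_manifests() operates on the underlying core TableScan, which the
# wrapper stores as `_scan`. The same output is reachable through the
# private helper scan._dump_manifests() added further down in this diff.
display_manifests(scan._scan)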
spiral/tables/debug/metrics.py
ADDED
@@ -0,0 +1,56 @@
+from typing import Any
+
+
+def display_metrics(metrics: dict[str, Any]) -> None:
+    """Display metrics in a formatted table."""
+    print(
+        f"{'Metric':<40} {'Type':<10} {'Count':<8} {'Avg':<12} {'Min':<12} "
+        f"{'Max':<12} {'P95':<12} {'P99':<12} {'StdDev':<12}"
+    )
+    print("=" * 140)
+
+    for metric_name, data in sorted(metrics.items()):
+        metric_type = data["type"]
+        count = data["count"]
+        avg = _format_value(data["avg"], metric_type, metric_name)
+        min_val = _format_value(data["min"], metric_type, metric_name)
+        max_val = _format_value(data["max"], metric_type, metric_name)
+        p95 = _format_value(data["p95"], metric_type, metric_name)
+        p99 = _format_value(data["p99"], metric_type, metric_name)
+        stddev = _format_value(data["stddev"], metric_type, metric_name)
+
+        print(
+            f"{metric_name:<40} {metric_type:<10} {count:<8} {avg:<12} {min_val:<12} "
+            f"{max_val:<12} {p95:<12} {p99:<12} {stddev:<12}"
+        )
+
+
+def _format_duration(nanoseconds: float) -> str:
+    """Convert nanoseconds to human-readable duration."""
+    if nanoseconds >= 1_000_000_000:
+        return f"{nanoseconds / 1_000_000_000:.2f}s"
+    elif nanoseconds >= 1_000_000:
+        return f"{nanoseconds / 1_000_000:.2f}ms"
+    elif nanoseconds >= 1_000:
+        return f"{nanoseconds / 1_000:.2f}μs"
+    else:
+        return f"{nanoseconds:.0f}ns"
+
+
+def _format_bytes(bytes_value: float) -> str:
+    """Convert bytes to human-readable size."""
+    for unit in ["B", "KB", "MB", "GB"]:
+        if bytes_value < 1024:
+            return f"{bytes_value:.1f}{unit}"
+        bytes_value /= 1024
+    return f"{bytes_value:.1f}TB"
+
+
+def _format_value(value: float, metric_type: str, metric_name: str) -> str:
+    """Format a value based on metric type and name."""
+    if metric_type == "timer" or "duration" in metric_name:
+        return _format_duration(value)
+    elif "bytes" in metric_name:
+        return _format_bytes(value)
+    else:
+        return f"{value:,.0f}"
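To illustrate the helpers above, the sketch below feeds a made-up metrics dict through display_metrics. The metric names and numbers are invented for the example; only the expected dict shape and the module path come from the code added in this diff.

from spiral.tables.debug.metrics import display_metrics

# Timer/duration metrics render as "2.50ms"-style strings, metrics with
# "bytes" in the name as "1.0MB"-style strings, everything else as plain
# integers with thousands separators.
metrics = {
    "scan.read_duration": {
        "type": "timer", "count": 12,
        "avg": 2_500_000, "min": 900_000, "max": 7_100_000,
        "p95": 6_000_000, "p99": 7_000_000, "stddev": 1_200_000,
    },
    "scan.bytes_read": {
        "type": "counter", "count": 12,
        "avg": 1_048_576, "min": 65_536, "max": 4_194_304,
        "p95": 3_145_728, "p99": 4_000_000, "stddev": 900_000,
    },
}
display_metrics(metrics)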
spiral/{debug.py → tables/debug/scan.py}
@@ -1,8 +1,8 @@
 from datetime import datetime
 
-from spiral.core.
-from spiral.core.manifests import FragmentFile, FragmentManifest
-from spiral.core.spec import Key
+from spiral.core.table import TableScan
+from spiral.core.table.manifests import FragmentFile, FragmentManifest
+from spiral.core.table.spec import Key
 from spiral.types_ import Timestamp
 
 
@@ -30,7 +30,7 @@ def show_scan(scan: TableScan):
        for i in range(len(cg_manifest)):
            fragment_file = cg_manifest[i]
            key_points.add(fragment_file.key_extent.min)
-
+            key_points.add(fragment_file.key_extent.max)
 
        # Make sure split points exist in all key points.
        for s in splits[:-1]:  # Don't take the last end.
@@ -44,9 +44,7 @@ def show_scan(scan: TableScan):
        show_manifest(cg_scan.manifest, scope=".".join(cg.path[1:]), key_points=key_points, splits=splits)
 
 
-def show_manifest(
-    manifest: FragmentManifest, scope: str = None, key_points: list[Key] = None, splits: list[KeyRange] = None
-):
+def show_manifest(manifest: FragmentManifest, scope: str = None, key_points: list[Key] = None, splits: list = None):
     try:
         import matplotlib.patches as patches
         import matplotlib.pyplot as plt
@@ -157,10 +155,9 @@ def _get_fragment_legend(manifest_file: FragmentFile):
            f"key_min: {manifest_file.key_extent.min}",
            f"key_max: {manifest_file.key_extent.max}",
            f"format: {manifest_file.format}",
-            f"level: {manifest_file.
+            f"level: {manifest_file.level}",
            f"committed_at: {_format_timestamp(manifest_file.committed_at)}",
            f"compacted_at: {_format_timestamp(manifest_file.compacted_at)}",
-            f"fs_id: {manifest_file.fs_id}",
            f"ks_id: {manifest_file.ks_id}",
        ]
    )
spiral/tables/maintenance.py
ADDED
@@ -0,0 +1,12 @@
+from spiral.core.table import TableMaintenance
+
+
+class Maintenance:
+    """Spiral table maintenance."""
+
+    def __init__(self, maintenance: TableMaintenance):
+        self._maintenance = maintenance
+
+    def flush_wal(self):
+        """Flush the write-ahead log."""
+        self._maintenance.flush_wal()
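The wrapper above is reached through Table.maintenance(), added later in this diff. A one-line sketch, with `table` assumed to be an already-opened spiral Table:

# Hypothetical: flush the table's write-ahead log via the new maintenance handle.
table.maintenance().flush_wal()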
spiral/tables/scan.py
ADDED
@@ -0,0 +1,193 @@
+from collections.abc import Iterator
+from typing import TYPE_CHECKING, Any
+
+import pyarrow as pa
+from datasets import DatasetInfo, Features
+
+from spiral.core.table import KeyRange, TableScan
+from spiral.core.table.spec import Schema
+from spiral.settings import CI, DEV
+
+if TYPE_CHECKING:
+    import dask.dataframe as dd
+    import pandas as pd
+    import polars as pl
+    from datasets import iterable_dataset
+
+
+class Scan:
+    """Scan object."""
+
+    def __init__(
+        self,
+        scan: TableScan,
+    ):
+        # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
+        #   when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
+        #   and will re-resolve the asof? Or should we just expose a scan that fixes the asof at construction time?
+        self._scan = scan
+
+    @property
+    def metrics(self) -> dict[str, Any]:
+        """Returns metrics about the scan."""
+        return self._scan.metrics()
+
+    @property
+    def schema(self) -> Schema:
+        """Returns the schema of the scan."""
+        return self._scan.schema()
+
+    def is_empty(self) -> bool:
+        """Check if the Spiral is empty for the given key range.
+
+        **IMPORTANT**: False negatives are possible, but false positives are not,
+        i.e. is_empty can return False and scan can return zero rows.
+        """
+        return self._scan.is_empty()
+
+    def to_record_batches(
+        self,
+        key_table: pa.Table | pa.RecordBatchReader | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+    ) -> pa.RecordBatchReader:
+        """Read as a stream of RecordBatches.
+
+        Args:
+            key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                If None, the scan will be executed without a key table.
+            batch_size: the maximum number of rows per returned batch.
+                IMPORTANT: This is currently only respected when the key_table is used. If key table is a
+                RecordBatchReader, the batch_size argument must be None, and the existing batching is respected.
+            batch_readahead: the number of batches to prefetch in the background.
+        """
+        if isinstance(key_table, pa.RecordBatchReader):
+            if batch_size is not None:
+                raise ValueError(
+                    "batch_size must be None when key_table is a RecordBatchReader, the existing batching is respected."
+                )
+        elif isinstance(key_table, pa.Table):
+            key_table = key_table.to_reader(max_chunksize=batch_size)
+
+        return self._scan.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
+
+    def to_table(
+        self,
+        key_table: pa.Table | pa.RecordBatchReader | None = None,
+    ) -> pa.Table:
+        """Read into a single PyArrow Table.
+
+        Args:
+            key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                If None, the scan will be executed without a key table.
+        """
+        # NOTE: Evaluates fully on Rust side which improves debuggability.
+        if DEV and not CI and key_table is None:
+            rb = self._scan.to_record_batch()
+            return pa.Table.from_batches([rb])
+
+        return self.to_record_batches(key_table=key_table).read_all()
+
+    def to_dask(self) -> "dd.DataFrame":
+        """Read into a Dask DataFrame.
+
+        Requires the `dask` package to be installed.
+        """
+        import dask.dataframe as dd
+        import pandas as pd
+
+        def _read_key_range(key_range: KeyRange) -> pd.DataFrame:
+            # TODO(ngates): we need a way to preserve the existing asofs? Should we copy CoreScan instead of Query?
+            raise NotImplementedError()
+
+        # Fetch a set of partition ranges
+        return dd.from_map(_read_key_range, self.split())
+
+    def to_pandas(self) -> "pd.DataFrame":
+        """Read into a Pandas DataFrame.
+
+        Requires the `pandas` package to be installed.
+        """
+        return self.to_table().to_pandas()
+
+    def to_polars(self) -> "pl.DataFrame":
+        """Read into a Polars DataFrame.
+
+        Requires the `polars` package to be installed.
+        """
+        import polars as pl
+
+        # TODO(marko): This should support lazy dataframe.
+        return pl.from_arrow(self.to_record_batches())
+
+    def to_pytorch(
+        self,
+        batch_readahead: int | None = None,
+        shuffle_batch_size: int | None = None,
+        shuffle_pool_num_rows: int | None = None,
+    ) -> "iterable_dataset.IterableDataset":
+        """Returns an iterable dataset that can be used to build a PyTorch DataLoader.
+
+        Args:
+            batch_readahead: Number of batches to prefetch in the background.
+            shuffle_batch_size: read granularity of number of rows for a shuffled scan. If left as
+                None along with shuffle_pool_num_rows=None, shuffling is disabled.
+            shuffle_pool_num_rows: Pool size for shuffling batches.
+        """
+        from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset
+
+        def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
+            if shuffle_batch_size is None and shuffle_pool_num_rows is None:
+                stream = self.to_record_batches(
+                    batch_readahead=batch_readahead,
+                )
+            else:
+                stream = self._scan.to_shuffled_record_batches(
+                    batch_readahead, shuffle_batch_size, shuffle_pool_num_rows
+                )
+
+            # This key is unused when training with IterableDataset.
+            # Default implementation returns shard id, e.g. parquet row group id.
+            for i, rb in enumerate(stream):
+                yield i, pa.Table.from_batches([rb], stream.schema)
+
+        def _hf_compatible_schema(schema: pa.Schema) -> pa.Schema:
+            """
+            Replace string-view columns in the schema with strings. We do use this converted schema
+            as Features in the returned Dataset.
+            Remove this method once we have https://github.com/huggingface/datasets/pull/7718
+            """
+            new_fields = [
+                pa.field(field.name, pa.string(), nullable=field.nullable, metadata=field.metadata)
+                if field.type == pa.string_view()
+                else field
+                for field in schema
+            ]
+            return pa.schema(new_fields)
+
+        # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
+        ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
+        info = DatasetInfo(features=Features.from_arrow_schema(_hf_compatible_schema(self.schema.to_arrow())))
+        return IterableDataset(ex_iterable=ex_iterable, info=info)
+
+    def _split(self) -> list[KeyRange]:
+        # Splits the scan into a set of key ranges.
+        return self._scan.split()
+
+    def _debug(self):
+        # Visualizes the scan, mainly for debugging purposes.
+        from spiral.tables.debug.scan import show_scan
+
+        show_scan(self._scan)
+
+    def _dump_manifests(self):
+        # Print manifests in a human-readable format.
+        from spiral.tables.debug.manifests import display_manifests
+
+        display_manifests(self._scan)
+
+    def _dump_metrics(self):
+        # Print metrics in a human-readable format.
+        from spiral.tables.debug.metrics import display_metrics
+
+        display_metrics(self.metrics)
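A minimal usage sketch of the new Scan wrapper added above. Only method names that appear in this diff are used; the `scan` variable, the key-column name "id", and the installed optional packages (pandas, polars, datasets) are assumptions:

import pyarrow as pa

def summarize(scan) -> None:
    # `scan` is assumed to come from e.g. Table.scan(...) elsewhere in the API.
    print(scan.schema)                   # schema of the scan
    print("empty?", scan.is_empty())     # may report False even when zero rows come back

    # Stream record batches, "taking" specific keys; "id" is a hypothetical key column.
    keys = pa.table({"id": pa.array([1, 2, 3], type=pa.int64())})
    reader = scan.to_record_batches(key_table=keys, batch_size=1_000)
    for batch in reader:
        print(batch.num_rows)

    # Or materialize in one go via the other readers.
    pdf = scan.to_pandas()      # requires pandas
    pldf = scan.to_polars()     # requires polars
    ds = scan.to_pytorch()      # HuggingFace IterableDataset, usable with a DataLoader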
spiral/tables/snapshot.py
ADDED
@@ -0,0 +1,78 @@
+from typing import TYPE_CHECKING
+
+from spiral.core.table import TableSnapshot
+from spiral.expressions import ExprLike
+from spiral.tables.scan import Scan
+from spiral.types_ import Timestamp
+
+if TYPE_CHECKING:
+    import duckdb
+    import polars as pl
+    import pyarrow.dataset
+
+    from spiral.tables import Tables
+    from spiral.tables.table import Table
+
+
+class Snapshot:
+    """Spiral table snapshot.
+
+    A snapshot represents a point-in-time view of a table.
+    """
+
+    def __init__(self, tables: "Tables", snapshot: TableSnapshot):
+        self._tables = tables
+        self._snapshot = snapshot
+
+    @property
+    def asof(self) -> Timestamp:
+        """Returns the asof timestamp of the snapshot."""
+        return self._snapshot.asof
+
+    @property
+    def client(self) -> "Tables":
+        """Returns the client used by the snapshot."""
+        return self._tables
+
+    @property
+    def table(self) -> "Table":
+        """Returns the table associated with the snapshot."""
+        from spiral.tables.table import Table
+
+        return Table(self._tables, self._snapshot.table)
+
+    def to_dataset(self) -> "pyarrow.dataset.Dataset":
+        """Returns a PyArrow Dataset representing the table."""
+        from .dataset import TableDataset
+
+        return TableDataset(self)
+
+    def to_polars(self) -> "pl.LazyFrame":
+        """Returns a Polars LazyFrame for the Spiral table."""
+        import polars as pl
+
+        return pl.scan_pyarrow_dataset(self.to_dataset())
+
+    def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
+        """Returns a DuckDB relation for the Spiral table."""
+        import duckdb
+
+        return duckdb.from_arrow(self.to_dataset())
+
+    def scan(
+        self,
+        *projections: ExprLike,
+        where: ExprLike | None = None,
+        exclude_keys: bool = False,
+    ) -> Scan:
+        """Reads the snapshot. If projections are not provided, the entire table is read."""
+        if not projections:
+            # Use table as the default projection.
+            projections = [self._snapshot.table.__expr__]
+
+        return self._tables.scan(
+            *projections,
+            where=where,
+            asof=self._snapshot.asof,
+            exclude_keys=exclude_keys,
+        )
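A short sketch of how the Snapshot API above might be used. The `table` handle is an assumption (a spiral Table obtained elsewhere); only Snapshot's own methods come from this diff, and polars/duckdb must be installed for the respective readers:

# Hypothetical usage of spiral.tables.snapshot.Snapshot.
snap = table.snapshot()          # point-in-time view at the resolved asof
print(snap.asof)

lf = snap.to_polars()            # Polars LazyFrame over a PyArrow dataset
rel = snap.to_duckdb()           # DuckDB relation over the same dataset

# A scan through the snapshot is pinned to the snapshot's asof;
# with no projections the entire table is read.
result = snap.scan().to_table()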
spiral/tables/table.py
ADDED
@@ -0,0 +1,157 @@
+from datetime import datetime
+from typing import TYPE_CHECKING
+
+from spiral.core.table import Table as CoreTable
+from spiral.core.table.spec import Schema
+from spiral.expressions.base import Expr, ExprLike
+from spiral.settings import settings
+from spiral.tables.maintenance import Maintenance
+from spiral.tables.scan import Scan
+from spiral.tables.snapshot import Snapshot
+from spiral.tables.transaction import Transaction
+
+if TYPE_CHECKING:
+    from spiral.tables import Tables
+
+
+class Table(Expr):
+    """API for interacting with a SpiralDB's Table.
+
+    Different catalog implementations should ultimately construct a Table object.
+    """
+
+    # TODO(marko): Make identifier required.
+    def __init__(self, tables: "Tables", table: CoreTable, *, identifier: str | None = None):
+        super().__init__(table.__expr__)
+
+        self._tables = tables
+        self._table = table
+        self._identifier = identifier
+        self._key_schema = self._table.key_schema
+        self._key_columns = set(self._key_schema.names)
+
+    @property
+    def client(self) -> "Tables":
+        """Returns the client used by the table."""
+        return self._tables
+
+    @property
+    def table_id(self) -> str:
+        return self._table.id
+
+    @property
+    def identifier(self) -> str:
+        """Returns the fully qualified identifier of the table."""
+        return self._identifier or self._table.id
+
+    @property
+    def dataset(self) -> str | None:
+        """Returns the dataset of the table."""
+        if self._identifier is None:
+            return None
+        _, dataset, _ = self._identifier.split(".")
+        return dataset
+
+    @property
+    def name(self) -> str | None:
+        """Returns the name of the table."""
+        if self._identifier is None:
+            return None
+        _, _, name = self._identifier.split(".")
+        return name
+
+    @property
+    def last_modified_at(self) -> int:
+        return self._table.get_wal(asof=None).last_modified_at
+
+    def __str__(self):
+        return self.identifier
+
+    def __repr__(self):
+        return f'Table("{self.identifier}")'
+
+    def __getitem__(self, item: str) -> Expr:
+        from spiral import expressions as se
+
+        if item in self._key_columns:
+            return se.key(name=item)
+
+        return super().__getitem__(item)
+
+    def select(self, *paths: str, exclude: list[str] = None) -> "Expr":
+        # Override an expression select in the root column group to split between keys and columns.
+        if exclude is not None:
+            if set(exclude) & self._key_columns:
+                raise ValueError(
+                    "Cannot use 'exclude' arg with key columns. Use 'exclude_keys' and an explicit select of keys."
+                )
+
+        key_paths = set(paths) & self._key_columns
+        other_paths = set(paths) - key_paths
+        if not key_paths:
+            return super().select(*paths, exclude=exclude)
+
+        from spiral import expressions as se
+
+        return se.merge(se.pack({key: se.key(key) for key in key_paths}), super().select(*other_paths, exclude=exclude))
+
+    @property
+    def key_schema(self) -> Schema:
+        """Returns the key schema of the table."""
+        return self._key_schema
+
+    @property
+    def schema(self) -> Schema:
+        """Returns the FULL schema of the table.
+
+        NOTE: This can be expensive for large tables.
+        """
+        return self._table.get_schema(asof=None)
+
+    def scan(
+        self,
+        *projections: ExprLike,
+        where: ExprLike | None = None,
+        asof: datetime | int | None = None,
+        exclude_keys: bool = False,
+    ) -> Scan:
+        """Reads the table. If projections are not provided, the entire table is read."""
+        if not projections:
+            projections = [self]
+
+        return self._tables.scan(*projections, where=where, asof=asof, exclude_keys=exclude_keys)
+
+    def write(
+        self,
+        expr: ExprLike,
+        *,
+        partition_size_bytes: int | None = None,
+    ) -> None:
+        """Write an item to the table inside a single transaction.
+
+        :param expr: The expression to write. Must evaluate to a struct array.
+        :param partition_size_bytes: The maximum partition size in bytes.
+        """
+        with self.txn() as txn:
+            txn.write(
+                expr,
+                partition_size_bytes=partition_size_bytes,
+            )
+
+    def snapshot(self, asof: datetime | int | None = None) -> Snapshot:
+        """Returns a snapshot of the table at the given timestamp."""
+        if isinstance(asof, datetime):
+            asof = int(asof.timestamp() * 1_000_000)
+        return Snapshot(self._tables, self._table.get_snapshot(asof=asof))
+
+    def txn(self) -> Transaction:
+        """Begins a new transaction. Transaction must be committed for writes to become visible.
+
+        IMPORTANT: While transaction can be used to atomically write data to the table,
+        it is important that the primary key columns are unique within the transaction.
+        """
+        return Transaction(self._tables._spiral.open_transaction(self._table, settings().file_format))
+
+    def maintenance(self) -> Maintenance:
+        """Access maintenance operations for a table."""
+        return Maintenance(self._tables._spiral.open_maintenance(self._table, settings().file_format))
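A hedged end-to-end sketch of the Table API above. The client setup is elided, the column names ("id", "value") are invented, and it assumes a PyArrow table can be lifted into a struct expression by se.lift (not confirmed by this diff):

import pyarrow as pa

def append_and_read(table) -> pa.Table:
    # Single-transaction write; the written expression must evaluate to a
    # struct array, here assumed to be produced from a PyArrow table with a
    # unique primary key column.
    table.write(pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]}))

    # Key columns resolve through __getitem__ to key expressions.
    keys_only = table.scan(table["id"]).to_table()

    # Read everything as of "now" through a snapshot.
    return table.snapshot().scan().to_table()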
spiral/tables/transaction.py
ADDED
@@ -0,0 +1,52 @@
+from typing import TYPE_CHECKING
+
+from spiral.core.table import TableTransaction
+
+if TYPE_CHECKING:
+    from spiral.expressions.base import ExprLike
+
+
+class Transaction:
+    """Spiral table transaction.
+
+    IMPORTANT: While transaction can be used to atomically write data to the table,
+    it is important that the primary key columns are unique within the transaction.
+    """
+
+    def __init__(self, transaction: TableTransaction):
+        self._transaction = transaction
+
+    @property
+    def status(self) -> str:
+        """The status of the transaction."""
+        return self._transaction.status
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if exc_type is None:
+            self._transaction.commit()
+        else:
+            self._transaction.abort()
+
+    def write(self, expr: "ExprLike", *, partition_size_bytes: int | None = None):
+        """Write an item to the table inside a single transaction.
+
+        :param expr: The expression to write. Must evaluate to a struct array.
+        :param partition_size_bytes: The maximum partition size in bytes.
+            If not provided, the default partition size is used.
+        """
+        from spiral import expressions as se
+
+        expr = se.lift(expr)
+
+        self._transaction.write(expr.__expr__, partition_size_bytes=partition_size_bytes)
+
+    def commit(self):
+        """Commit the transaction."""
+        self._transaction.commit()
+
+    def abort(self):
+        """Abort the transaction."""
+        self._transaction.abort()
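The context-manager semantics above commit on a clean exit and abort when an exception escapes the block. A hypothetical sketch, with `table` and the written columns assumed for illustration:

import pyarrow as pa

with table.txn() as txn:
    # Assumes the Arrow table lifts to a struct expression via se.lift.
    txn.write(pa.table({"id": [4, 5], "value": ["d", "e"]}))
    # Leaving the block without an exception calls txn.commit();
    # raising inside the block calls txn.abort() instead.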