pyspiral 0.6.6__cp312-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyspiral-0.6.6.dist-info/METADATA +51 -0
- pyspiral-0.6.6.dist-info/RECORD +102 -0
- pyspiral-0.6.6.dist-info/WHEEL +4 -0
- pyspiral-0.6.6.dist-info/entry_points.txt +2 -0
- spiral/__init__.py +35 -0
- spiral/_lib.abi3.so +0 -0
- spiral/adbc.py +411 -0
- spiral/api/__init__.py +78 -0
- spiral/api/admin.py +15 -0
- spiral/api/client.py +164 -0
- spiral/api/filesystems.py +134 -0
- spiral/api/key_space_indexes.py +23 -0
- spiral/api/organizations.py +77 -0
- spiral/api/projects.py +219 -0
- spiral/api/telemetry.py +19 -0
- spiral/api/text_indexes.py +56 -0
- spiral/api/types.py +22 -0
- spiral/api/workers.py +40 -0
- spiral/api/workloads.py +52 -0
- spiral/arrow_.py +216 -0
- spiral/cli/__init__.py +88 -0
- spiral/cli/__main__.py +4 -0
- spiral/cli/admin.py +14 -0
- spiral/cli/app.py +104 -0
- spiral/cli/console.py +95 -0
- spiral/cli/fs.py +76 -0
- spiral/cli/iceberg.py +97 -0
- spiral/cli/key_spaces.py +89 -0
- spiral/cli/login.py +24 -0
- spiral/cli/orgs.py +89 -0
- spiral/cli/printer.py +53 -0
- spiral/cli/projects.py +147 -0
- spiral/cli/state.py +5 -0
- spiral/cli/tables.py +174 -0
- spiral/cli/telemetry.py +17 -0
- spiral/cli/text.py +115 -0
- spiral/cli/types.py +50 -0
- spiral/cli/workloads.py +58 -0
- spiral/client.py +178 -0
- spiral/core/__init__.pyi +0 -0
- spiral/core/_tools/__init__.pyi +5 -0
- spiral/core/authn/__init__.pyi +27 -0
- spiral/core/client/__init__.pyi +237 -0
- spiral/core/table/__init__.pyi +101 -0
- spiral/core/table/manifests/__init__.pyi +35 -0
- spiral/core/table/metastore/__init__.pyi +58 -0
- spiral/core/table/spec/__init__.pyi +213 -0
- spiral/dataloader.py +285 -0
- spiral/dataset.py +255 -0
- spiral/datetime_.py +27 -0
- spiral/debug/__init__.py +0 -0
- spiral/debug/manifests.py +87 -0
- spiral/debug/metrics.py +56 -0
- spiral/debug/scan.py +266 -0
- spiral/expressions/__init__.py +276 -0
- spiral/expressions/base.py +157 -0
- spiral/expressions/http.py +86 -0
- spiral/expressions/io.py +100 -0
- spiral/expressions/list_.py +68 -0
- spiral/expressions/mp4.py +62 -0
- spiral/expressions/png.py +18 -0
- spiral/expressions/qoi.py +18 -0
- spiral/expressions/refs.py +58 -0
- spiral/expressions/str_.py +39 -0
- spiral/expressions/struct.py +59 -0
- spiral/expressions/text.py +62 -0
- spiral/expressions/tiff.py +223 -0
- spiral/expressions/udf.py +46 -0
- spiral/grpc_.py +32 -0
- spiral/iceberg.py +31 -0
- spiral/iterable_dataset.py +106 -0
- spiral/key_space_index.py +44 -0
- spiral/project.py +199 -0
- spiral/protogen/_/__init__.py +0 -0
- spiral/protogen/_/arrow/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +2548 -0
- spiral/protogen/_/google/__init__.py +0 -0
- spiral/protogen/_/google/protobuf/__init__.py +2310 -0
- spiral/protogen/_/message_pool.py +3 -0
- spiral/protogen/_/py.typed +0 -0
- spiral/protogen/_/scandal/__init__.py +190 -0
- spiral/protogen/_/spfs/__init__.py +72 -0
- spiral/protogen/_/spql/__init__.py +61 -0
- spiral/protogen/_/substrait/__init__.py +6196 -0
- spiral/protogen/_/substrait/extensions/__init__.py +169 -0
- spiral/protogen/__init__.py +0 -0
- spiral/protogen/util.py +41 -0
- spiral/py.typed +0 -0
- spiral/scan.py +285 -0
- spiral/server.py +17 -0
- spiral/settings.py +114 -0
- spiral/snapshot.py +56 -0
- spiral/streaming_/__init__.py +3 -0
- spiral/streaming_/reader.py +133 -0
- spiral/streaming_/stream.py +157 -0
- spiral/substrait_.py +274 -0
- spiral/table.py +293 -0
- spiral/text_index.py +17 -0
- spiral/transaction.py +58 -0
- spiral/types_.py +6 -0
spiral/dataset.py
ADDED
@@ -0,0 +1,255 @@
|
|
1
|
+
from typing import Any
|
2
|
+
|
3
|
+
import pyarrow as pa
|
4
|
+
import pyarrow.compute as pc
|
5
|
+
import pyarrow.dataset as ds
|
6
|
+
|
7
|
+
from spiral.scan import Scan
|
8
|
+
from spiral.snapshot import Snapshot
|
9
|
+
|
10
|
+
|
11
|
+
class Dataset(ds.Dataset):
    """A PyArrow ``Dataset`` view over a Spiral table :class:`Snapshot`.

    Implements just enough of the ``pyarrow.dataset.Dataset`` interface for
    query engines such as DuckDB and Polars to scan a Spiral table. The
    row-producing entry points (``count_rows``, ``head``, ``take``,
    ``to_batches``, ``to_table``) all delegate to :meth:`scanner`; the
    remaining API surface raises ``NotImplementedError``.
    """

    def __init__(self, snapshot: Snapshot):
        # The snapshot pins the table state (schema + asof) that this dataset reads.
        self._snapshot = snapshot
        self._table = snapshot.table
        self._schema: pa.Schema = self._snapshot.schema().to_arrow()

        # We don't actually initialize a Dataset, we just implement enough of the API
        # to fool both DuckDB and Polars.
        # super().__init__()
        self._last_scan = None  # most recent Spiral scan constructed by scanner()

    @property
    def schema(self) -> pa.Schema:
        """The Arrow schema of the wrapped snapshot."""
        return self._schema

    def count_rows(
        self,
        filter: pc.Expression | None = None,
        batch_size: int | None = None,
        batch_readahead: int | None = None,
        fragment_readahead: int | None = None,
        fragment_scan_options: ds.FragmentScanOptions | None = None,
        use_threads: bool = True,
        memory_pool: pa.MemoryPool = None,
    ):
        """Count rows matching ``filter`` by delegating to :meth:`scanner`."""
        return self.scanner(
            None,
            filter,
            batch_size,
            batch_readahead,
            fragment_readahead,
            fragment_scan_options,
            use_threads,
            memory_pool,
        ).count_rows()

    def filter(self, expression: pc.Expression) -> "Dataset":
        """Not supported; push filters down via ``scanner(filter=...)`` instead."""
        raise NotImplementedError("filter not implemented")

    def get_fragments(self, filter: pc.Expression | None = None):
        """TODO(ngates): perhaps we should return ranges as per our split API?"""
        raise NotImplementedError("get_fragments not implemented")

    def head(
        self,
        num_rows: int,
        columns: list[str] | None = None,
        filter: pc.Expression | None = None,
        batch_size: int | None = None,
        batch_readahead: int | None = None,
        fragment_readahead: int | None = None,
        fragment_scan_options: ds.FragmentScanOptions | None = None,
        use_threads: bool = True,
        memory_pool: pa.MemoryPool = None,
    ):
        """Return the first ``num_rows`` rows; delegates to :meth:`scanner`."""
        return self.scanner(
            columns,
            filter,
            batch_size,
            batch_readahead,
            fragment_readahead,
            fragment_scan_options,
            use_threads,
            memory_pool,
        ).head(num_rows)

    def join(
        self,
        right_dataset,
        keys,
        right_keys=None,
        join_type=None,
        left_suffix=None,
        right_suffix=None,
        coalesce_keys=True,
        use_threads=True,
    ):
        """Not supported for Spiral-backed datasets."""
        raise NotImplementedError("join not implemented")

    def join_asof(self, right_dataset, on, by, tolerance, right_on=None, right_by=None):
        """Not supported for Spiral-backed datasets."""
        raise NotImplementedError("join_asof not implemented")

    def replace_schema(self, schema: pa.Schema) -> "Dataset":
        """Not supported; the schema is fixed by the snapshot."""
        raise NotImplementedError("replace_schema not implemented")

    def scanner(
        self,
        columns: list[str] | None = None,
        filter: pc.Expression | None = None,
        batch_size: int | None = None,
        batch_readahead: int | None = None,
        fragment_readahead: int | None = None,
        fragment_scan_options: ds.FragmentScanOptions | None = None,
        use_threads: bool = True,
        memory_pool: pa.MemoryPool = None,
    ) -> "TableScanner":
        """Build a :class:`TableScanner` over this snapshot.

        ``columns`` selects a projection (all columns when falsy); ``filter``
        is a PyArrow compute expression translated to a Spiral expression via
        Substrait. The remaining parameters exist for PyArrow API
        compatibility and are not forwarded to the Spiral scan.
        """
        from spiral.substrait_ import SubstraitConverter

        # Extract the substrait expression so we can convert it to a Spiral expression
        if filter is not None:
            filter = SubstraitConverter(self._table, self._schema, self._table.key_schema.to_arrow()).convert(
                filter.to_substrait(self._schema, allow_arrow_extensions=True),
            )

        # Project only the requested columns when given; otherwise scan the whole table.
        scan = (
            self._table.spiral.scan(
                {c: self._table[c] for c in columns},
                where=filter,
                asof=self._snapshot.asof,
            )
            if columns
            else self._table.spiral.scan(
                self._table,
                where=filter,
                asof=self._snapshot.asof,
            )
        )
        self._last_scan = scan

        return TableScanner(scan)

    def sort_by(self, sorting, **kwargs):
        """Not supported for Spiral-backed datasets."""
        raise NotImplementedError("sort_by not implemented")

    def take(
        self,
        indices: pa.Array | Any,
        columns: list[str] | None = None,
        filter: pc.Expression | None = None,
        batch_size: int | None = None,
        batch_readahead: int | None = None,
        fragment_readahead: int | None = None,
        fragment_scan_options: ds.FragmentScanOptions | None = None,
        use_threads: bool = True,
        memory_pool: pa.MemoryPool = None,
    ):
        """Take rows by index; delegates to :meth:`scanner` (currently unimplemented there)."""
        return self.scanner(
            columns,
            filter,
            batch_size,
            batch_readahead,
            fragment_readahead,
            fragment_scan_options,
            use_threads,
            memory_pool,
        ).take(indices)

    def to_batches(
        self,
        columns: list[str] | None = None,
        filter: pc.Expression | None = None,
        batch_size: int | None = None,
        batch_readahead: int | None = None,
        fragment_readahead: int | None = None,
        fragment_scan_options: ds.FragmentScanOptions | None = None,
        use_threads: bool = True,
        memory_pool: pa.MemoryPool = None,
    ):
        """Stream the scan as record batches; delegates to :meth:`scanner`."""
        return self.scanner(
            columns,
            filter,
            batch_size,
            batch_readahead,
            fragment_readahead,
            fragment_scan_options,
            use_threads,
            memory_pool,
        ).to_batches()

    def to_table(
        self,
        columns=None,
        filter: pc.Expression | None = None,
        batch_size: int | None = None,
        batch_readahead: int | None = None,
        fragment_readahead: int | None = None,
        fragment_scan_options: ds.FragmentScanOptions | None = None,
        use_threads: bool = True,
        memory_pool: pa.MemoryPool = None,
    ):
        """Materialize the scan as a single Arrow table; delegates to :meth:`scanner`."""
        return self.scanner(
            columns,
            filter,
            batch_size,
            batch_readahead,
            fragment_readahead,
            fragment_scan_options,
            use_threads,
            memory_pool,
        ).to_table()
|
201
|
+
|
202
|
+
|
203
|
+
class TableScanner(ds.Scanner):
    """A PyArrow Dataset Scanner that reads from a Spiral Table."""

    def __init__(
        self,
        scan: Scan,
        key_table: pa.Table | pa.RecordBatchReader | None = None,
    ):
        self._scan = scan
        self._schema = scan.schema
        # NOTE(review): presumably restricts the scan to the given keys when set —
        # confirm against Scan.to_record_batches.
        self.key_table = key_table

        # We don't actually initialize a Dataset, we just implement enough of the API
        # to fool both DuckDB and Polars.
        # super().__init__()

    @property
    def schema(self):
        """Arrow schema of the underlying Spiral scan."""
        return self._schema

    def count_rows(self):
        """Count rows by streaming every batch and summing lengths."""
        # TODO(ngates): is there a faster way to count rows?
        return sum(len(batch) for batch in self.to_reader())

    def head(self, num_rows: int):
        """Return the first `num_rows` rows of the dataset."""
        reader = self.to_reader()
        batches = []
        row_count = 0
        for batch in reader:
            if row_count + len(batch) > num_rows:
                # Trim the final batch so exactly num_rows rows are returned.
                batches.append(batch.slice(0, num_rows - row_count))
                break
            row_count += len(batch)
            batches.append(batch)
        # schema= keeps this correct even when batches is empty.
        return pa.Table.from_batches(batches, schema=reader.schema)

    def scan_batches(self):
        """Not supported; use :meth:`to_batches` or :meth:`to_reader`."""
        raise NotImplementedError("scan_batches not implemented")

    def take(self, indices):
        # TODO(ngates): can we defer take until after we've constructed the scan?
        # Or should we delay constructing the Spiral Table.scan?
        raise NotImplementedError("take not implemented")

    def to_batches(self):
        """Stream the scan as record batches (alias for :meth:`to_reader`)."""
        return self.to_reader()

    def to_reader(self):
        """Open a ``RecordBatchReader`` over the Spiral scan."""
        return self._scan.to_record_batches(key_table=self.key_table)

    def to_table(self):
        """Materialize the whole scan into a single Arrow table."""
        return self.to_reader().read_all()
|
spiral/datetime_.py
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
import warnings
|
2
|
+
from datetime import UTC, datetime, timedelta, tzinfo
|
3
|
+
|
4
|
+
_THE_EPOCH = datetime.fromtimestamp(0, tz=UTC)
|
5
|
+
|
6
|
+
|
7
|
+
def local_tz() -> tzinfo:
|
8
|
+
"""Determine this machine's local timezone."""
|
9
|
+
tz = datetime.now().astimezone().tzinfo
|
10
|
+
if tz is None:
|
11
|
+
raise ValueError("Could not determine this machine's local timezone.")
|
12
|
+
return tz
|
13
|
+
|
14
|
+
|
15
|
+
def timestamp_micros(instant: datetime) -> int:
|
16
|
+
"""The number of microseconds between the epoch and the given instant."""
|
17
|
+
if instant.tzinfo is None:
|
18
|
+
warnings.warn("assuming timezone-naive datetime is local time", stacklevel=2)
|
19
|
+
instant = instant.replace(tzinfo=local_tz())
|
20
|
+
return (instant - _THE_EPOCH) // timedelta(microseconds=1)
|
21
|
+
|
22
|
+
|
23
|
+
def from_timestamp_micros(ts: int) -> datetime:
|
24
|
+
"""Convert a timestamp in microseconds to a datetime."""
|
25
|
+
if ts < 0:
|
26
|
+
raise ValueError("Timestamp must be non-negative")
|
27
|
+
return _THE_EPOCH + timedelta(microseconds=ts)
|
spiral/debug/__init__.py
ADDED
File without changes
|
@@ -0,0 +1,87 @@
|
|
1
|
+
from rich.console import Console
|
2
|
+
from rich.table import Table
|
3
|
+
|
4
|
+
from spiral import datetime_
|
5
|
+
from spiral.core.table import Scan
|
6
|
+
from spiral.core.table.manifests import FragmentManifest
|
7
|
+
from spiral.core.table.spec import ColumnGroup
|
8
|
+
from spiral.debug.metrics import _format_bytes
|
9
|
+
|
10
|
+
|
11
|
+
def display_scan_manifests(scan: Scan):
    """Display all manifests in a scan."""
    table_ids = scan.table_ids()
    if len(table_ids) != 1:
        raise NotImplementedError("Multiple table scans are not supported.")
    table_id = table_ids[0]

    # Gather the key-space manifest plus one manifest per column group.
    ks_manifest = scan.key_space_state(table_id).manifest
    cg_manifests = []
    for cg in scan.column_groups():
        cg_manifests.append((cg, scan.column_group_state(cg).manifest))

    display_manifests(ks_manifest, cg_manifests)
|
22
|
+
|
23
|
+
|
24
|
+
def display_manifests(
    key_space_manifest: FragmentManifest, column_group_manifests: list[tuple[ColumnGroup, FragmentManifest]]
):
    """Render the key-space manifest followed by each column-group manifest."""
    _table_of_fragments(key_space_manifest, title="Key Space manifest")

    for cg, cg_manifest in column_group_manifests:
        _table_of_fragments(cg_manifest, title=f"Column Group manifest for {str(cg)}")
|
37
|
+
|
38
|
+
|
39
|
+
def _table_of_fragments(manifest: FragmentManifest, title: str):
    """Display fragments in a formatted table.

    Prints ``title``, a one-line summary (count, total/avg size, metadata
    size), then a rich table with one row per fragment.
    """
    # Calculate summary statistics across the whole manifest.
    total_size = sum(fragment.size_bytes for fragment in manifest)
    total_metadata_size = sum(len(fragment.format_metadata or b"") for fragment in manifest)
    fragment_count = len(manifest)
    avg_size = total_size / fragment_count if fragment_count > 0 else 0

    # Print title and summary line.
    console = Console()
    console.print(f"\n\n{title}")
    console.print(
        f"{fragment_count} fragments, "
        f"total: {_format_bytes(total_size)}, "
        f"avg: {_format_bytes(int(avg_size))}, "
        f"metadata: {_format_bytes(total_metadata_size)}"
    )

    # Create rich table with one column per fragment attribute.
    table = Table(title=None, show_header=True, header_style="bold")
    table.add_column("ID", style="cyan", no_wrap=True)
    table.add_column("Size (Metadata)", justify="right")
    table.add_column("Format", justify="center")
    table.add_column("Key Span", justify="center")
    table.add_column("Level", justify="center")
    table.add_column("Committed At", justify="center")
    table.add_column("Compacted At", justify="center")

    # Add each fragment as a row.
    for fragment in manifest:
        # Timestamps are micros-since-epoch; falsy values render as "N/A".
        committed_str = str(datetime_.from_timestamp_micros(fragment.committed_at)) if fragment.committed_at else "N/A"
        compacted_str = str(datetime_.from_timestamp_micros(fragment.compacted_at)) if fragment.compacted_at else "N/A"

        size_with_metadata = (
            f"{_format_bytes(fragment.size_bytes)} ({_format_bytes(len(fragment.format_metadata or b''))})"
        )
        key_span = f"{fragment.key_span.begin}..{fragment.key_span.end}"

        table.add_row(
            fragment.id,
            size_with_metadata,
            str(fragment.format),
            key_span,
            str(fragment.level),
            committed_str,
            compacted_str,
        )

    console.print(table)
|
spiral/debug/metrics.py
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
from typing import Any
|
2
|
+
|
3
|
+
|
4
|
+
def display_metrics(metrics: dict[str, Any]) -> None:
    """Display metrics in a formatted table."""
    # Column titles with their fixed widths; joined with single spaces to
    # produce the same header the old hand-written f-string did.
    header_cells = [
        ("Metric", 40),
        ("Type", 10),
        ("Count", 8),
        ("Avg", 12),
        ("Min", 12),
        ("Max", 12),
        ("P95", 12),
        ("P99", 12),
        ("StdDev", 12),
    ]
    print(" ".join(f"{title:<{width}}" for title, width in header_cells))
    print("=" * 140)

    # One row per metric, sorted by name for stable output.
    for name, data in sorted(metrics.items()):
        kind = data["type"]
        cells = [f"{name:<40}", f"{kind:<10}", f"{data['count']:<8}"]
        for stat in ("avg", "min", "max", "p95", "p99", "stddev"):
            cells.append(f"{_format_value(data[stat], kind, name):<12}")
        print(" ".join(cells))
|
26
|
+
|
27
|
+
|
28
|
+
def _format_duration(nanoseconds: float) -> str:
|
29
|
+
"""Convert nanoseconds to human-readable duration."""
|
30
|
+
if nanoseconds >= 1_000_000_000:
|
31
|
+
return f"{nanoseconds / 1_000_000_000:.2f}s"
|
32
|
+
elif nanoseconds >= 1_000_000:
|
33
|
+
return f"{nanoseconds / 1_000_000:.2f}ms"
|
34
|
+
elif nanoseconds >= 1_000:
|
35
|
+
return f"{nanoseconds / 1_000:.2f}μs"
|
36
|
+
else:
|
37
|
+
return f"{nanoseconds:.0f}ns"
|
38
|
+
|
39
|
+
|
40
|
+
def _format_bytes(bytes_value: float) -> str:
|
41
|
+
"""Convert bytes to human-readable size."""
|
42
|
+
for unit in ["B", "KB", "MB", "GB"]:
|
43
|
+
if bytes_value < 1024:
|
44
|
+
return f"{bytes_value:.1f}{unit}"
|
45
|
+
bytes_value /= 1024
|
46
|
+
return f"{bytes_value:.1f}TB"
|
47
|
+
|
48
|
+
|
49
|
+
def _format_value(value: float, metric_type: str, metric_name: str) -> str:
    """Format a value based on metric type and name."""
    # Timers (and anything named like a duration) render as time; byte-ish
    # metrics render as sizes; everything else as a plain grouped number.
    if metric_type == "timer" or "duration" in metric_name:
        return _format_duration(value)
    if "bytes" in metric_name:
        return _format_bytes(value)
    return f"{value:,.0f}"
|
spiral/debug/scan.py
ADDED
@@ -0,0 +1,266 @@
|
|
1
|
+
from datetime import datetime
|
2
|
+
|
3
|
+
from spiral.core.table import Scan
|
4
|
+
from spiral.core.table.manifests import FragmentFile, FragmentManifest
|
5
|
+
from spiral.core.table.spec import Key
|
6
|
+
from spiral.types_ import Timestamp
|
7
|
+
|
8
|
+
|
9
|
+
def show_scan(scan: Scan):
    """Displays a scan in a way that is useful for debugging.

    Renders one fragment-distribution plot for the key-space manifest and one
    per column group, all sharing the same x-axis key points so the plots line
    up. Only single-table scans are supported.
    """
    table_ids = scan.table_ids()
    if len(table_ids) > 1:
        raise NotImplementedError("Multiple table scan is not supported.")
    table_id = table_ids[0]
    column_groups = scan.column_groups()

    splits = scan.splits()
    key_space_state = scan.key_space_state(table_id)

    # Collect all key bounds from all manifests. This makes sure all visualizations are aligned.
    key_points = set()
    key_space_manifest = key_space_state.manifest
    for i in range(len(key_space_manifest)):
        fragment_file = key_space_manifest[i]
        key_points.add(fragment_file.key_extent.min)
        key_points.add(fragment_file.key_extent.max)
    for cg in column_groups:
        cg_scan = scan.column_group_state(cg)
        cg_manifest = cg_scan.manifest
        for i in range(len(cg_manifest)):
            fragment_file = cg_manifest[i]
            key_points.add(fragment_file.key_extent.min)
            key_points.add(fragment_file.key_extent.max)

    # Make sure split points exist in all key points.
    for s in splits[:-1]:  # Don't take the last end.
        key_points.add(s.end)
    key_points = list(sorted(key_points))

    show_manifest(key_space_manifest, scope="Key space", key_points=key_points, splits=splits)
    for cg in scan.column_groups():
        cg_scan = scan.column_group_state(cg)
        # Skip table id from the start of the column group.
        show_manifest(cg_scan.manifest, scope=".".join(cg.path[1:]), key_points=key_points, splits=splits)
|
45
|
+
|
46
|
+
|
47
|
+
def show_manifest(
    manifest: FragmentManifest,
    scope: str | None = None,
    key_points: list[Key] | None = None,
    splits: list | None = None,
):
    """Plot a fragment manifest as rectangles over (key range, size rank).

    Each fragment becomes a rectangle whose x-extent is its key range (as
    indices into the shared ``key_points`` list) and whose height is the rank
    of its size among all fragment sizes. ``splits`` boundaries, when given,
    are marked on the x-axis. Requires matplotlib; the plot is interactive via
    :class:`FragmentManifestPlot` hover tooltips.
    """
    try:
        import matplotlib.patches as patches
        import matplotlib.pyplot as plt
    except ImportError:
        raise ImportError("matplotlib is required for debug")

    total_fragments = len(manifest)

    # Rank fragment sizes; rank (not absolute size) becomes the bar height.
    size_points = set()
    for i in range(total_fragments):
        manifest_file: FragmentFile = manifest[i]
        size_points.add(manifest_file.size_bytes)
    size_points = list(sorted(size_points))

    # When the caller did not supply shared key points, derive them from this
    # manifest (plus split boundaries) alone.
    if key_points is None:
        key_points = set()

        for i in range(total_fragments):
            manifest_file: FragmentFile = manifest[i]

            key_points.add(manifest_file.key_extent.min)
            key_points.add(manifest_file.key_extent.max)

        if splits is not None:
            for split in splits[:-1]:
                key_points.add(split.end)

        key_points = list(sorted(key_points))

    # Create figure and axis with specified size
    fig, ax = plt.subplots(figsize=(12, 8))

    # Plot each rectangle
    for i in range(total_fragments):
        manifest_file: FragmentFile = manifest[i]

        left = key_points.index(manifest_file.key_extent.min)
        right = key_points.index(manifest_file.key_extent.max)
        height = size_points.index(manifest_file.size_bytes) + 1

        color = _get_fragment_color(manifest_file, i, total_fragments)

        # Create rectangle patch
        rect = patches.Rectangle(
            (left, 0),  # (x, y)
            right - left,  # width
            height,  # height
            facecolor=color,  # fill color
            edgecolor="black",  # border color
            alpha=0.5,  # transparency
            linewidth=1,  # border width
            label=manifest_file.id,  # label for legend
        )

        ax.add_patch(rect)

    # Set axis limits with some padding
    ax.set_xlim(-0.5, len(key_points) - 1 + 0.5)
    ax.set_ylim(-0.5, len(size_points) + 0.5)

    # Create split markers on x-axis
    if splits is not None:
        split_positions = [key_points.index(split.end) for split in splits[:-1]]

        # Add split markers at the bottom
        for pos in split_positions:
            ax.annotate("▲", xy=(pos, 0), ha="center", va="top", color="red", annotation_clip=False)

    # Add grid
    ax.grid(True, linestyle="--", alpha=0.7, zorder=0)

    # Add labels and title
    ax.set_title("Fragment Distribution" if scope is None else f"{scope} Fragment Distribution")
    ax.set_xlabel("Key Index")
    ax.set_ylabel("Size Index")

    # Add legend
    ax.legend(bbox_to_anchor=(1, 1), loc="upper left", fontsize="small")

    # Adjust layout to prevent label cutoff
    plt.tight_layout()

    # Wire up hover tooltips; keep a reference so the callback stays alive.
    plot = FragmentManifestPlot(fig, ax, manifest)
    fig.canvas.mpl_connect("motion_notify_event", plot.hover)

    plt.show()
|
134
|
+
|
135
|
+
|
136
|
+
def _get_fragment_color(manifest_file: FragmentFile, color_index, total_colors):
    """Pick a plot color: gray ramp for compacted fragments, viridis otherwise."""
    import matplotlib.cm as cm

    fraction = color_index / total_colors
    if manifest_file.compacted_at is None:
        # Use viridis colormap for non-compacted fragments.
        return cm.viridis(fraction)

    # Use a shade of gray for compacted fragments; vary the shade with the
    # index so different compacted fragments remain distinguishable.
    shade = 0.3 + (0.5 * fraction)
    return (shade, shade, shade)
|
147
|
+
|
148
|
+
|
149
|
+
def _get_human_size(size_bytes: int) -> str:
|
150
|
+
# Convert bytes to a human-readable format
|
151
|
+
for unit in ["B", "KB", "MB", "GB", "TB"]:
|
152
|
+
if size_bytes < 1024:
|
153
|
+
return f"{size_bytes:.2f} {unit}"
|
154
|
+
size_bytes /= 1024
|
155
|
+
return f"{size_bytes:.2f} PB"
|
156
|
+
|
157
|
+
|
158
|
+
def _maybe_truncate(text, max_length: int = 30) -> str:
|
159
|
+
text = str(text)
|
160
|
+
if len(text) <= max_length:
|
161
|
+
return text
|
162
|
+
|
163
|
+
half_length = (max_length - 3) // 2
|
164
|
+
return text[:half_length] + "..." + text[-half_length:]
|
165
|
+
|
166
|
+
|
167
|
+
def _get_fragment_legend(manifest_file: FragmentFile):
    """Build the multi-line tooltip text shown when hovering a fragment."""
    # One "name: value" line per fragment attribute; long keys are elided.
    return "\n".join(
        [
            f"id: {manifest_file.id}",
            f"size: {_get_human_size(manifest_file.size_bytes)} ({manifest_file.size_bytes} bytes)",
            f"key_span: {manifest_file.key_span}",
            f"key_min: {_maybe_truncate(manifest_file.key_extent.min)}",
            f"key_max: {_maybe_truncate(manifest_file.key_extent.max)}",
            f"format: {manifest_file.format}",
            f"level: {manifest_file.level}",
            f"committed_at: {_format_timestamp(manifest_file.committed_at)}",
            f"compacted_at: {_format_timestamp(manifest_file.compacted_at)}",
            f"ks_id: {manifest_file.ks_id}",
        ]
    )
|
182
|
+
|
183
|
+
|
184
|
+
def _format_timestamp(ts: Timestamp | None) -> str:
    """Format a microseconds-since-epoch timestamp for display, or ``"None"``."""
    # Format timestamp or show None
    if ts is None:
        return "None"
    try:
        # NOTE(review): fromtimestamp() without tz renders in the machine's
        # local timezone — confirm that's intended for debug output.
        return datetime.fromtimestamp(ts / 1e6).strftime("%Y-%m-%d %H:%M:%S")
    except ValueError:
        # Out-of-range timestamps fall back to the raw integer.
        return str(ts)
|
192
|
+
|
193
|
+
|
194
|
+
class FragmentManifestPlot:
    """Hover interactivity for a fragment-distribution matplotlib figure.

    Shows a tooltip with the hovered fragment's details (whether the cursor is
    over its rectangle or its legend entry) and highlights the corresponding
    rectangle by raising its alpha. Relies on plot rectangles being added in
    the same order as ``manifest`` entries, so index ``i`` maps both ways.
    """

    def __init__(self, fig, ax, manifest: FragmentManifest):
        self.fig = fig
        self.ax = ax
        self.manifest = manifest

        # Position the annotation in the bottom right corner
        self.annotation = ax.annotate(
            "",
            xy=(0.98, 0.02),  # Position in axes coordinates
            xycoords="axes fraction",
            bbox=dict(boxstyle="round,pad=0.5", fc="white", ec="gray", alpha=0.8),
            ha="right",  # Right-align text
            va="bottom",  # Bottom-align text
            visible=False,
        )
        self.highlighted_rect = None  # currently-highlighted Rectangle, if any
        self.highlighted_legend = None

    def hover(self, event):
        """Mouse-motion callback: update tooltip/highlight for the hovered item."""
        if event.inaxes != self.ax:
            # Check if we're hovering over the legend
            legend = self.ax.get_legend()
            if legend and legend.contains(event)[0]:
                # Find which legend item we're hovering over
                for i, legend_text in enumerate(legend.get_texts()):
                    if legend_text.contains(event)[0]:
                        manifest_file = self.manifest[i]
                        self._show_legend(manifest_file, i, legend_text)
                        return
            self._hide_legend()
            return

        # Check rectangles in the main plot
        for i, rect in enumerate(self.ax.patches):
            if rect.contains(event)[0]:
                manifest_file = self.manifest[i]
                self._show_legend(manifest_file, i, rect)
                return

        self._hide_legend()

    def _show_legend(self, manifest_file, index, highlight_obj):
        """Show the tooltip for ``manifest_file`` and highlight its rectangle."""
        import matplotlib.patches as patches

        # Update tooltip text
        self.annotation.set_text(_get_fragment_legend(manifest_file))
        self.annotation.set_visible(True)

        # Handle highlighting
        if isinstance(highlight_obj, patches.Rectangle):
            # Highlighting rectangle in main plot
            if self.highlighted_rect and self.highlighted_rect != highlight_obj:
                self.highlighted_rect.set_alpha(0.5)
            highlight_obj.set_alpha(0.8)
            self.highlighted_rect = highlight_obj
        else:
            # Highlighting legend text
            if self.highlighted_rect:
                self.highlighted_rect.set_alpha(0.5)
            # Find and highlight corresponding rectangle
            rect = self.ax.patches[index]
            rect.set_alpha(0.8)
            self.highlighted_rect = rect

        self.fig.canvas.draw_idle()

    def _hide_legend(self):
        """Hide the tooltip and restore the highlighted rectangle's alpha."""
        if self.annotation.get_visible():
            self.annotation.set_visible(False)
            if self.highlighted_rect:
                self.highlighted_rect.set_alpha(0.5)
            self.fig.canvas.draw_idle()
|