pyspiral-0.4.4-cp310-abi3-macosx_11_0_arm64.whl → pyspiral-0.6.0-cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyspiral-0.4.4.dist-info → pyspiral-0.6.0.dist-info}/METADATA +10 -5
- pyspiral-0.6.0.dist-info/RECORD +99 -0
- {pyspiral-0.4.4.dist-info → pyspiral-0.6.0.dist-info}/WHEEL +1 -1
- spiral/__init__.py +10 -3
- spiral/_lib.abi3.so +0 -0
- spiral/adbc.py +29 -11
- spiral/api/__init__.py +14 -0
- spiral/api/client.py +5 -1
- spiral/api/key_space_indexes.py +23 -0
- spiral/api/projects.py +17 -2
- spiral/api/text_indexes.py +56 -0
- spiral/api/types.py +2 -0
- spiral/api/workers.py +40 -0
- spiral/cli/__init__.py +15 -6
- spiral/cli/admin.py +2 -4
- spiral/cli/app.py +4 -2
- spiral/cli/fs.py +5 -6
- spiral/cli/iceberg.py +97 -0
- spiral/cli/key_spaces.py +68 -0
- spiral/cli/login.py +6 -7
- spiral/cli/orgs.py +7 -8
- spiral/cli/printer.py +3 -3
- spiral/cli/projects.py +5 -6
- spiral/cli/tables.py +131 -0
- spiral/cli/telemetry.py +3 -4
- spiral/cli/text.py +115 -0
- spiral/cli/types.py +3 -4
- spiral/cli/workloads.py +7 -8
- spiral/client.py +111 -8
- spiral/core/authn/__init__.pyi +27 -0
- spiral/core/client/__init__.pyi +135 -63
- spiral/core/table/__init__.pyi +36 -26
- spiral/core/table/metastore/__init__.pyi +0 -4
- spiral/core/table/spec/__init__.pyi +0 -2
- spiral/{tables/dataset.py → dataset.py} +13 -7
- spiral/{tables/debug → debug}/manifests.py +17 -6
- spiral/{tables/debug → debug}/scan.py +7 -7
- spiral/expressions/base.py +3 -3
- spiral/expressions/udf.py +1 -1
- spiral/{iceberg/client.py → iceberg.py} +1 -3
- spiral/key_space_index.py +44 -0
- spiral/project.py +171 -18
- spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +1668 -1110
- spiral/protogen/_/google/protobuf/__init__.py +2190 -0
- spiral/protogen/_/message_pool.py +3 -0
- spiral/protogen/_/py.typed +0 -0
- spiral/protogen/_/scandal/__init__.py +138 -126
- spiral/protogen/_/spfs/__init__.py +72 -0
- spiral/protogen/_/spql/__init__.py +61 -0
- spiral/protogen/_/substrait/__init__.py +5256 -2459
- spiral/protogen/_/substrait/extensions/__init__.py +103 -49
- spiral/{tables/scan.py → scan.py} +37 -44
- spiral/settings.py +14 -3
- spiral/snapshot.py +55 -0
- spiral/streaming_/__init__.py +3 -0
- spiral/streaming_/reader.py +117 -0
- spiral/streaming_/stream.py +146 -0
- spiral/substrait_.py +9 -9
- spiral/table.py +257 -0
- spiral/text_index.py +17 -0
- spiral/{tables/transaction.py → transaction.py} +11 -15
- pyspiral-0.4.4.dist-info/RECORD +0 -98
- spiral/cli/iceberg/__init__.py +0 -7
- spiral/cli/iceberg/namespaces.py +0 -47
- spiral/cli/iceberg/tables.py +0 -60
- spiral/cli/indexes/__init__.py +0 -19
- spiral/cli/tables/__init__.py +0 -121
- spiral/core/index/__init__.pyi +0 -15
- spiral/iceberg/__init__.py +0 -3
- spiral/indexes/__init__.py +0 -5
- spiral/indexes/client.py +0 -137
- spiral/indexes/index.py +0 -34
- spiral/indexes/scan.py +0 -22
- spiral/protogen/_/spiral/table/__init__.py +0 -22
- spiral/protogen/substrait/__init__.py +0 -3399
- spiral/protogen/substrait/extensions/__init__.py +0 -115
- spiral/tables/__init__.py +0 -12
- spiral/tables/client.py +0 -130
- spiral/tables/maintenance.py +0 -12
- spiral/tables/snapshot.py +0 -78
- spiral/tables/table.py +0 -145
- {pyspiral-0.4.4.dist-info → pyspiral-0.6.0.dist-info}/entry_points.txt +0 -0
- /spiral/{protogen/_/spiral → debug}/__init__.py +0 -0
- /spiral/{tables/debug → debug}/metrics.py +0 -0
- /spiral/{tables/debug → protogen/_/google}/__init__.py +0 -0
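The rename entries above flatten the 0.4.4 `spiral.tables` package into top-level modules (`spiral/tables/scan.py → spiral/scan.py`, `spiral/tables/dataset.py → spiral/dataset.py`, and so on). A minimal migration sketch for downstream imports; the 0.4.4 paths are inferred from the old layout shown in this list:

# pyspiral 0.4.4 (inferred old layout)
from spiral.tables.scan import Scan
from spiral.tables.dataset import Dataset

# pyspiral 0.6.0 (flattened layout per the renames above)
from spiral.scan import Scan
from spiral.dataset import Dataset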
spiral/protogen/_/substrait/extensions/__init__.py
CHANGED
@@ -1,115 +1,169 @@
 # Generated by the protocol buffer compiler. DO NOT EDIT!
 # sources: substrait/extensions/extensions.proto
-# plugin: python-betterproto
+# plugin: python-betterproto2
 # This file has been @generated
 
+__all__ = (
+    "AdvancedExtension",
+    "SimpleExtensionDeclaration",
+    "SimpleExtensionDeclarationExtensionFunction",
+    "SimpleExtensionDeclarationExtensionType",
+    "SimpleExtensionDeclarationExtensionTypeVariation",
+    "SimpleExtensionUri",
+)
+
 from dataclasses import dataclass
-from typing import List
 
-import betterproto
-import betterproto.lib.google.protobuf as betterproto_lib_google_protobuf
+import betterproto2
+
+from ...message_pool import default_message_pool
+
+_COMPILER_VERSION = "0.8.0"
+betterproto2.check_compiler_version(_COMPILER_VERSION)
 
 
 @dataclass(eq=False, repr=False)
-class SimpleExtensionUri(betterproto.Message):
-    extension_uri_anchor: int = betterproto.uint32_field(1)
+class AdvancedExtension(betterproto2.Message):
     """
-    A surrogate key used in the context of a single plan used to reference the
-    URI associated with an extension.
+    A generic object that can be used to embed additional extension information
+    into the serialized substrait plan.
     """
 
-    uri: str = betterproto.string_field(2)
+    optimization: "list[__google__protobuf__.Any]" = betterproto2.field(1, betterproto2.TYPE_MESSAGE, repeated=True)
     """
-    The URI where this extension YAML can be retrieved. This is the "namespace"
-    of this extension.
+    An optimization is helpful information that don't influence semantics. May
+    be ignored by a consumer.
+    """
+
+    enhancement: "__google__protobuf__.Any | None" = betterproto2.field(2, betterproto2.TYPE_MESSAGE, optional=True)
+    """
+    An enhancement alter semantics. Cannot be ignored by a consumer.
     """
 
 
+default_message_pool.register_message("substrait.extensions", "AdvancedExtension", AdvancedExtension)
+
+
 @dataclass(eq=False, repr=False)
-class SimpleExtensionDeclaration(betterproto.Message):
+class SimpleExtensionDeclaration(betterproto2.Message):
     """
     Describes a mapping between a specific extension entity and the uri where
-    that extension can be found.
+    that extension can be found.
+
+    Oneofs:
+        - mapping_type:
     """
 
-    extension_type: "SimpleExtensionDeclarationExtensionType" = (
-        betterproto.message_field(1, group="mapping_type")
+    extension_type: "SimpleExtensionDeclarationExtensionType | None" = betterproto2.field(
+        1, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
     )
-    extension_type_variation: "SimpleExtensionDeclarationExtensionTypeVariation" = (
-        betterproto.message_field(2, group="mapping_type")
+
+    extension_type_variation: "SimpleExtensionDeclarationExtensionTypeVariation | None" = betterproto2.field(
+        2, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
     )
-    extension_function: "SimpleExtensionDeclarationExtensionFunction" = (
-        betterproto.message_field(3, group="mapping_type")
+
+    extension_function: "SimpleExtensionDeclarationExtensionFunction | None" = betterproto2.field(
+        3, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
     )
 
 
-@dataclass(eq=False, repr=False)
-class SimpleExtensionDeclarationExtensionType(betterproto.Message):
-    """Describes a Type"""
+default_message_pool.register_message("substrait.extensions", "SimpleExtensionDeclaration", SimpleExtensionDeclaration)
+
 
-    extension_uri_reference: int = betterproto.uint32_field(1)
+@dataclass(eq=False, repr=False)
+class SimpleExtensionDeclarationExtensionFunction(betterproto2.Message):
+    extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
     references the extension_uri_anchor defined for a specific extension URI.
     """
 
-    type_anchor: int = betterproto.uint32_field(2)
+    function_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
     """
     A surrogate key used in the context of a single plan to reference a
-    specific extension type
+    specific function
+    """
+
+    name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
     """
+    A function signature compound name
+    """
+
 
-    name: str = betterproto.string_field(3)
-    """the name of the type in the defined extension YAML."""
+default_message_pool.register_message(
+    "substrait.extensions", "SimpleExtensionDeclaration.ExtensionFunction", SimpleExtensionDeclarationExtensionFunction
+)
 
 
 @dataclass(eq=False, repr=False)
-class SimpleExtensionDeclarationExtensionTypeVariation(betterproto.Message):
-    extension_uri_reference: int = betterproto.uint32_field(1)
+class SimpleExtensionDeclarationExtensionType(betterproto2.Message):
+    """
+    Describes a Type
+    """
+
+    extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
     references the extension_uri_anchor defined for a specific extension URI.
     """
 
-    type_variation_anchor: int = betterproto.uint32_field(2)
+    type_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
     """
     A surrogate key used in the context of a single plan to reference a
-    specific type variation
+    specific extension type
+    """
+
+    name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
+    """
+    the name of the type in the defined extension YAML.
     """
 
-    name: str = betterproto.string_field(3)
-    """the name of the type in the defined extension YAML."""
+
+default_message_pool.register_message(
+    "substrait.extensions", "SimpleExtensionDeclaration.ExtensionType", SimpleExtensionDeclarationExtensionType
+)
 
 
 @dataclass(eq=False, repr=False)
-class SimpleExtensionDeclarationExtensionFunction(betterproto.Message):
-    extension_uri_reference: int = betterproto.uint32_field(1)
+class SimpleExtensionDeclarationExtensionTypeVariation(betterproto2.Message):
+    extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
     references the extension_uri_anchor defined for a specific extension URI.
     """
 
-    function_anchor: int = betterproto.uint32_field(2)
+    type_variation_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
     """
     A surrogate key used in the context of a single plan to reference a
-    specific function
+    specific type variation
+    """
+
+    name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
+    """
+    the name of the type in the defined extension YAML.
     """
 
-    name: str = betterproto.string_field(3)
-    """A function signature compound name"""
+
+default_message_pool.register_message(
+    "substrait.extensions",
+    "SimpleExtensionDeclaration.ExtensionTypeVariation",
+    SimpleExtensionDeclarationExtensionTypeVariation,
+)
 
 
 @dataclass(eq=False, repr=False)
-class AdvancedExtension(betterproto.Message):
+class SimpleExtensionUri(betterproto2.Message):
+    extension_uri_anchor: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
-    A generic object that can be used to embed additional extension information
-    into the serialized substrait plan.
+    A surrogate key used in the context of a single plan used to reference the
+    URI associated with an extension.
     """
 
-    optimization: List[
-        "betterproto_lib_google_protobuf.Any"
-    ] = betterproto.message_field(1)
+    uri: "str" = betterproto2.field(2, betterproto2.TYPE_STRING)
     """
-    An optimization is helpful information that don't influence semantics. May
-    be ignored by a consumer.
+    The URI where this extension YAML can be retrieved. This is the "namespace"
+    of this extension.
     """
 
-    enhancement: "betterproto_lib_google_protobuf.Any" = betterproto.message_field(2)
-    """An enhancement alter semantics. Cannot be ignored by a consumer."""
+
+default_message_pool.register_message("substrait.extensions", "SimpleExtensionURI", SimpleExtensionUri)
+
+
+from ...google import protobuf as __google__protobuf__
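The hunk above is the betterproto → betterproto2 regeneration: fields now carry explicit wire types (`betterproto2.field(1, betterproto2.TYPE_UINT32)`), oneof members become `optional` fields sharing a `group`, and every message registers itself in `default_message_pool`. A hedged round-trip sketch with the regenerated classes; the URI value is made up, and the `bytes()`/`parse()` round-trip assumes betterproto2 keeps betterproto's serialization API, which this diff does not itself show:

from spiral.protogen._.substrait.extensions import SimpleExtensionUri

# Anchor 1 can be referenced later by SimpleExtensionDeclaration entries.
ext = SimpleExtensionUri(extension_uri_anchor=1, uri="https://example.com/extension.yaml")
wire = bytes(ext)                         # serialize to protobuf wire format
back = SimpleExtensionUri().parse(wire)   # betterproto-style parse; assumed unchanged
assert back.uri == ext.uri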
spiral/{tables/scan.py → scan.py}
RENAMED
@@ -2,40 +2,36 @@ from collections.abc import Iterator
 from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
-from datasets import DatasetInfo, Features
 
-from spiral.core.table import KeyRange, TableScan
+from spiral.core.table import KeyRange, ShuffleStrategy
+from spiral.core.table import Scan as CoreScan
 from spiral.core.table.spec import Schema
 from spiral.settings import CI, DEV
 
 if TYPE_CHECKING:
     import dask.dataframe as dd
+    import datasets.iterable_dataset as hf  # noqa
     import pandas as pd
     import polars as pl
-
+    import streaming  # noqa
+    import torch.utils.data as torchdata  # noqa
 
 
 class Scan:
     """Scan object."""
 
-    def __init__(
-        self,
-        scan: TableScan,
-    ):
-        # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
-        # when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
-        # and will re-resolve the asof? Or should we just expose a scan that fixes the asof at construction time?
-        self._scan = scan
+    def __init__(self, core: CoreScan):
+        self.core = core
 
     @property
     def metrics(self) -> dict[str, Any]:
         """Returns metrics about the scan."""
-        return self._scan.metrics()
+        return self.core.metrics()
 
     @property
     def schema(self) -> Schema:
         """Returns the schema of the scan."""
-        return self._scan.schema()
+        return self.core.schema()
 
     def is_empty(self) -> bool:
         """Check if the Spiral is empty for the given key range.
@@ -43,7 +39,7 @@ class Scan:
         **IMPORTANT**: False negatives are possible, but false positives are not,
         i.e. is_empty can return False and scan can return zero rows.
         """
-        return self._scan.is_empty()
+        return self.core.is_empty()
 
     def to_record_batches(
         self,
@@ -69,7 +65,7 @@ class Scan:
         elif isinstance(key_table, pa.Table):
             key_table = key_table.to_reader(max_chunksize=batch_size)
 
-        return self._scan.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
+        return self.core.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
 
     def to_table(
         self,
@@ -83,7 +79,7 @@ class Scan:
         """
         # NOTE: Evaluates fully on Rust side which improved debuggability.
         if DEV and not CI and key_table is None:
-            rb = self._scan.to_record_batch()
+            rb = self.core.to_record_batch()
             return pa.Table.from_batches([rb])
 
         return self.to_record_batches(key_table=key_table).read_all()
@@ -97,11 +93,11 @@ class Scan:
         import pandas as pd
 
         def _read_key_range(key_range: KeyRange) -> pd.DataFrame:
-            # TODO(ngates): we need a way to preserve the existing asofs?
+            # TODO(ngates): we need a way to preserve the existing asofs?
            raise NotImplementedError()
 
         # Fetch a set of partition ranges
-        return dd.from_map(_read_key_range, self.splits())
+        return dd.from_map(_read_key_range, self._splits())
 
     def to_pandas(self) -> "pd.DataFrame":
         """Read into a Pandas DataFrame.
@@ -117,34 +113,31 @@ class Scan:
         """
         import polars as pl
 
-        # TODO(marko): This should support lazy dataframe.
         return pl.from_arrow(self.to_record_batches())
 
-    def to_iterable_dataset(
+    def to_iterable_dataset(
         self,
+        shuffle: ShuffleStrategy | None = None,
         batch_readahead: int | None = None,
-        shuffle_batch_size: int | None = None,
-        shuffle_pool_num_rows: int | None = None,
-    ) -> "IterableDataset":
-        """Returns an Huggingface's IterableDataset.
+    ) -> "hf.IterableDataset":
+        """Returns an Huggingface's IterableDataset.
+
+        Requires `datasets` package to be installed.
 
         Args:
-
-
-
-
+            shuffle: Controls sample shuffling. If None, no shuffling is performed.
+            batch_readahead: Controls how many batches to read ahead concurrently.
+                If pipeline includes work after reading (e.g. decoding, transforming, ...) this can be set higher.
+                Otherwise, it should be kept low to reduce next batch latency. Defaults to 2.
         """
+        from datasets import DatasetInfo, Features
         from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset
 
         def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
-
-
-
-
-            else:
-                stream = self._scan.to_shuffled_record_batches(
-                    batch_readahead, shuffle_batch_size, shuffle_pool_num_rows
-                )
+            stream = self.core.to_shuffled_record_batches(
+                shuffle,
+                batch_readahead,
+            )
 
         # This key is unused when training with IterableDataset.
         # Default implementation returns shard id, e.g. parquet row group id.
@@ -166,28 +159,28 @@ class Scan:
            return pa.schema(new_fields)
 
        # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
-        ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
+        ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})  # type: ignore
         info = DatasetInfo(features=Features.from_arrow_schema(_hf_compatible_schema(self.schema.to_arrow())))
         return IterableDataset(ex_iterable=ex_iterable, info=info)
 
-    def splits(self) -> list[KeyRange]:
+    def _splits(self) -> list[KeyRange]:
         # Splits the scan into a set of key ranges.
-        return self._scan.splits()
+        return self.core.splits()
 
     def _debug(self):
         # Visualizes the scan, mainly for debugging purposes.
-        from spiral.tables.debug.scan import show_scan
+        from spiral.debug.scan import show_scan
 
-        show_scan(self._scan)
+        show_scan(self.core)
 
     def _dump_manifests(self):
         # Print manifests in a human-readable format.
-        from spiral.tables.debug.manifests import display_scan_manifests
+        from spiral.debug.manifests import display_scan_manifests
 
-        display_scan_manifests(self._scan)
+        display_scan_manifests(self._core)
 
     def _dump_metrics(self):
         # Print metrics in a human-readable format.
-        from spiral.tables.debug.metrics import display_metrics
+        from spiral.debug.metrics import display_metrics
 
         display_metrics(self.metrics)
spiral/settings.py
CHANGED
@@ -1,7 +1,7 @@
 import functools
 import os
 from pathlib import Path
-from typing import Annotated
+from typing import TYPE_CHECKING, Annotated
 
 import typer
 from pydantic import Field, ValidatorFunctionWrapHandler, WrapValidator
@@ -12,8 +12,11 @@ from pydantic_settings import (
     SettingsConfigDict,
 )
 
-from spiral.api import SpiralAPI
-from spiral.core.client import Authn, DeviceCodeAuth, Token
+from spiral.core.authn import Authn, DeviceCodeAuth, Token
+from spiral.core.client import Spiral
+
+if TYPE_CHECKING:
+    from spiral.api import SpiralAPI
 
 DEV = "PYTEST_VERSION" in os.environ or bool(os.environ.get("SPIRAL_DEV", None))
 CI = "GITHUB_ACTIONS" in os.environ
@@ -74,6 +77,14 @@ class Settings(BaseSettings):
 
         return SpiralAPI(self.authn, base_url=self.spiraldb.uri)
 
+    @functools.cached_property
+    def core(self) -> Spiral:
+        return Spiral(
+            api_url=self.spiraldb.uri,
+            spfs_url=self.spfs.uri,
+            authn=self.authn,
+        )
+
     @functools.cached_property
     def authn(self):
         if self.spiraldb.token:
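The new `Settings.core` property builds the Rust-backed `Spiral` client from the same settings that back the HTTP `SpiralAPI` client, and caches it per Settings instance. A minimal sketch; constructing `Settings()` directly is an assumption, in practice values come from env/config:

from spiral.settings import Settings

s = Settings()         # hypothetical direct construction
core = s.core          # Spiral(api_url=s.spiraldb.uri, spfs_url=s.spfs.uri, authn=s.authn)
assert core is s.core  # functools.cached_property: built once per Settings instance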
spiral/snapshot.py
ADDED
@@ -0,0 +1,55 @@
+from typing import TYPE_CHECKING
+
+from spiral.core.table import Snapshot as CoreSnapshot
+from spiral.core.table.spec import Schema
+from spiral.types_ import Timestamp
+
+if TYPE_CHECKING:
+    import duckdb
+    import polars as pl
+    import pyarrow.dataset as ds
+
+    from spiral.table import Table
+
+
+class Snapshot:
+    """Spiral table snapshot.
+
+    A snapshot represents a point-in-time view of a table.
+    """
+
+    def __init__(self, table: "Table", core: CoreSnapshot):
+        self.core = core
+        self._table = table
+
+    @property
+    def asof(self) -> Timestamp:
+        """Returns the asof timestamp of the snapshot."""
+        return self.core.asof
+
+    def schema(self) -> Schema:
+        """Returns the schema of the snapshot."""
+        return self.core.table.get_schema(asof=self.asof)
+
+    @property
+    def table(self) -> "Table":
+        """Returns the table associated with the snapshot."""
+        return self._table
+
+    def to_dataset(self) -> "ds.Dataset":
+        """Returns a PyArrow Dataset representing the table."""
+        from spiral.dataset import Dataset
+
+        return Dataset(self)
+
+    def to_polars(self) -> "pl.LazyFrame":
+        """Returns a Polars LazyFrame for the Spiral table."""
+        import polars as pl
+
+        return pl.scan_pyarrow_dataset(self.to_dataset())
+
+    def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
+        """Returns a DuckDB relation for the Spiral table."""
+        import duckdb
+
+        return duckdb.from_arrow(self.to_dataset())
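`Snapshot` fans one point-in-time view out to three engines, all layered over `to_dataset()`. A hedged sketch; how the snapshot is obtained from a `Table` is not shown in this file:

# `snap` is a spiral.snapshot.Snapshot obtained from a Table (assumed).
print(snap.asof)        # the timestamp this view is pinned to

lf = snap.to_polars()   # LazyFrame via pl.scan_pyarrow_dataset(...)
df = lf.collect()       # executes against the PyArrow dataset

rel = snap.to_duckdb()  # relation via duckdb.from_arrow(...)
rel.limit(10).show()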
spiral/streaming_/reader.py
ADDED
@@ -0,0 +1,117 @@
+import dataclasses
+import functools
+import os
+from typing import Any
+
+import vortex as vx
+
+from spiral.core.client import Shard
+
+
+# Fake streaming.base.format.base.reader.FileInfo
+# Dataset manages decompression instead of the Stream in MDS.
+# So we return our own fake FileInfo that has None for compressed file.
+@dataclasses.dataclass
+class FileInfo:
+    basename: str
+    hashes: dict[str, str] = dataclasses.field(default_factory=dict)
+
+    @property
+    def bytes(self):
+        raise NotImplementedError("FileInfo.bytes should NOT be called.")
+
+
+class SpiralReader:
+    """
+    An MDS (streaming) compatible Reader.
+    """
+
+    def __init__(self, shard: Shard, basepath):
+        self._shard = shard
+        self._cardinality = shard.cardinality
+        self._basepath = basepath
+        self._scan: vx.RepeatedScan | None = None
+
+    @property
+    def shard(self) -> Shard:
+        return self._shard
+
+    @property
+    def size(self):
+        """Get the number of samples in this shard.
+
+        Returns:
+            int: Sample count.
+        """
+        return self._cardinality
+
+    @property
+    def samples(self):
+        """Get the number of samples in this shard.
+
+        Returns:
+            int: Sample count.
+        """
+        return self._cardinality
+
+    def __len__(self) -> int:
+        """Get the number of samples in this shard.
+
+        Returns:
+            int: Sample count.
+        """
+        return self._cardinality
+
+    @property
+    def file_pairs(self) -> list[tuple[FileInfo, FileInfo | None]]:
+        """Get the infos from raw and compressed file.
+
+        MDS uses this because dataset manages decompression of the shards, not stream...
+        """
+        return [(FileInfo(basename=self.filename), None)]
+
+    @functools.cached_property
+    def filename(self) -> str:
+        """Used by SpiralStream to identify shard's file-on-disk, if it exists."""
+        # TODO(marko): This might be too long...
+        return (
+            bytes(self._shard.key_range.begin).hex()
+            + "_"
+            + bytes(self._shard.key_range.end).hex()
+            + "_"
+            + str(self._shard.cardinality)
+            + ".vortex"
+        )
+
+    @functools.cached_property
+    def filepath(self) -> str:
+        """Full path to the shard's file-on-disk, if it exists."""
+        return os.path.join(self._basepath, self.filename)
+
+    def evict(self) -> int:
+        """Remove all files belonging to this shard."""
+
+        # Clean up the scan handle first. This will make sure memory is freed.
+        self._scan = None
+
+        # Try to evict file.
+        try:
+            stat = os.stat(self.filepath)
+            os.remove(self.filepath)
+            return stat.st_size
+        except FileNotFoundError:
+            # Nothing to evict.
+            return 0
+
+    def __getitem__(self, item):
+        return self.get_item(item)
+
+    def get_item(self, idx: int) -> dict[str, Any]:
+        if self._scan is None:
+            # TODO(marko): vx.open should throw FileNotFoundError instead of
+            # ValueError: No such file or directory (os error 2)
+            # Check if shard is ready on disk. This must throw FileNotFoundError.
+            if not os.path.exists(self.filepath):
+                raise FileNotFoundError(f"Shard not found: {self.filepath}")
+            self._scan = vx.open(self.filepath, without_segment_cache=True).to_repeated_scan()
+        return self._scan.scalar_at(idx).as_py()
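`SpiralReader` implements just enough of the MDS (mosaicml-streaming) shard-reader surface: sample counts via `size`/`samples`/`__len__`, on-disk bookkeeping via `file_pairs`/`filename`/`evict`, and per-sample access via `get_item`, where `FileNotFoundError` signals that the shard's `.vortex` file has not been materialized yet. A hedged sketch of that contract; a real `Shard` comes from `spiral.core.client` and is a stand-in here:

import tempfile

# `shard` is a spiral.core.client.Shard handed out by the stream (assumed).
reader = SpiralReader(shard, basepath=tempfile.gettempdir())

try:
    sample = reader[0]        # dict decoded from the shard via Vortex scalar_at(0)
except FileNotFoundError:
    pass                      # shard not on disk yet; the stream downloads it first

freed = reader.evict()        # drops the scan handle and removes the .vortex file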