pyspiral-0.5.0-cp310-abi3-macosx_11_0_arm64.whl → pyspiral-0.6.1-cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyspiral-0.5.0.dist-info → pyspiral-0.6.1.dist-info}/METADATA +7 -3
- pyspiral-0.6.1.dist-info/RECORD +99 -0
- {pyspiral-0.5.0.dist-info → pyspiral-0.6.1.dist-info}/WHEEL +1 -1
- spiral/__init__.py +11 -3
- spiral/_lib.abi3.so +0 -0
- spiral/adbc.py +6 -6
- spiral/api/__init__.py +8 -2
- spiral/api/client.py +1 -1
- spiral/api/key_space_indexes.py +23 -0
- spiral/api/projects.py +15 -0
- spiral/api/text_indexes.py +1 -1
- spiral/cli/__init__.py +15 -6
- spiral/cli/admin.py +2 -4
- spiral/cli/app.py +4 -2
- spiral/cli/fs.py +5 -6
- spiral/cli/iceberg.py +97 -0
- spiral/cli/key_spaces.py +89 -0
- spiral/cli/login.py +6 -7
- spiral/cli/orgs.py +7 -8
- spiral/cli/printer.py +3 -3
- spiral/cli/projects.py +5 -6
- spiral/cli/tables.py +131 -0
- spiral/cli/telemetry.py +3 -4
- spiral/cli/text.py +115 -0
- spiral/cli/types.py +3 -4
- spiral/cli/workloads.py +7 -8
- spiral/client.py +111 -8
- spiral/core/authn/__init__.pyi +27 -0
- spiral/core/client/__init__.pyi +152 -63
- spiral/core/table/__init__.pyi +17 -27
- spiral/core/table/metastore/__init__.pyi +0 -4
- spiral/core/table/spec/__init__.pyi +0 -2
- spiral/{tables/dataset.py → dataset.py} +13 -7
- spiral/{tables/debug → debug}/manifests.py +15 -6
- spiral/{tables/debug → debug}/scan.py +3 -3
- spiral/expressions/base.py +3 -3
- spiral/expressions/udf.py +1 -1
- spiral/{iceberg/client.py → iceberg.py} +1 -3
- spiral/key_space_index.py +44 -0
- spiral/project.py +171 -18
- spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +1668 -1110
- spiral/protogen/_/google/protobuf/__init__.py +2190 -0
- spiral/protogen/_/message_pool.py +3 -0
- spiral/protogen/_/py.typed +0 -0
- spiral/protogen/_/scandal/__init__.py +138 -126
- spiral/protogen/_/spfs/__init__.py +72 -0
- spiral/protogen/_/spql/__init__.py +61 -0
- spiral/protogen/_/substrait/__init__.py +5256 -2459
- spiral/protogen/_/substrait/extensions/__init__.py +103 -49
- spiral/{tables/scan.py → scan.py} +38 -44
- spiral/settings.py +14 -3
- spiral/snapshot.py +55 -0
- spiral/streaming_/__init__.py +3 -0
- spiral/streaming_/reader.py +131 -0
- spiral/streaming_/stream.py +146 -0
- spiral/substrait_.py +9 -9
- spiral/table.py +259 -0
- spiral/text_index.py +17 -0
- spiral/{tables/transaction.py → transaction.py} +11 -15
- pyspiral-0.5.0.dist-info/RECORD +0 -103
- spiral/cli/iceberg/__init__.py +0 -7
- spiral/cli/iceberg/namespaces.py +0 -47
- spiral/cli/iceberg/tables.py +0 -60
- spiral/cli/indexes/__init__.py +0 -40
- spiral/cli/indexes/args.py +0 -39
- spiral/cli/indexes/workers.py +0 -59
- spiral/cli/tables/__init__.py +0 -88
- spiral/cli/tables/args.py +0 -42
- spiral/core/index/__init__.pyi +0 -7
- spiral/iceberg/__init__.py +0 -3
- spiral/indexes/__init__.py +0 -5
- spiral/indexes/client.py +0 -137
- spiral/indexes/index.py +0 -28
- spiral/indexes/scan.py +0 -22
- spiral/protogen/_/spiral/table/__init__.py +0 -22
- spiral/protogen/substrait/__init__.py +0 -3399
- spiral/protogen/substrait/extensions/__init__.py +0 -115
- spiral/tables/__init__.py +0 -12
- spiral/tables/client.py +0 -133
- spiral/tables/maintenance.py +0 -12
- spiral/tables/snapshot.py +0 -78
- spiral/tables/table.py +0 -145
- {pyspiral-0.5.0.dist-info → pyspiral-0.6.1.dist-info}/entry_points.txt +0 -0
- /spiral/{protogen/_/spiral → debug}/__init__.py +0 -0
- /spiral/{tables/debug → debug}/metrics.py +0 -0
- /spiral/{tables/debug → protogen/_/google}/__init__.py +0 -0
spiral/protogen/_/substrait/extensions/__init__.py
CHANGED

@@ -1,115 +1,169 @@
 # Generated by the protocol buffer compiler. DO NOT EDIT!
 # sources: substrait/extensions/extensions.proto
-# plugin: python-
+# plugin: python-betterproto2
 # This file has been @generated

+__all__ = (
+    "AdvancedExtension",
+    "SimpleExtensionDeclaration",
+    "SimpleExtensionDeclarationExtensionFunction",
+    "SimpleExtensionDeclarationExtensionType",
+    "SimpleExtensionDeclarationExtensionTypeVariation",
+    "SimpleExtensionUri",
+)
+
 from dataclasses import dataclass
-from typing import List

-import
-
+import betterproto2
+
+from ...message_pool import default_message_pool
+
+_COMPILER_VERSION = "0.8.0"
+betterproto2.check_compiler_version(_COMPILER_VERSION)


 @dataclass(eq=False, repr=False)
-class
-    extension_uri_anchor: int = betterproto.uint32_field(1)
+class AdvancedExtension(betterproto2.Message):
     """
-    A
-
+    A generic object that can be used to embed additional extension information
+    into the serialized substrait plan.
     """

-
+    optimization: "list[__google__protobuf__.Any]" = betterproto2.field(1, betterproto2.TYPE_MESSAGE, repeated=True)
     """
-
-
+    An optimization is helpful information that don't influence semantics. May
+    be ignored by a consumer.
+    """
+
+    enhancement: "__google__protobuf__.Any | None" = betterproto2.field(2, betterproto2.TYPE_MESSAGE, optional=True)
+    """
+    An enhancement alter semantics. Cannot be ignored by a consumer.
     """


+default_message_pool.register_message("substrait.extensions", "AdvancedExtension", AdvancedExtension)
+
+
 @dataclass(eq=False, repr=False)
-class SimpleExtensionDeclaration(
+class SimpleExtensionDeclaration(betterproto2.Message):
     """
     Describes a mapping between a specific extension entity and the uri where
-
+    that extension can be found.
+
+    Oneofs:
+    - mapping_type:
     """

-    extension_type: "SimpleExtensionDeclarationExtensionType" = (
-
+    extension_type: "SimpleExtensionDeclarationExtensionType | None" = betterproto2.field(
+        1, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
     )
-
-
+
+    extension_type_variation: "SimpleExtensionDeclarationExtensionTypeVariation | None" = betterproto2.field(
+        2, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
     )
-
-
+
+    extension_function: "SimpleExtensionDeclarationExtensionFunction | None" = betterproto2.field(
+        3, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
     )


-
-
-    """Describes a Type"""
+default_message_pool.register_message("substrait.extensions", "SimpleExtensionDeclaration", SimpleExtensionDeclaration)
+

-
+@dataclass(eq=False, repr=False)
+class SimpleExtensionDeclarationExtensionFunction(betterproto2.Message):
+    extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
     references the extension_uri_anchor defined for a specific extension URI.
     """

-
+    function_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
     """
     A surrogate key used in the context of a single plan to reference a
-
+    specific function
+    """
+
+    name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
     """
+    A function signature compound name
+    """
+

-
-    """
+default_message_pool.register_message(
+    "substrait.extensions", "SimpleExtensionDeclaration.ExtensionFunction", SimpleExtensionDeclarationExtensionFunction
+)


 @dataclass(eq=False, repr=False)
-class
-
+class SimpleExtensionDeclarationExtensionType(betterproto2.Message):
+    """
+    Describes a Type
+    """
+
+    extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
     references the extension_uri_anchor defined for a specific extension URI.
     """

-
+    type_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
     """
     A surrogate key used in the context of a single plan to reference a
-
+    specific extension type
+    """
+
+    name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
+    """
+    the name of the type in the defined extension YAML.
     """

-
-
+
+default_message_pool.register_message(
+    "substrait.extensions", "SimpleExtensionDeclaration.ExtensionType", SimpleExtensionDeclarationExtensionType
+)


 @dataclass(eq=False, repr=False)
-class
-    extension_uri_reference: int =
+class SimpleExtensionDeclarationExtensionTypeVariation(betterproto2.Message):
+    extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
     references the extension_uri_anchor defined for a specific extension URI.
     """

-
+    type_variation_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
     """
     A surrogate key used in the context of a single plan to reference a
-
+    specific type variation
+    """
+
+    name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
+    """
+    the name of the type in the defined extension YAML.
     """

-
-
+
+default_message_pool.register_message(
+    "substrait.extensions",
+    "SimpleExtensionDeclaration.ExtensionTypeVariation",
+    SimpleExtensionDeclarationExtensionTypeVariation,
+)


 @dataclass(eq=False, repr=False)
-class
+class SimpleExtensionUri(betterproto2.Message):
+    extension_uri_anchor: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
-    A
-
+    A surrogate key used in the context of a single plan used to reference the
+    URI associated with an extension.
     """

-
-        "betterproto_lib_google_protobuf.Any"
-    ] = betterproto.message_field(1)
+    uri: "str" = betterproto2.field(2, betterproto2.TYPE_STRING)
     """
-
-
+    The URI where this extension YAML can be retrieved. This is the "namespace"
+    of this extension.
     """

-
-
+
+default_message_pool.register_message("substrait.extensions", "SimpleExtensionURI", SimpleExtensionUri)
+
+
+from ...google import protobuf as __google__protobuf__
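Note: a minimal usage sketch of the regenerated messages above. The module path matches this diff, but the URI, anchor values, and function name are made up, and the bytes()/parse() round-trip assumes betterproto2 keeps betterproto's serialization API.

from spiral.protogen._.substrait.extensions import (
    SimpleExtensionDeclaration,
    SimpleExtensionDeclarationExtensionFunction,
    SimpleExtensionUri,
)

# Anchor an extension YAML, then declare a function from it (illustrative values).
uri = SimpleExtensionUri(extension_uri_anchor=1, uri="https://example.com/functions.yaml")
decl = SimpleExtensionDeclaration(
    # The mapping_type oneof members are plain optional fields; set exactly one.
    extension_function=SimpleExtensionDeclarationExtensionFunction(
        extension_uri_reference=1,
        function_anchor=10,
        name="add:i64_i64",
    )
)

payload = bytes(decl)  # assumed: betterproto2 messages serialize via bytes(), as betterproto did
restored = SimpleExtensionDeclaration().parse(payload)
assert restored.extension_function is not None and restored.extension_function.name == "add:i64_i64"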
spiral/{tables/scan.py → scan.py}
RENAMED

@@ -2,40 +2,37 @@ from collections.abc import Iterator
 from typing import TYPE_CHECKING, Any

 import pyarrow as pa
-from datasets import DatasetInfo, Features

-from spiral.core.
+from spiral.core.client import ShuffleStrategy
+from spiral.core.table import KeyRange
+from spiral.core.table import Scan as CoreScan
 from spiral.core.table.spec import Schema
 from spiral.settings import CI, DEV

 if TYPE_CHECKING:
     import dask.dataframe as dd
+    import datasets.iterable_dataset as hf  # noqa
     import pandas as pd
     import polars as pl
-
+    import streaming  # noqa
+    import torch.utils.data as torchdata  # noqa


 class Scan:
     """Scan object."""

-    def __init__(
-        self
-        scan: TableScan,
-    ):
-        # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
-        # when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
-        # and will re-resolve the asof? Or should we just expose a scan that fixes the asof at construction time?
-        self._scan = scan
+    def __init__(self, core: CoreScan):
+        self.core = core

     @property
     def metrics(self) -> dict[str, Any]:
         """Returns metrics about the scan."""
-        return self.
+        return self.core.metrics()

     @property
     def schema(self) -> Schema:
         """Returns the schema of the scan."""
-        return self.
+        return self.core.schema()

     def is_empty(self) -> bool:
         """Check if the Spiral is empty for the given key range.
@@ -43,7 +40,7 @@ class Scan:
         **IMPORTANT**: False negatives are possible, but false positives are not,
         i.e. is_empty can return False and scan can return zero rows.
         """
-        return self.
+        return self.core.is_empty()

     def to_record_batches(
         self,
@@ -69,7 +66,7 @@ class Scan:
         elif isinstance(key_table, pa.Table):
             key_table = key_table.to_reader(max_chunksize=batch_size)

-        return self.
+        return self.core.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)

     def to_table(
         self,
@@ -83,7 +80,7 @@ class Scan:
         """
         # NOTE: Evaluates fully on Rust side which improved debuggability.
         if DEV and not CI and key_table is None:
-            rb = self.
+            rb = self.core.to_record_batch()
             return pa.Table.from_batches([rb])

         return self.to_record_batches(key_table=key_table).read_all()
@@ -97,11 +94,11 @@ class Scan:
         import pandas as pd

         def _read_key_range(key_range: KeyRange) -> pd.DataFrame:
-            # TODO(ngates): we need a way to preserve the existing asofs?
+            # TODO(ngates): we need a way to preserve the existing asofs?
             raise NotImplementedError()

         # Fetch a set of partition ranges
-        return dd.from_map(_read_key_range, self.
+        return dd.from_map(_read_key_range, self._splits())

     def to_pandas(self) -> "pd.DataFrame":
         """Read into a Pandas DataFrame.
@@ -117,34 +114,31 @@ class Scan:
         """
         import polars as pl

-        # TODO(marko): This should support lazy dataframe.
         return pl.from_arrow(self.to_record_batches())

-    def
+    def to_iterable_dataset(
         self,
+        shuffle: ShuffleStrategy | None = None,
         batch_readahead: int | None = None,
-
-
-
-
+    ) -> "hf.IterableDataset":
+        """Returns an Huggingface's IterableDataset.
+
+        Requires `datasets` package to be installed.

         Args:
-
-
-
-
+            shuffle: Controls sample shuffling. If None, no shuffling is performed.
+            batch_readahead: Controls how many batches to read ahead concurrently.
+                If pipeline includes work after reading (e.g. decoding, transforming, ...) this can be set higher.
+                Otherwise, it should be kept low to reduce next batch latency. Defaults to 2.
         """
+        from datasets import DatasetInfo, Features
         from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset

         def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
-
-
-
-
-            else:
-                stream = self._scan.to_shuffled_record_batches(
-                    batch_readahead, shuffle_batch_size, shuffle_pool_num_rows
-                )
+            stream = self.core.to_shuffled_record_batches(
+                shuffle,
+                batch_readahead,
+            )

             # This key is unused when training with IterableDataset.
             # Default implementation returns shard id, e.g. parquet row group id.
@@ -166,28 +160,28 @@ class Scan:
             return pa.schema(new_fields)

         # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
-        ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
+        ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})  # type: ignore
         info = DatasetInfo(features=Features.from_arrow_schema(_hf_compatible_schema(self.schema.to_arrow())))
         return IterableDataset(ex_iterable=ex_iterable, info=info)

-    def
+    def _splits(self) -> list[KeyRange]:
         # Splits the scan into a set of key ranges.
-        return self.
+        return self.core.splits()

     def _debug(self):
         # Visualizes the scan, mainly for debugging purposes.
-        from spiral.
+        from spiral.debug.scan import show_scan

-        show_scan(self.
+        show_scan(self.core)

     def _dump_manifests(self):
         # Print manifests in a human-readable format.
-        from spiral.
+        from spiral.debug.manifests import display_scan_manifests

-
+        display_scan_manifests(self.core)

     def _dump_metrics(self):
         # Print metrics in a human-readable format.
-        from spiral.
+        from spiral.debug.metrics import display_metrics

         display_metrics(self.metrics)
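Note: a hedged sketch of the renamed spiral.scan.Scan in use. How a Scan is obtained (e.g. from a table) is not part of this diff, so the scan argument is taken as given; head() relies only on methods shown above.

import pyarrow as pa

from spiral.scan import Scan

def head(scan: Scan, n: int = 10) -> pa.Table:
    """Materialize up to the first n rows through the Arrow reader."""
    batches, remaining = [], n
    for batch in scan.to_record_batches():  # a RecordBatchReader is iterable
        batches.append(batch.slice(0, min(remaining, batch.num_rows)))
        remaining -= batches[-1].num_rows
        if remaining <= 0:
            break
    return pa.Table.from_batches(batches, schema=scan.schema.to_arrow())

# Training path: ds = scan.to_iterable_dataset(batch_readahead=4)  # shuffle=None performs no shuffling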
spiral/settings.py
CHANGED

@@ -1,7 +1,7 @@
 import functools
 import os
 from pathlib import Path
-from typing import Annotated
+from typing import TYPE_CHECKING, Annotated

 import typer
 from pydantic import Field, ValidatorFunctionWrapHandler, WrapValidator
@@ -12,8 +12,11 @@ from pydantic_settings import (
     SettingsConfigDict,
 )

-from spiral.
-from spiral.core.client import
+from spiral.core.authn import Authn, DeviceCodeAuth, Token
+from spiral.core.client import Spiral
+
+if TYPE_CHECKING:
+    from spiral.api import SpiralAPI

 DEV = "PYTEST_VERSION" in os.environ or bool(os.environ.get("SPIRAL_DEV", None))
 CI = "GITHUB_ACTIONS" in os.environ
@@ -74,6 +77,14 @@ class Settings(BaseSettings):

         return SpiralAPI(self.authn, base_url=self.spiraldb.uri)

+    @functools.cached_property
+    def core(self) -> Spiral:
+        return Spiral(
+            api_url=self.spiraldb.uri,
+            spfs_url=self.spfs.uri,
+            authn=self.authn,
+        )
+
     @functools.cached_property
     def authn(self):
         if self.spiraldb.token:
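Note: what the new Settings.core property buys, sketched under the assumption that Settings can be constructed from defaults and environment variables as is usual for pydantic-settings.

from spiral.settings import Settings

settings = Settings()         # assumed: fields resolve from defaults or env vars
core = settings.core          # Spiral(api_url=..., spfs_url=..., authn=...) built on first access
assert core is settings.core  # functools.cached_property memoizes one client per Settings object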
spiral/snapshot.py
ADDED

@@ -0,0 +1,55 @@
+from typing import TYPE_CHECKING
+
+from spiral.core.table import Snapshot as CoreSnapshot
+from spiral.core.table.spec import Schema
+from spiral.types_ import Timestamp
+
+if TYPE_CHECKING:
+    import duckdb
+    import polars as pl
+    import pyarrow.dataset as ds
+
+    from spiral.table import Table
+
+
+class Snapshot:
+    """Spiral table snapshot.
+
+    A snapshot represents a point-in-time view of a table.
+    """
+
+    def __init__(self, table: "Table", core: CoreSnapshot):
+        self.core = core
+        self._table = table
+
+    @property
+    def asof(self) -> Timestamp:
+        """Returns the asof timestamp of the snapshot."""
+        return self.core.asof
+
+    def schema(self) -> Schema:
+        """Returns the schema of the snapshot."""
+        return self.core.table.get_schema(asof=self.asof)
+
+    @property
+    def table(self) -> "Table":
+        """Returns the table associated with the snapshot."""
+        return self._table
+
+    def to_dataset(self) -> "ds.Dataset":
+        """Returns a PyArrow Dataset representing the table."""
+        from spiral.dataset import Dataset
+
+        return Dataset(self)
+
+    def to_polars(self) -> "pl.LazyFrame":
+        """Returns a Polars LazyFrame for the Spiral table."""
+        import polars as pl
+
+        return pl.scan_pyarrow_dataset(self.to_dataset())
+
+    def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
+        """Returns a DuckDB relation for the Spiral table."""
+        import duckdb
+
+        return duckdb.from_arrow(self.to_dataset())
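Note: a hedged sketch of reading through the new Snapshot wrapper. Obtaining a snapshot (e.g. from a Table) is outside this diff, so it is taken as given here; only methods defined above are used.

import polars as pl

def snapshot_row_count(snapshot) -> int:
    """Count rows lazily via the snapshot's PyArrow dataset."""
    lf = snapshot.to_polars()  # LazyFrame over Snapshot.to_dataset()
    return lf.select(pl.len()).collect().item()

# The same dataset also backs DuckDB: snapshot.to_duckdb() returns a DuckDBPyRelation.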
spiral/streaming_/reader.py
ADDED

@@ -0,0 +1,131 @@
+import dataclasses
+import functools
+import os
+from typing import Any
+
+import vortex as vx
+
+from spiral.core.client import Shard
+
+
+# Fake streaming.base.format.base.reader.FileInfo
+# Dataset manages decompression instead of the Stream in MDS.
+# So we return our own fake FileInfo that has None for compressed file.
+@dataclasses.dataclass
+class FileInfo:
+    basename: str
+    hashes: dict[str, str] = dataclasses.field(default_factory=dict)
+
+    @property
+    def bytes(self):
+        raise NotImplementedError("FileInfo.bytes should NOT be called.")
+
+
+class SpiralReader:
+    """
+    An MDS (streaming) compatible Reader.
+    """
+
+    def __init__(self, shard: Shard, basepath):
+        self._shard = shard
+        self._cardinality = shard.cardinality
+        self._basepath = basepath
+        self._scan: vx.RepeatedScan | None = None
+
+    @property
+    def shard(self) -> Shard:
+        return self._shard
+
+    @property
+    def size(self):
+        """Get the number of samples in this shard.
+
+        Returns:
+            int: Sample count.
+        """
+        return self._cardinality
+
+    @property
+    def samples(self):
+        """Get the number of samples in this shard.
+
+        Returns:
+            int: Sample count.
+        """
+        return self._cardinality
+
+    def __len__(self) -> int:
+        """Get the number of samples in this shard.
+
+        Returns:
+            int: Sample count.
+        """
+        return self._cardinality
+
+    @property
+    def file_pairs(self) -> list[tuple[FileInfo, FileInfo | None]]:
+        """Get the infos from raw and compressed file.
+
+        MDS uses this because dataset manages decompression of the shards, not stream...
+        """
+        return [(FileInfo(basename=self.filename), None)]
+
+    def get_max_size(self) -> int:
+        """Get the full size of this shard.
+
+        "Max" in this case means both the raw (decompressed) and zip (compressed) versions are
+        resident (assuming it has a zip form). This is the maximum disk usage the shard can reach.
+        When compressed was used, even if keep_zip is ``False``, the zip form must still be
+        resident at the same time as the raw form during shard decompression.
+
+        Returns:
+            int: Size in bytes.
+        """
+        # TODO(marko): This is used to check cache limit is possible...
+        return 0
+
+    @functools.cached_property
+    def filename(self) -> str:
+        """Used by SpiralStream to identify shard's file-on-disk, if it exists."""
+        # TODO(marko): This might be too long...
+        return (
+            bytes(self._shard.key_range.begin).hex()
+            + "_"
+            + bytes(self._shard.key_range.end).hex()
+            + "_"
+            + str(self._shard.cardinality)
+            + ".vortex"
+        )
+
+    @functools.cached_property
+    def filepath(self) -> str:
+        """Full path to the shard's file-on-disk, if it exists."""
+        return os.path.join(self._basepath, self.filename)
+
+    def evict(self) -> int:
+        """Remove all files belonging to this shard."""
+
+        # Clean up the scan handle first. This will make sure memory is freed.
+        self._scan = None
+
+        # Try to evict file.
+        try:
+            stat = os.stat(self.filepath)
+            os.remove(self.filepath)
+            return stat.st_size
+        except FileNotFoundError:
+            # Nothing to evict.
+            return 0
+
+    def __getitem__(self, item):
+        return self.get_item(item)
+
+    def get_item(self, idx: int) -> dict[str, Any]:
+        if self._scan is None:
+            # TODO(marko): vx.open should throw FileNotFoundError instead of
+            # ValueError: No such file or directory (os error 2)
+            # Check if shard is ready on disk. This must throw FileNotFoundError.
+            if not os.path.exists(self.filepath):
+                raise FileNotFoundError(f"Shard not found: {self.filepath}")
+            self._scan = vx.open(self.filepath, without_segment_cache=True).to_repeated_scan()
+        return self._scan.scalar_at(idx).as_py()