pyspiral 0.6.0__cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 0.6.2__cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyspiral-0.6.0.dist-info → pyspiral-0.6.2.dist-info}/METADATA +1 -1
- {pyspiral-0.6.0.dist-info → pyspiral-0.6.2.dist-info}/RECORD +17 -17
- spiral/__init__.py +2 -1
- spiral/_lib.abi3.so +0 -0
- spiral/cli/key_spaces.py +21 -0
- spiral/cli/tables.py +3 -3
- spiral/core/client/__init__.pyi +22 -5
- spiral/core/table/__init__.pyi +3 -21
- spiral/core/table/manifests/__init__.pyi +1 -1
- spiral/debug/manifests.py +3 -5
- spiral/scan.py +3 -2
- spiral/streaming_/reader.py +14 -0
- spiral/streaming_/stream.py +10 -2
- spiral/table.py +18 -3
- spiral/transaction.py +10 -0
- {pyspiral-0.6.0.dist-info → pyspiral-0.6.2.dist-info}/WHEEL +0 -0
- {pyspiral-0.6.0.dist-info → pyspiral-0.6.2.dist-info}/entry_points.txt +0 -0
@@ -1,8 +1,8 @@
|
|
1
|
-
pyspiral-0.6.
|
2
|
-
pyspiral-0.6.
|
3
|
-
pyspiral-0.6.
|
4
|
-
spiral/__init__.py,sha256=
|
5
|
-
spiral/_lib.abi3.so,sha256=
|
1
|
+
pyspiral-0.6.2.dist-info/METADATA,sha256=n0a5SuYMybj-eUwB9No9IFxSk9Cn_pDZfjpx0HuDxRw,1836
|
2
|
+
pyspiral-0.6.2.dist-info/WHEEL,sha256=PxcKzGLVtZeSnGJDErQ-Emkn2AvBXbmzIogfnaf7-q8,130
|
3
|
+
pyspiral-0.6.2.dist-info/entry_points.txt,sha256=uft7u-a6g40NLt4Q6BleWbK4NY0M8nZuYPpP8DV0EOk,45
|
4
|
+
spiral/__init__.py,sha256=5c0faqg-kHZBDwriQ7LzLAMcFolIucp-IA1EzNvCZ3k,711
|
5
|
+
spiral/_lib.abi3.so,sha256=oOsM8f904rj2leVKR8cCOkLQ7uxcf36tKVIN0bCVOok,55184120
|
6
6
|
spiral/adbc.py,sha256=7IxfWIeQN-fh0W5OdN_PP2x3pzQYg6ZUOLsHg3jktqw,14842
|
7
7
|
spiral/api/__init__.py,sha256=ULBlVq3PnfNOO6T5naE_ULmmii-83--qTuN2PpAUQN0,2241
|
8
8
|
spiral/api/admin.py,sha256=A1iVR1XYJSObZivPAD5UzmPuMgupXc9kaHNYYa_kwfs,585
|
@@ -24,13 +24,13 @@ spiral/cli/app.py,sha256=HWCjMJLzSz_JaiLF046jzC9A4-yvzS6506D3cOR2Vgc,1773
|
|
24
24
|
spiral/cli/console.py,sha256=6JHbAQV6MFWz3P-VzqPOjhHpkIQagsCdzTMvmuDKMkU,2580
|
25
25
|
spiral/cli/fs.py,sha256=UREIJhjr6MfIdcKK7pjUKICd0wsQULhQiWRVWUnQ0dc,4376
|
26
26
|
spiral/cli/iceberg.py,sha256=Q14tcGcn1LixbFCYP0GhfYwFFXTmmi8tqBPYwalJEyE,3248
|
27
|
-
spiral/cli/key_spaces.py,sha256=
|
27
|
+
spiral/cli/key_spaces.py,sha256=x3IFRP5d47pKiAHeWExYMOBaT2TwxbWjVM01SUqKrwI,2943
|
28
28
|
spiral/cli/login.py,sha256=TgTr37ImgG1NKN8VbtqkxVAYaZFpMXMwPAb23gVldEw,649
|
29
29
|
spiral/cli/orgs.py,sha256=fmOuLxpeIFfKqePRi292Gv9k-EF5pPn_tbKd2BLl2Ig,2869
|
30
30
|
spiral/cli/printer.py,sha256=HcvSUpaMItzmhBUfIHROK1Z3SL8J8wDopS3Qo8H00uw,1781
|
31
31
|
spiral/cli/projects.py,sha256=UYrBlLcFacuXExdLX1sZByfvkz9MRtk_0oRAZvqHa0w,5105
|
32
32
|
spiral/cli/state.py,sha256=10wTIVQ0SJkY67Z6-KQ1LFlt3aVIPmZhoHFdTwp4kNA,130
|
33
|
-
spiral/cli/tables.py,sha256=
|
33
|
+
spiral/cli/tables.py,sha256=48lZ0wPQSCTul1vn-Qx6Be5eGnw75Abtw2zxMK9dCPg,4613
|
34
34
|
spiral/cli/telemetry.py,sha256=Uxo1Q1FkKJ6n6QNGOUmL3j_pRRWRx0qWIhoP-U9BuR0,589
|
35
35
|
spiral/cli/text.py,sha256=DlWGe4JrkdERAiqyITNpk91Wqb63Re99rNYlIFsIamc,4031
|
36
36
|
spiral/cli/types.py,sha256=XYzo1GgX7dBBItoBSrHI4vO5C2lLmS2sktb-2GnGH3E,1362
|
@@ -38,15 +38,15 @@ spiral/cli/workloads.py,sha256=2_SLfQTFN6y73R9H0i9dk8VIOVagKxSxOpHXC56yptY,2015
|
|
38
38
|
spiral/client.py,sha256=Po9xgCH3NwVsCeRZMm3eJUPV77Rknyj-9MfCS1TbdTg,6623
|
39
39
|
spiral/core/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
40
40
|
spiral/core/authn/__init__.pyi,sha256=Jw_8ywTMDTwgAtGxMtFED63rU0jOgrv-eZtaZ5sR5t4,757
|
41
|
-
spiral/core/client/__init__.pyi,sha256=
|
42
|
-
spiral/core/table/__init__.pyi,sha256=
|
43
|
-
spiral/core/table/manifests/__init__.pyi,sha256=
|
41
|
+
spiral/core/client/__init__.pyi,sha256=iEhZgbySG5LScfrtkiiHW1iHghgehsrVmPP-v5Pv_vk,5740
|
42
|
+
spiral/core/table/__init__.pyi,sha256=sjjShdgM_Uh8Roou1k02MnrqYpdAX4QuyRlIRlnyp1M,3073
|
43
|
+
spiral/core/table/manifests/__init__.pyi,sha256=eVfDpmhYSjafIvvALqAkZe5baN3Y1HpKpxYEbjwd4gQ,1043
|
44
44
|
spiral/core/table/metastore/__init__.pyi,sha256=rc3u9MwEKRvL2kxOc8lBorddFRnM8o_o1frqtae86a4,1697
|
45
45
|
spiral/core/table/spec/__init__.pyi,sha256=0NyGeyEhV_ebwKWVU3sqSvdF2D9v8kEVwo6wYAHF99M,5579
|
46
46
|
spiral/dataset.py,sha256=NNqG-oOrhbmNC2OMZ9AYAm4YkwwBozeRI6zXtz4cspA,8008
|
47
47
|
spiral/datetime_.py,sha256=1TA1RYIRU22qcUuipIjVhAtGnPDVn2z9WttuhkmfkwY,964
|
48
48
|
spiral/debug/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
49
|
-
spiral/debug/manifests.py,sha256=
|
49
|
+
spiral/debug/manifests.py,sha256=oaPB4534pQdqvPXCZetVNSvvhpdXTrv_1pN-_bAkeAo,2893
|
50
50
|
spiral/debug/metrics.py,sha256=XdRDcjggtsLNGCAjam6IxG9072pz_d2C8iLApNRFUtk,2044
|
51
51
|
spiral/debug/scan.py,sha256=9bMmVQFs5M6Rldm0fmrmmvn9LbSSTKBV5tIu37mEn78,8938
|
52
52
|
spiral/expressions/__init__.py,sha256=T8PIb0_UB9kynK0dpWbUD4No5lKRTG-wKnao8xOcXjY,6381
|
@@ -84,16 +84,16 @@ spiral/protogen/_/substrait/extensions/__init__.py,sha256=sCMvwWCXWu2cSGiTEH0hRj
|
|
84
84
|
spiral/protogen/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
85
85
|
spiral/protogen/util.py,sha256=smnvVo6nYH3FfDm9jqhNLaXz4bbTBaQezHQDCTvZyiQ,1486
|
86
86
|
spiral/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
87
|
-
spiral/scan.py,sha256=
|
87
|
+
spiral/scan.py,sha256=20-NSGsoXYf6uKQ7yEdbbwT8ijIK7KxKTctycsl0AIk,7073
|
88
88
|
spiral/server.py,sha256=ztBmB5lBnUz-smQxR_tC8AI5SOhz17wH0MI3GuzDUdM,600
|
89
89
|
spiral/settings.py,sha256=Nap68xM-1ZvF3yDhkyRnNDIAVMIgxmIksglg_1iT0-0,3069
|
90
90
|
spiral/snapshot.py,sha256=_l2wrqUXz2RARjIDxOWw4aQpegJohvggIoWuCllzStA,1506
|
91
91
|
spiral/streaming_/__init__.py,sha256=s7MlW2ERsuZmZGExLFL6RcZon2e0tNBocBg5ANgki7k,61
|
92
|
-
spiral/streaming_/reader.py,sha256=
|
93
|
-
spiral/streaming_/stream.py,sha256=
|
92
|
+
spiral/streaming_/reader.py,sha256=Kpqknv2jn12jUhHOEEDArj0JZwrWb8XjoOGs9HrdVyA,4047
|
93
|
+
spiral/streaming_/stream.py,sha256=nxJEisPfZ2-Ebkm83hz_3v8NH27FxBku-1jw7UDlQuM,5881
|
94
94
|
spiral/substrait_.py,sha256=AKeOD4KIXvz2J4TYxnIneOiHddtBIyOhuNxVO_uH0eg,12592
|
95
|
-
spiral/table.py,sha256=
|
95
|
+
spiral/table.py,sha256=ZQFq5tuovDjQcpi38b5FUMuHNGI5XV0MnZbC6vbza1o,10312
|
96
96
|
spiral/text_index.py,sha256=FQ9rgIEGLSJryS9lFdMhKtPFey18BXoWbPXyvZPJJ04,442
|
97
|
-
spiral/transaction.py,sha256=
|
97
|
+
spiral/transaction.py,sha256=nSykH4UGs9hGtWuSWK9YyT9jfEuvzfkKoUgMM5Xt4zU,1841
|
98
98
|
spiral/types_.py,sha256=W_jyO7F6rpPiH69jhgSgV7OxQZbOlb1Ho3InpKUP6Eo,155
|
99
|
-
pyspiral-0.6.
|
99
|
+
pyspiral-0.6.2.dist-info/RECORD,,
|
spiral/__init__.py
CHANGED
@@ -7,9 +7,10 @@ from spiral import _lib
|
|
7
7
|
assert _lib, "Spiral library"
|
8
8
|
|
9
9
|
from spiral.client import Spiral # noqa: E402
|
10
|
+
from spiral.core.client import ShuffleStrategy # noqa: E402
|
10
11
|
from spiral.key_space_index import KeySpaceIndex # noqa: E402
|
11
12
|
from spiral.project import Project # noqa: E402
|
12
|
-
from spiral.scan import Scan
|
13
|
+
from spiral.scan import Scan # noqa: E402
|
13
14
|
from spiral.snapshot import Snapshot # noqa: E402
|
14
15
|
from spiral.table import Table # noqa: E402
|
15
16
|
from spiral.text_index import TextIndex # noqa: E402
|
spiral/_lib.abi3.so
CHANGED
Binary file
|
spiral/cli/key_spaces.py
CHANGED
@@ -56,6 +56,27 @@ def ls(
|
|
56
56
|
CONSOLE.print(rich_table)
|
57
57
|
|
58
58
|
|
59
|
+
@app.command(help="Show index partitions.")
|
60
|
+
def show(
|
61
|
+
project: ProjectArg,
|
62
|
+
name: Annotated[str | None, Option(help="Index name.")] = None,
|
63
|
+
):
|
64
|
+
"""Show index partitions."""
|
65
|
+
index_id = get_index_id(project, name)
|
66
|
+
index = state.spiral.key_space_index(index_id)
|
67
|
+
shards = state.spiral._ops().compute_shards(index.core)
|
68
|
+
|
69
|
+
rich_table = rich.table.Table("Begin", "End", "Cardinality", title=f"Index {index.name} Partitions")
|
70
|
+
for partition in shards:
|
71
|
+
rich_table.add_row(
|
72
|
+
# TODO(marko): This isn't really pretty...
|
73
|
+
repr(partition.key_range.begin),
|
74
|
+
repr(partition.key_range.end),
|
75
|
+
str(partition.cardinality),
|
76
|
+
)
|
77
|
+
CONSOLE.print(rich_table)
|
78
|
+
|
79
|
+
|
59
80
|
@app.command(help="Trigger a sync job for an index.")
|
60
81
|
def sync(
|
61
82
|
project: ProjectArg,
|
spiral/cli/tables.py
CHANGED
@@ -111,12 +111,12 @@ def manifests(
|
|
111
111
|
_, t = get_table(project, table, dataset)
|
112
112
|
s = t.snapshot()
|
113
113
|
|
114
|
-
key_space_state = state.spiral._ops().key_space_state(
|
114
|
+
key_space_state = state.spiral._ops().key_space_state(s.core) # pyright: ignore[reportPrivateUsage]
|
115
115
|
key_space_manifest = key_space_state.manifest
|
116
116
|
|
117
|
-
column_groups_states = state.spiral._ops().column_groups_states(
|
117
|
+
column_groups_states = state.spiral._ops().column_groups_states(s.core, key_space_state) # pyright: ignore[reportPrivateUsage]
|
118
118
|
|
119
|
-
display_manifests(key_space_manifest, (
|
119
|
+
display_manifests(key_space_manifest, [(x.column_group, x.manifest) for x in column_groups_states])
|
120
120
|
|
121
121
|
|
122
122
|
@app.command(help="Display the manifests which would be read by a scan of the given column group.")
|
spiral/core/client/__init__.pyi
CHANGED
@@ -124,6 +124,25 @@ class Shard:
|
|
124
124
|
key_range: KeyRange
|
125
125
|
cardinality: int
|
126
126
|
|
127
|
+
def __init__(self, key_range: KeyRange, cardinality: int): ...
|
128
|
+
|
129
|
+
class ShuffleStrategy:
|
130
|
+
# Results are buffered in a pool of `shuffle_buffer_size` rows and shuffled again.
|
131
|
+
shuffle_buffer_size: int
|
132
|
+
|
133
|
+
# All randomness is derived from this seed. If None, a random seed is generated from the OS.
|
134
|
+
seed: int | None
|
135
|
+
|
136
|
+
# Externally provided shards to shuffle before reading rows.
|
137
|
+
shards: list[Shard] | None
|
138
|
+
|
139
|
+
def __init__(
|
140
|
+
self,
|
141
|
+
seed: int | None = None,
|
142
|
+
shuffle_buffer_size: int | None = None,
|
143
|
+
shards: list[Shard] | None = None,
|
144
|
+
): ...
|
145
|
+
|
127
146
|
class Operations:
|
128
147
|
def flush_wal(self, table: Table) -> None:
|
129
148
|
"""
|
@@ -163,21 +182,19 @@ class Operations:
|
|
163
182
|
Index table changes up to the given snapshot.
|
164
183
|
"""
|
165
184
|
...
|
166
|
-
def key_space_state(self,
|
185
|
+
def key_space_state(self, snapshot: Snapshot) -> KeySpaceState:
|
167
186
|
"""
|
168
187
|
The key space state for the table.
|
169
188
|
"""
|
170
189
|
...
|
171
190
|
def column_group_state(
|
172
|
-
self,
|
191
|
+
self, snapshot: Snapshot, key_space_state: KeySpaceState, column_group: ColumnGroup
|
173
192
|
) -> ColumnGroupState:
|
174
193
|
"""
|
175
194
|
The state of the column group of the table.
|
176
195
|
"""
|
177
196
|
...
|
178
|
-
def column_groups_states(
|
179
|
-
self, table: Table, key_space_state: KeySpaceState, *, asof: int | None = None
|
180
|
-
) -> list[ColumnGroupState]:
|
197
|
+
def column_groups_states(self, snapshot: Snapshot, key_space_state: KeySpaceState) -> list[ColumnGroupState]:
|
181
198
|
"""
|
182
199
|
The state of each column group of the table.
|
183
200
|
"""
|
spiral/core/table/__init__.pyi
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
from typing import Any
|
2
2
|
|
3
3
|
import pyarrow as pa
|
4
|
+
from spiral.core.client import ShuffleStrategy
|
4
5
|
from spiral.expressions import Expr
|
5
6
|
|
6
7
|
from .manifests import FragmentManifest
|
@@ -76,6 +77,7 @@ class Scan:
|
|
76
77
|
output_path: str,
|
77
78
|
key_range: KeyRange,
|
78
79
|
expected_cardinality: int | None = None,
|
80
|
+
shard_row_block_size: int = 8192,
|
79
81
|
) -> None: ...
|
80
82
|
|
81
83
|
class KeySpaceState:
|
@@ -93,27 +95,7 @@ class Transaction:
|
|
93
95
|
status: str
|
94
96
|
|
95
97
|
def write(self, expr: Expr, *, partition_size_bytes: int | None = None): ...
|
98
|
+
def drop_columns(self, column_paths: list[str]): ...
|
96
99
|
def commit(self): ...
|
97
100
|
def abort(self): ...
|
98
101
|
def metrics(self) -> dict[str, Any]: ...
|
99
|
-
|
100
|
-
class ShuffleStrategy:
|
101
|
-
# Results are buffered in a pool of `buffer_size` rows and shuffled again.
|
102
|
-
shuffle_buffer_size: int
|
103
|
-
|
104
|
-
# All randomness is derived from this seed. If None, a random seed is generated from the OS.
|
105
|
-
seed: int | None
|
106
|
-
|
107
|
-
# `approximate_batch_size` controls the maximum approximate size of each shard. Shards that
|
108
|
-
# are larger than this size are further split assuming uniform distribution of keys. Note
|
109
|
-
# that this is a best-effort and can be widely off. The purpose of this is to improve
|
110
|
-
# shuffling, rather than to support sharding. If not present, splits derived from the table
|
111
|
-
# are used in the attempt to minimize wasted reads.
|
112
|
-
approximate_buffer_size: int | None
|
113
|
-
|
114
|
-
def __init__(
|
115
|
-
self,
|
116
|
-
seed: int | None = None,
|
117
|
-
shard_size: int | None = None,
|
118
|
-
buffer_size: int | None = None,
|
119
|
-
): ...
|
@@ -8,7 +8,7 @@ class FragmentManifest:
|
|
8
8
|
def __getitem__(self, idx: int): ...
|
9
9
|
def to_arrow(self) -> pa.RecordBatchReader: ...
|
10
10
|
@staticmethod
|
11
|
-
def compute_schema(
|
11
|
+
def compute_schema() -> pa.Schema: ...
|
12
12
|
@staticmethod
|
13
13
|
def from_fragment(fragment_file: FragmentFile) -> FragmentManifest: ...
|
14
14
|
@staticmethod
|
spiral/debug/manifests.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
from collections.abc import Iterable
|
2
|
-
|
3
1
|
from spiral import datetime_
|
4
2
|
from spiral.core.table import Scan
|
5
3
|
from spiral.core.table.manifests import FragmentManifest
|
@@ -13,15 +11,15 @@ def display_scan_manifests(scan: Scan):
|
|
13
11
|
raise NotImplementedError("Multiple table scans are not supported.")
|
14
12
|
table_id = scan.table_ids()[0]
|
15
13
|
key_space_manifest = scan.key_space_state(table_id).manifest
|
16
|
-
column_group_manifests =
|
14
|
+
column_group_manifests = [
|
17
15
|
(column_group, scan.column_group_state(column_group).manifest) for column_group in scan.column_groups()
|
18
|
-
|
16
|
+
]
|
19
17
|
|
20
18
|
display_manifests(key_space_manifest, column_group_manifests)
|
21
19
|
|
22
20
|
|
23
21
|
def display_manifests(
|
24
|
-
key_space_manifest: FragmentManifest, column_group_manifests:
|
22
|
+
key_space_manifest: FragmentManifest, column_group_manifests: list[tuple[ColumnGroup, FragmentManifest]]
|
25
23
|
):
|
26
24
|
_table_of_fragments(
|
27
25
|
key_space_manifest,
|
spiral/scan.py
CHANGED
@@ -3,7 +3,8 @@ from typing import TYPE_CHECKING, Any
|
|
3
3
|
|
4
4
|
import pyarrow as pa
|
5
5
|
|
6
|
-
from spiral.core.
|
6
|
+
from spiral.core.client import ShuffleStrategy
|
7
|
+
from spiral.core.table import KeyRange
|
7
8
|
from spiral.core.table import Scan as CoreScan
|
8
9
|
from spiral.core.table.spec import Schema
|
9
10
|
from spiral.settings import CI, DEV
|
@@ -177,7 +178,7 @@ class Scan:
|
|
177
178
|
# Print manifests in a human-readable format.
|
178
179
|
from spiral.debug.manifests import display_scan_manifests
|
179
180
|
|
180
|
-
display_scan_manifests(self.
|
181
|
+
display_scan_manifests(self.core)
|
181
182
|
|
182
183
|
def _dump_metrics(self):
|
183
184
|
# Print metrics in a human-readable format.
|
spiral/streaming_/reader.py
CHANGED
@@ -70,6 +70,20 @@ class SpiralReader:
|
|
70
70
|
"""
|
71
71
|
return [(FileInfo(basename=self.filename), None)]
|
72
72
|
|
73
|
+
def get_max_size(self) -> int:
|
74
|
+
"""Get the full size of this shard.
|
75
|
+
|
76
|
+
"Max" in this case means both the raw (decompressed) and zip (compressed) versions are
|
77
|
+
resident (assuming it has a zip form). This is the maximum disk usage the shard can reach.
|
78
|
+
When compression was used, even if keep_zip is ``False``, the zip form must still be
|
79
|
+
resident at the same time as the raw form during shard decompression.
|
80
|
+
|
81
|
+
Returns:
|
82
|
+
int: Size in bytes.
|
83
|
+
"""
|
84
|
+
# TODO(marko): This is used to check cache limit is possible...
|
85
|
+
return 0
|
86
|
+
|
73
87
|
@functools.cached_property
|
74
88
|
def filename(self) -> str:
|
75
89
|
"""Used by SpiralStream to identify shard's file-on-disk, if it exists."""
|
spiral/streaming_/stream.py
CHANGED
@@ -24,10 +24,13 @@ class SpiralStream:
|
|
24
24
|
Stream can be passed to MDS's StreamingDataset in `streams` argument.
|
25
25
|
"""
|
26
26
|
|
27
|
-
def __init__(
|
27
|
+
def __init__(
|
28
|
+
self, scan: CoreScan, shards: list[Shard], cache_dir: str | None = None, shard_row_block_size: int = 8192
|
29
|
+
):
|
28
30
|
self._scan = scan
|
29
31
|
# TODO(marko): Read shards only on world.is_local_leader in `get_shards` and materialize on disk.
|
30
32
|
self._shards = shards
|
33
|
+
self.shard_row_block_size = shard_row_block_size
|
31
34
|
|
32
35
|
if cache_dir is not None:
|
33
36
|
if not os.path.exists(cache_dir):
|
@@ -92,7 +95,12 @@ class SpiralStream:
|
|
92
95
|
return 0
|
93
96
|
|
94
97
|
# This method exists but it's hidden.
|
95
|
-
self._scan._prepare_shard(
|
98
|
+
self._scan._prepare_shard(
|
99
|
+
shard_path,
|
100
|
+
shard.shard.key_range,
|
101
|
+
expected_cardinality=shard.shard.cardinality,
|
102
|
+
shard_row_block_size=self.shard_row_block_size,
|
103
|
+
)
|
96
104
|
|
97
105
|
# Get the size of the file on disk.
|
98
106
|
stat = os.stat(shard_path)
|
spiral/table.py
CHANGED
@@ -109,6 +109,17 @@ class Table(Expr):
|
|
109
109
|
partition_size_bytes=partition_size_bytes,
|
110
110
|
)
|
111
111
|
|
112
|
+
def drop_columns(self, column_paths: list[str]) -> None:
|
113
|
+
"""
|
114
|
+
Drops the specified columns from the table.
|
115
|
+
|
116
|
+
|
117
|
+
:param column_paths: Fully qualified column names. (e.g., "column_name" or "nested.field").
|
118
|
+
All columns must exist; if a column doesn't exist the function will return an error.
|
119
|
+
"""
|
120
|
+
with self.txn() as txn:
|
121
|
+
txn.drop_columns(column_paths)
|
122
|
+
|
112
123
|
def snapshot(self, asof: datetime | int | None = None) -> Snapshot:
|
113
124
|
"""Returns a snapshot of the table at the given timestamp."""
|
114
125
|
if isinstance(asof, datetime):
|
@@ -183,7 +194,9 @@ class Table(Expr):
|
|
183
194
|
*,
|
184
195
|
index: "KeySpaceIndex",
|
185
196
|
batch_size: int | None = None,
|
197
|
+
cache_dir: str | None = None,
|
186
198
|
cache_limit: int | str | None = None,
|
199
|
+
predownload: int | None = None,
|
187
200
|
sampling_method: str = "balanced",
|
188
201
|
sampling_granularity: int = 1,
|
189
202
|
partition_algo: str = "relaxed",
|
@@ -208,12 +221,13 @@ class Table(Expr):
|
|
208
221
|
"""
|
209
222
|
from streaming import StreamingDataset
|
210
223
|
|
211
|
-
stream = self.to_streaming(index=index)
|
224
|
+
stream = self.to_streaming(index=index, cache_dir=cache_dir)
|
212
225
|
|
213
226
|
return StreamingDataset(
|
214
227
|
streams=[stream],
|
215
228
|
batch_size=batch_size,
|
216
229
|
cache_limit=cache_limit,
|
230
|
+
predownload=predownload,
|
217
231
|
sampling_method=sampling_method,
|
218
232
|
sampling_granularity=sampling_granularity,
|
219
233
|
partition_algo=partition_algo,
|
@@ -226,13 +240,14 @@ class Table(Expr):
|
|
226
240
|
replication=replication,
|
227
241
|
)
|
228
242
|
|
229
|
-
def to_streaming(self, index: "KeySpaceIndex") -> "streaming.Stream":
|
243
|
+
def to_streaming(self, index: "KeySpaceIndex", *, cache_dir: str | None = None) -> "streaming.Stream":
|
230
244
|
"""Returns a stream to be used with MosaicML's StreamingDataset.
|
231
245
|
|
232
246
|
Requires `streaming` package to be installed.
|
233
247
|
|
234
248
|
Args:
|
235
249
|
index: Prebuilt KeysIndex to use when creating the stream. The index's `asof` will be used when scanning.
|
250
|
+
cache_dir: Directory to use for caching data. If None, a temporary directory will be used.
|
236
251
|
"""
|
237
252
|
from spiral.streaming_ import SpiralStream
|
238
253
|
|
@@ -254,4 +269,4 @@ class Table(Expr):
|
|
254
269
|
# We have a world there and can compute shards only on leader.
|
255
270
|
shards = self.spiral._core._ops().compute_shards(index=index.core)
|
256
271
|
|
257
|
-
return SpiralStream(scan=scan.core, shards=shards) # type: ignore[return-value]
|
272
|
+
return SpiralStream(scan=scan.core, shards=shards, cache_dir=cache_dir) # type: ignore[return-value]
|
spiral/transaction.py
CHANGED
@@ -39,6 +39,16 @@ class Transaction:
|
|
39
39
|
|
40
40
|
self._core.write(expr.__expr__, partition_size_bytes=partition_size_bytes)
|
41
41
|
|
42
|
+
def drop_columns(self, column_paths: list[str]):
|
43
|
+
"""
|
44
|
+
Drops the specified columns from the table.
|
45
|
+
|
46
|
+
|
47
|
+
:param column_paths: Fully qualified column names. (e.g., "column_name" or "nested.field").
|
48
|
+
All columns must exist; if a column doesn't exist the function will return an error.
|
49
|
+
"""
|
50
|
+
self._core.drop_columns(column_paths)
|
51
|
+
|
42
52
|
def commit(self):
|
43
53
|
"""Commit the transaction."""
|
44
54
|
self._core.commit()
|
File without changes
|
File without changes
|