pyspiral 0.6.12-cp312-abi3-manylinux_2_28_x86_64.whl → 0.6.14-cp312-abi3-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pyspiral has been flagged for review.
- {pyspiral-0.6.12.dist-info → pyspiral-0.6.14.dist-info}/METADATA +9 -8
- {pyspiral-0.6.12.dist-info → pyspiral-0.6.14.dist-info}/RECORD +21 -19
- spiral/__init__.py +7 -0
- spiral/_lib.abi3.so +0 -0
- spiral/cli/iceberg.py +1 -1
- spiral/cli/key_spaces.py +14 -0
- spiral/client.py +2 -9
- spiral/core/table/__init__.pyi +10 -1
- spiral/core/table/spec/__init__.pyi +3 -1
- spiral/dataloader.py +6 -1
- spiral/enrichment.py +153 -0
- spiral/expressions/__init__.py +3 -0
- spiral/expressions/http.py +16 -0
- spiral/expressions/s3.py +2 -5
- spiral/expressions/udf.py +6 -1
- spiral/scan.py +71 -56
- spiral/settings.py +9 -6
- spiral/table.py +32 -20
- spiral/transaction.py +18 -3
- {pyspiral-0.6.12.dist-info → pyspiral-0.6.14.dist-info}/WHEEL +0 -0
- {pyspiral-0.6.12.dist-info → pyspiral-0.6.14.dist-info}/entry_points.txt +0 -0
{pyspiral-0.6.12.dist-info → pyspiral-0.6.14.dist-info}/METADATA CHANGED

@@ -1,13 +1,11 @@
 Metadata-Version: 2.4
 Name: pyspiral
-Version: 0.6.12
+Version: 0.6.14
 Classifier: Intended Audience :: Science/Research
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Rust

@@ -31,20 +29,23 @@ Requires-Dist: typer>=0.16
 Requires-Dist: xxhash>=3.4.1
 Requires-Dist: polars>=1.31.0 ; extra == 'polars'
 Requires-Dist: duckdb>=1.3.2 ; extra == 'duckdb'
-Requires-Dist: …
-Requires-Dist: …
+Requires-Dist: pyiceberg[s3fs]>=0.9.1 ; extra == 'iceberg'
+Requires-Dist: datasets>=4.0.0 ; extra == 'huggingface'
 Requires-Dist: mosaicml-streaming>=0.13.0 ; extra == 'streaming'
 Requires-Dist: vortex-data>=0.52.1 ; extra == 'streaming'
+Requires-Dist: dask>=2025.10.0 ; extra == 'dask'
+Requires-Dist: distributed>=2025.10.0 ; extra == 'dask'
 Provides-Extra: polars
 Provides-Extra: duckdb
-Provides-Extra: …
-Provides-Extra: …
+Provides-Extra: iceberg
+Provides-Extra: huggingface
 Provides-Extra: streaming
+Provides-Extra: dask
 Summary: Python client for Spiral.
 Home-Page: https://spiraldb.com
 Author-email: SpiralDB <hello@spiraldb.com>
 License: Proprietary License
-Requires-Python: >=3.10
+Requires-Python: >=3.12
 Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
 
 # PySpiral
{pyspiral-0.6.12.dist-info → pyspiral-0.6.14.dist-info}/RECORD CHANGED

@@ -1,8 +1,8 @@
-pyspiral-0.6.12.dist-info/METADATA,sha256=…
-pyspiral-0.6.12.dist-info/WHEEL,sha256=…
-pyspiral-0.6.12.dist-info/entry_points.txt,sha256=…
-spiral/__init__.py,sha256=…
-spiral/_lib.abi3.so,sha256=…
+pyspiral-0.6.14.dist-info/METADATA,sha256=WGdXph89n9RYh7SmWR_6GH9vAn-5suV4sAQAf1E8QCM,1875
+pyspiral-0.6.14.dist-info/WHEEL,sha256=ydlpo1_yEJ2g1Axq3LoOd_OfioJa2swc2j5IDCa4uho,107
+pyspiral-0.6.14.dist-info/entry_points.txt,sha256=uft7u-a6g40NLt4Q6BleWbK4NY0M8nZuYPpP8DV0EOk,45
+spiral/__init__.py,sha256=gAysTwG_oEeKVMdCOfOzDhl0bM2miiK8Ds2vvUihBWw,1153
+spiral/_lib.abi3.so,sha256=RY0P0UG_ejwYXZQcr3pkECfKKzRfNBnfHDBIcve6lrk,67382808
 spiral/adbc.py,sha256=7IxfWIeQN-fh0W5OdN_PP2x3pzQYg6ZUOLsHg3jktqw,14842
 spiral/api/__init__.py,sha256=ULBlVq3PnfNOO6T5naE_ULmmii-83--qTuN2PpAUQN0,2241
 spiral/api/admin.py,sha256=A1iVR1XYJSObZivPAD5UzmPuMgupXc9kaHNYYa_kwfs,585

@@ -23,8 +23,8 @@ spiral/cli/admin.py,sha256=-ubYqs8nKjnQStbQ68jpWx_9xh0TsaxI0wM1Hfko8_U,319
 spiral/cli/app.py,sha256=smzGj5a2RwhM9RQChmlEeKZLN4Fk60-bP7Lm5_Is1Rw,2760
 spiral/cli/console.py,sha256=6JHbAQV6MFWz3P-VzqPOjhHpkIQagsCdzTMvmuDKMkU,2580
 spiral/cli/fs.py,sha256=vaPcSc2YghhHeipxNitIdsHaBhFwlwkvPFqYsFSN9P0,2927
-spiral/cli/iceberg.py,sha256=…
-spiral/cli/key_spaces.py,sha256=…
+spiral/cli/iceberg.py,sha256=wdMyl0j821MLnXNZ6Kwm65ogh98C-pjMJm3Y6YqlnTI,3249
+spiral/cli/key_spaces.py,sha256=Xaw7WH-Qw_j6AxisdIoKfjAgVRXLM9qBFzuCTjPAFLI,3516
 spiral/cli/login.py,sha256=2tw6uN5rEpiMMAmjQSB3-JUPf3C0Wc1eTGCDxhYtJps,731
 spiral/cli/orgs.py,sha256=fmOuLxpeIFfKqePRi292Gv9k-EF5pPn_tbKd2BLl2Ig,2869
 spiral/cli/printer.py,sha256=aosc763hDFgoXJGkiANmNyO3kAsecAS1JWgjEhn8GCM,1784

@@ -35,7 +35,7 @@ spiral/cli/telemetry.py,sha256=Uxo1Q1FkKJ6n6QNGOUmL3j_pRRWRx0qWIhoP-U9BuR0,589
 spiral/cli/text.py,sha256=DlWGe4JrkdERAiqyITNpk91Wqb63Re99rNYlIFsIamc,4031
 spiral/cli/types.py,sha256=XYzo1GgX7dBBItoBSrHI4vO5C2lLmS2sktb-2GnGH3E,1362
 spiral/cli/workloads.py,sha256=2_SLfQTFN6y73R9H0i9dk8VIOVagKxSxOpHXC56yptY,2015
-spiral/client.py,sha256=…
+spiral/client.py,sha256=zMp-xXGL4R1Py_rYrC5o3jFLam1oA74azi50dvMP-_o,6329
 spiral/core/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 spiral/core/_tools/__init__.pyi,sha256=b2KLfTOQ67pjfbYt07o0IGiTu5o2bZw69lllV8v0Dps,143
 spiral/core/authn/__init__.pyi,sha256=z_GWyIS62fuiYQrYO8hzw4W8oGaiciqS1u5qtAt54VY,769

@@ -49,26 +49,28 @@ spiral/core/expr/struct_/__init__.pyi,sha256=MXckd98eV_x3X0RhEWvlkA3DcDXRtLs5pNn
 spiral/core/expr/text/__init__.pyi,sha256=ed83n1xcsGY7_QDhMmJGnSQ20UrJFXcdv1AveSEcS1c,175
 spiral/core/expr/udf/__init__.pyi,sha256=zsZs081KVhY3-1JidqTkWMW81Qd_ScoTGZvasIhIK-4,358
 spiral/core/expr/video/__init__.pyi,sha256=nQJEcSsigZuRpMjkI_O4EEtMK_n2zRvorcL_KEeD5vU,95
-spiral/core/table/__init__.pyi,sha256=…
+spiral/core/table/__init__.pyi,sha256=YBL12_JPTWz2mNbqlDqbT1exxVJYzwfXdHCi6Z37JxA,3841
 spiral/core/table/manifests/__init__.pyi,sha256=eVfDpmhYSjafIvvALqAkZe5baN3Y1HpKpxYEbjwd4gQ,1043
 spiral/core/table/metastore/__init__.pyi,sha256=rc3u9MwEKRvL2kxOc8lBorddFRnM8o_o1frqtae86a4,1697
-spiral/core/table/spec/__init__.pyi,sha256=…
-spiral/dataloader.py,sha256=…
+spiral/core/table/spec/__init__.pyi,sha256=twzX4vFmgBxInZWq_nyP6DR9OQjjOVrbZMn97kndeS8,5808
+spiral/dataloader.py,sha256=W9siY4BF4p_rwTTSS4KgsaQsPLxxza6XmQhrdBzzMJ8,10592
 spiral/dataset.py,sha256=PMLoXnXuEUciP6-NXqTmQLXu0UIH7OcC4-iZtY_iuO8,7973
 spiral/datetime_.py,sha256=elXaUWtZuuLVcu9E0aXnvYRPB9XWqZbLDToozQYQYjU,950
 spiral/debug/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 spiral/debug/manifests.py,sha256=7f1O3ba9mrA5nXpOF9cEIQuUAteP5wiBkFy_diQJ7No,3216
 spiral/debug/metrics.py,sha256=XdRDcjggtsLNGCAjam6IxG9072pz_d2C8iLApNRFUtk,2044
 spiral/debug/scan.py,sha256=UEm_aRnql5pwDPTpZgakMLNjlzkKL4RurBFFqH_BLAQ,9526
-spiral/expressions/__init__.py,sha256=…
+spiral/enrichment.py,sha256=e2yzNWTTG73uEkLTc4ccTNRQ94cBtM04eGzlJ2-kBOI,5851
+spiral/expressions/__init__.py,sha256=Fp7Xx3exh9KJad92tgd_TGGIpYLQTHqWjW-pexzQibU,7981
 spiral/expressions/base.py,sha256=PvhJkcUSsPSIaxirHVzM9zlqyBXiaiia1HXohXdOmL4,5377
+spiral/expressions/http.py,sha256=WfHVLqz_LjBr78mN3ARBRQqgBrkao7-S73JxjC4Xwvo,356
 spiral/expressions/list_.py,sha256=MMt5lf5H1M3O-x6N_PvqOLGq9NOk6Ukv0fPWwPC_uy4,1809
-spiral/expressions/s3.py,sha256=…
+spiral/expressions/s3.py,sha256=bkd0HANerNKlOblp2z7JJOSWjF9Bw9lZe1A-KTrUEgk,378
 spiral/expressions/str_.py,sha256=tY8RXW3JWvr1-bEfCZtk5FAf11wKJnXPuA9EoeJ9tA4,1265
 spiral/expressions/struct.py,sha256=pGAnCDh6AK0BK1XfZ1qG4ce4ranIQEE1HQsgmzBcfwQ,2038
 spiral/expressions/text.py,sha256=-02gBWYoyNQ3qQ1--9HTa8IryUDojYQVIp8C7rgnOWQ,1893
 spiral/expressions/tiff.py,sha256=4dngO97bT1QY0By7-PxOQVmSwQC3PQAiixVhLJ-4HMQ,7986
-spiral/expressions/udf.py,sha256=…
+spiral/expressions/udf.py,sha256=XOxa7Kocb4Cg4q_qFvRT6hVnVzi22CQenqrvS-TL-VY,1936
 spiral/grpc_.py,sha256=f3czdP1Mxme42Y5--a5ogYq1TTiWn-J_MlGjwJ2mWwM,1015
 spiral/iceberg.py,sha256=JGq62Qnf296r9_hRAoH85GQq45-uSBjwXWw_CvPi6G4,930
 spiral/iterable_dataset.py,sha256=Eekg9ad8tcwXcloHWReBbvCSr5ZappRHn2ldKTvwqS0,4622

@@ -91,16 +93,16 @@ spiral/protogen/_/substrait/extensions/__init__.py,sha256=nhnEnho70GAT8WPj2xtwJU
 spiral/protogen/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 spiral/protogen/util.py,sha256=smnvVo6nYH3FfDm9jqhNLaXz4bbTBaQezHQDCTvZyiQ,1486
 spiral/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-spiral/scan.py,sha256=…
+spiral/scan.py,sha256=csbk5ePbU-RlEVIF7isccF2zRBB8L8ZY_HEpalMjgLY,12340
 spiral/server.py,sha256=ztBmB5lBnUz-smQxR_tC8AI5SOhz17wH0MI3GuzDUdM,600
-spiral/settings.py,sha256=…
+spiral/settings.py,sha256=sUhMMBCXaPvUYztN_gztD9TjeUYJwVeEcJrq4FLy6M0,3232
 spiral/snapshot.py,sha256=cTobi5jtiANxalGA-isokQHblNmXGtuUvgUGGNVybsI,1555
 spiral/streaming_/__init__.py,sha256=s7MlW2ERsuZmZGExLFL6RcZon2e0tNBocBg5ANgki7k,61
 spiral/streaming_/reader.py,sha256=tl_lC9xgh1-QFhsZn4xQT7It3PVTzHCEUT2BG2dWBRQ,4166
 spiral/streaming_/stream.py,sha256=DM1hBDHnWm1ZFKZ-hZ4zxeSXITcUI6kWzwdJZvywI8o,5915
 spiral/substrait_.py,sha256=AKeOD4KIXvz2J4TYxnIneOiHddtBIyOhuNxVO_uH0eg,12592
-spiral/table.py,sha256=…
+spiral/table.py,sha256=prjDBcm6Qerdq3ypXzfbXb7ngAcO0j-Z9aTeZvzKoqs,12209
 spiral/text_index.py,sha256=FQ9rgIEGLSJryS9lFdMhKtPFey18BXoWbPXyvZPJJ04,442
-spiral/transaction.py,sha256=…
+spiral/transaction.py,sha256=hQm6DfCklMDpIYJ9qA2wR45cCuUPGCiJy1tHGE3AsEY,3418
 spiral/types_.py,sha256=W_jyO7F6rpPiH69jhgSgV7OxQZbOlb1Ho3InpKUP6Eo,155
-pyspiral-0.6.12.dist-info/RECORD,,
+pyspiral-0.6.14.dist-info/RECORD,,
spiral/__init__.py CHANGED

@@ -1,14 +1,18 @@
 """Python client for Spiral"""
 
+import importlib
+
 # This is here to make sure we load the native extension first
 from spiral import _lib
 
 # Eagerly import the Spiral library
 assert _lib, "Spiral library"
 
+
 from spiral.client import Spiral  # noqa: E402
 from spiral.core.client import Shard, ShuffleConfig  # noqa: E402
 from spiral.dataloader import SpiralDataLoader, World  # noqa: E402
+from spiral.enrichment import Enrichment  # noqa: E402
 from spiral.iceberg import Iceberg  # noqa: E402
 from spiral.key_space_index import KeySpaceIndex  # noqa: E402
 from spiral.project import Project  # noqa: E402

@@ -24,6 +28,7 @@ __all__ = [
     "Table",
     "Snapshot",
     "Transaction",
+    "Enrichment",
     "Scan",
     "Shard",
     "ShuffleConfig",

@@ -33,3 +38,5 @@ __all__ = [
     "World",
     "Iceberg",
 ]
+
+__version__ = importlib.metadata.version("pyspiral")
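The new module-level `__version__` attribute makes the installed version inspectable at runtime. A minimal sketch:

```python
# __version__ is resolved from the installed package metadata via
# importlib.metadata, per the diff above.
import spiral

print(spiral.__version__)  # "0.6.14" for this release
```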
spiral/_lib.abi3.so CHANGED

Binary file
spiral/cli/iceberg.py CHANGED

@@ -8,7 +8,7 @@ from typer import Argument
 from spiral.cli import CONSOLE, ERR_CONSOLE, AsyncTyper, state
 from spiral.cli.types import ProjectArg
 
-app = AsyncTyper(short_help="Apache Iceberg Catalog")
+app = AsyncTyper(short_help="Apache Iceberg Catalog.")
 
 
 @app.command(help="List namespaces.")
spiral/cli/key_spaces.py CHANGED

@@ -87,3 +87,17 @@ def sync(
     index_id = get_index_id(project, name)
     response = state.spiral.api.key_space_indexes.sync_index(index_id, SyncIndexRequest(resources=resources))
     CONSOLE.print(f"Triggered sync job {response.worker_id} for index {index_id}.")
+
+
+# TODO(marko): This will be removed.
+@app.command(help="Run a sync and wait for it to complete.")
+def sync_local(
+    project: ProjectArg,
+    name: Annotated[str | None, Option(help="Index name.")] = None,
+):
+    """Run a sync and wait for it to complete."""
+    index_id = get_index_id(project, name)
+    index = state.spiral.key_space_index(index_id)
+    snapshot = state.spiral.table(index.table_id).snapshot()
+    state.spiral.internal.update_key_space_index(index.core, snapshot.core)
+    CONSOLE.print(f"Index {index.name} is up to date as-of {snapshot.asof}.")
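Unlike `sync`, which triggers a remote worker job, the new `sync_local` command updates the index in-process against the table's latest snapshot. A hedged sketch of the same flow through the client API (the index id is hypothetical, and the exact client entry points may differ):

```python
from spiral import Spiral

sp = Spiral()  # assumes default, already-authenticated settings
index = sp.key_space_index("my-index-id")  # hypothetical index id
snapshot = sp.table(index.table_id).snapshot()
sp.internal.update_key_space_index(index.core, snapshot.core)
print(f"Index {index.name} is up to date as-of {snapshot.asof}.")
```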
spiral/client.py CHANGED

@@ -10,7 +10,7 @@ from spiral.core.client import Internal
 from spiral.core.client import Spiral as CoreSpiral
 from spiral.datetime_ import timestamp_micros
 from spiral.expressions import ExprLike
-from spiral.scan import Scan, ScanState
+from spiral.scan import Scan
 from spiral.settings import Settings, settings
 
 if TYPE_CHECKING:

@@ -121,6 +121,7 @@ class Spiral:
         where = se.lift(where)
 
         return Scan(
+            self,
             self.core.scan(
                 projection.__expr__,
                 filter=where.__expr__ if where else None,

@@ -128,14 +129,6 @@
             ),
         )
 
-    def load_scan(self, scan_state: ScanState) -> Scan:
-        """Load a scan from a serialized scan state.
-
-        Args:
-            scan_state: The serialized scan state.
-        """
-        return Scan(self.core.load_scan(scan_state.core))
-
     # TODO(marko): This should be query, and search should be query + scan.
     def search(
         self,
spiral/core/table/__init__.pyi CHANGED

@@ -70,6 +70,7 @@ class Scan:
     def scan_state(self) -> ScanState: ...
     def to_record_batches(
         self,
+        key_range: KeyRange | None = None,
         key_table: pa.Table | pa.RecordBatch | None = None,
         batch_readahead: int | None = None,
     ) -> pa.RecordBatchReader: ...

@@ -101,10 +102,18 @@ class Transaction:
     status: str
 
     def write(self, table: pa.RecordBatchReader, *, partition_size_bytes: int | None = None): ...
-    def writeback(self, scan: Scan, *, partition_size_bytes: int | None = None): ...
+    def writeback(
+        self,
+        scan: Scan,
+        *,
+        key_range: KeyRange | None = None,
+        partition_size_bytes: int | None = None,
+        batch_readahead: int | None = None,
+    ): ...
     def drop_columns(self, column_paths: list[str]): ...
     def take(self) -> list[Operation]: ...
     def include(self, ops: list[Operation]): ...
     def commit(self): ...
     def abort(self): ...
+    def is_empty(self) -> bool: ...
     def metrics(self) -> dict[str, Any]: ...
spiral/dataloader.py CHANGED

@@ -121,6 +121,7 @@ class SpiralDataLoader:
         # TODO(os): accept vortex arrays here instead of Arrow
         transform_fn: Callable[[pa.RecordBatch], Any] | None = None,
         map_workers: int = 0,
+        infinite: bool = False,
     ):
         """Initialize SpiralDataLoader.
 

@@ -145,6 +146,9 @@
             map_workers: Number of worker processes for parallel transform_fn
                 application. 0 means single-process (no parallelism). Use this for
                 CPU-bound transforms like tokenization or audio decoding.
+            infinite: Whether to cycle through the dataset infinitely. If True,
+                the dataloader will repeat the dataset indefinitely. If False,
+                the dataloader will stop after going through the dataset once.
         """
         self.scan = scan
         self.shards = shards if shards is not None else scan.shards()

@@ -157,6 +161,7 @@
         self.batch_readahead = batch_readahead
         self.transform_fn = transform_fn
         self.map_workers = map_workers
+        self.infinite = infinite
 
         self._samples_yielded = 0
 

@@ -176,7 +181,7 @@
             shuffle=shuffle,
             max_batch_size=self.batch_size,
             batch_readahead=self.batch_readahead,
-            infinite=False,
+            infinite=self.infinite,
         )
 
         if self.skip_samples > 0:
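The new `infinite` flag simply threads through to the underlying shuffle reader. A minimal sketch, assuming an existing `scan` object obtained from the client:

```python
from spiral.dataloader import SpiralDataLoader

# infinite=True keeps cycling through the dataset instead of stopping
# after a single pass; it defaults to False (the previous behavior).
loader = SpiralDataLoader(scan, batch_size=32, infinite=True)

for step, batch in enumerate(loader):
    ...  # training step; supply your own stopping condition
    if step == 10_000:
        break
```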
spiral/enrichment.py ADDED

@@ -0,0 +1,153 @@
+import dataclasses
+import logging
+from functools import partial
+from typing import TYPE_CHECKING, Optional
+
+from spiral.core.client import Shard
+from spiral.core.table.spec import Operation
+from spiral.expressions import Expr
+
+if TYPE_CHECKING:
+    from spiral import KeySpaceIndex, Table
+
+logger = logging.getLogger(__name__)
+
+
+class Enrichment:
+    """
+    An enrichment is used to derive new columns from the existing ones, such as fetching data from object storage
+    with `se.s3.get` or computing embeddings. With the column groups design supporting hundreds of thousands of
+    columns, horizontally expanding tables are a powerful primitive.
+
+    NOTE: Spiral aims to optimize enrichments where the source and destination table are the same.
+    """
+
+    def __init__(
+        self,
+        table: "Table",
+        projection: Expr,
+        where: Expr | None,
+    ):
+        self._table = table
+        self._projection = projection
+        self._where = where
+
+    @property
+    def table(self) -> "Table":
+        """The table to write back into."""
+        return self._table
+
+    @property
+    def projection(self) -> Expr:
+        """The projection expression."""
+        return self._projection
+
+    @property
+    def where(self) -> Expr | None:
+        """The filter expression."""
+        return self._where
+
+    def apply(self, *, batch_readahead: int | None = None, partition_size_bytes: int | None = None) -> None:
+        """Apply the enrichment onto the table in a streaming fashion.
+
+        For large tables, consider using `apply_dask` for distributed execution.
+        """
+        scan = self._table.spiral.scan(self._projection, where=self._where)
+
+        with self._table.txn() as txn:
+            txn.writeback(
+                scan,
+                partition_size_bytes=partition_size_bytes,
+                batch_readahead=batch_readahead,
+            )
+
+    # TODO(marko): Need to figure out this sharding with key space index in places.
+    # We could compute on-demand instead of requiring a resource.
+    def apply_dask(
+        self, *, index: Optional["KeySpaceIndex"] = None, partition_size_bytes: int | None = None, **kwargs
+    ) -> None:
+        """Use distributed Dask to apply the enrichment. Requires `dask[distributed]` to be installed.
+
+        If the "address" of an existing Dask cluster is not provided in `kwargs`, a local cluster will be created.
+
+        IMPORTANT: Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
+        usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
+        encountering such issues, consider splitting the enrichment into a UDF-only derivation that is
+        executed in a streaming fashion, followed by a Dask enrichment for the rest of the computation.
+        If that is not possible, please reach out to support for assistance.
+
+        Args:
+            index: Optional key space index to use for sharding the enrichment.
+                If not provided, the table's default sharding will be used.
+            **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`,
+                such as `address` to connect to an existing cluster.
+        """
+        try:
+            from dask.distributed import Client
+        except ImportError:
+            raise ImportError("dask is not installed, please install dask[distributed] to use this feature.")
+
+        # Connect before doing any work.
+        dask_client = Client(**kwargs)
+
+        # Start a transaction BEFORE the planning scan.
+        tx = self._table.txn()
+        plan_scan = self._table.spiral.scan(self._projection, where=self._where)
+
+        # Determine the "tasks". Use the index if provided.
+        shards = plan_scan.shards()
+        if index is not None:
+            # TODO(marko): This will use index's asof automatically.
+            shards = self._table.spiral.internal.compute_shards(index.core)
+
+        # Partially bind the enrichment function.
+        _compute = partial(
+            _enrichment_task,
+            settings_dict=self._table.spiral.config.model_dump(),
+            state_json=plan_scan.core.scan_state().to_json(),
+            output_table_id=self._table.table_id,
+            partition_size_bytes=partition_size_bytes,
+        )
+        enrichments = dask_client.map(_compute, shards)
+
+        logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {dask_client.dashboard_link}")
+        for result in dask_client.gather(enrichments):
+            result: EnrichmentTaskResult
+            tx.include(result.ops)
+
+        if tx.is_empty():
+            logger.warning("Transaction not committed. No rows were read for enrichment.")
+            return
+
+        tx.commit()
+
+
+@dataclasses.dataclass
+class EnrichmentTaskResult:
+    ops: list[Operation]
+
+    def __getstate__(self):
+        return {"ops": [op.to_json() for op in self.ops]}
+
+    def __setstate__(self, state):
+        self.ops = [Operation.from_json(op_json) for op_json in state["ops"]]
+
+
+# NOTE(marko): This function must be picklable!
+def _enrichment_task(
+    shard: Shard, *, settings_dict, state_json, output_table_id, partition_size_bytes: int | None
+) -> EnrichmentTaskResult:
+    # Returns operations that can be included in a transaction.
+    from spiral import Scan, Spiral
+    from spiral.core.table import ScanState
+    from spiral.settings import Settings
+
+    settings: Settings = Settings.model_validate(settings_dict)
+    sp = Spiral(config=settings)
+    state = ScanState.from_json(state_json)
+    task_scan = Scan(sp, sp.core.load_scan(state))
+    table = sp.table(output_table_id)
+
+    task_tx = table.txn()
+    task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
+    return EnrichmentTaskResult(ops=task_tx.take())
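Taken together, `apply_dask` plans a scan, fans one `_enrichment_task` out per shard, gathers each task's `Operation` list, and commits everything in a single coordinating transaction. A hedged usage sketch, assuming a table `t` whose rows carry an object-storage URI column; the column name and cluster address are hypothetical:

```python
from spiral import expressions as se

# Derive a new column by fetching object bodies; se.s3.get is the
# example named in the Enrichment docstring above.
enrichment = t.enrich({"raw": se.s3.get(t["uri"])})

# Local Dask cluster (no address given):
enrichment.apply_dask(partition_size_bytes=256 << 20)

# Or reuse an existing cluster; extra kwargs are forwarded to
# dask.distributed.Client:
enrichment.apply_dask(address="tcp://scheduler:8786")
```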
spiral/expressions/__init__.py CHANGED

@@ -8,7 +8,9 @@ import pyarrow as pa
 
 from spiral import _lib, arrow_
 
+from . import http as http
 from . import list_ as list
+from . import s3 as s3
 from . import str_ as str
 from . import struct as struct
 from . import text as text

@@ -47,6 +49,7 @@ __all__ = [
     "xor",
     "text",
     "s3",
+    "http",
     "UDF",
 ]
 
spiral/expressions/http.py ADDED

@@ -0,0 +1,16 @@
+from spiral import _lib
+from spiral.expressions.base import Expr, ExprLike
+
+
+def get(expr: ExprLike) -> Expr:
+    """Read data from the URL.
+
+    Args:
+        expr: URLs of the data that needs to be read.
+    """
+    from spiral import expressions as se
+
+    expr = se.lift(expr)
+
+    # This just works :)
+    return Expr(_lib.expr.s3.get(expr.__expr__))
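A minimal usage sketch for the new expression, assuming a client `sp` and a table `t` with a column of HTTP(S) URLs (the column names are hypothetical, and dict projections are assumed to lift as elsewhere in the API):

```python
from spiral import expressions as se

# Lazily fetch each URL's body while the scan evaluates.
fetched = se.http.get(t["image_url"])
result = sp.scan({"image": fetched}).to_table()
```

Note that the implementation simply delegates to the existing s3 `get` kernel, which, per the comment in the source, handles generic URLs as well.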
spiral/expressions/s3.py
CHANGED
spiral/expressions/udf.py CHANGED

@@ -46,7 +46,12 @@ class UDF(abc.ABC):
 
     @abc.abstractmethod
     def return_type(self, scope: pa.DataType) -> pa.DataType:
-        """Must return the return type of the UDF given the input scope type."""
+        """Must return the return type of the UDF given the input scope type.
+
+        IMPORTANT: All expressions in Spiral must return nullable (Arrow default) types,
+        including nested structs, meaning that all fields in structs must also be nullable,
+        and if those fields are structs, their fields must also be nullable, and so on.
+        """
         ...
 
     @abc.abstractmethod
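Since PyArrow fields are nullable by default, the easiest way to violate the requirement above is passing `nullable=False` explicitly. A compliant `return_type` sketch (the field names are illustrative, not part of the package):

```python
import pyarrow as pa

def return_type(self, scope: pa.DataType) -> pa.DataType:
    # pa.field(...) defaults to nullable=True at every nesting level,
    # which satisfies the nullability requirement.
    return pa.struct(
        [
            pa.field("tokens", pa.list_(pa.int32())),
            pa.field("stats", pa.struct([pa.field("count", pa.int64())])),
        ]
    )
```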
spiral/scan.py CHANGED

@@ -1,10 +1,11 @@
+from functools import partial
 from typing import TYPE_CHECKING, Any, Optional
 
 import pyarrow as pa
 
 from spiral.core.client import Shard, ShuffleConfig
+from spiral.core.table import KeyRange
 from spiral.core.table import Scan as CoreScan
-from spiral.core.table import ScanState as CoreScanState
 from spiral.core.table.spec import Schema
 from spiral.settings import CI, DEV
 

@@ -16,37 +17,17 @@ if TYPE_CHECKING:
     import streaming  # noqa
     import torch.utils.data as torchdata  # noqa
 
+    from spiral.client import Spiral
     from spiral.dataloader import SpiralDataLoader, World  # noqa
 
 
-class ScanState:
-    """
-    Evaluated properties of the scan
-    """
-
-    __slots__ = ("core",)
-
-    def __init__(self, core: CoreScanState):
-        self.core = core
-
-    def __getstate__(self):
-        return self.core.to_json()
-
-    def __setstate__(self, state):
-        self.core = CoreScanState.from_json(state)
-
-
 class Scan:
     """Scan object."""
 
-    def __init__(self, core: CoreScan):
+    def __init__(self, spiral: "Spiral", core: CoreScan):
+        self.spiral = spiral
         self.core = core
 
-    @property
-    def scan_state(self) -> ScanState:
-        """Returns evaluated properties of the scan."""
-        return ScanState(self.core.scan_state())
-
     @property
     def metrics(self) -> dict[str, Any]:
         """Returns metrics about the scan."""

@@ -72,6 +53,8 @@
 
     def to_record_batches(
         self,
+        *,
+        key_range: KeyRange | None = None,
         key_table: pa.Table | pa.RecordBatchReader | None = None,
         batch_size: int | None = None,
         batch_readahead: int | None = None,

@@ -79,6 +62,9 @@
         """Read as a stream of RecordBatches.
 
         Args:
+            key_range: Optional key range to filter the scan.
+                If provided, the scan will only return rows within the key range.
+                Only one of key_range or key_table can be provided.
             key_table: a table of keys to "take" (including aux columns for cell-push-down).
                 If None, the scan will be executed without a key table.
             batch_size: the maximum number of rows per returned batch.

@@ -86,6 +72,9 @@
                 RecordBatchReader, the batch_size argument must be None, and the existing batching is respected.
             batch_readahead: the number of batches to prefetch in the background.
         """
+        if key_range is not None and key_table is not None:
+            raise ValueError("Only one of key_range or key_table can be provided.")
+
         if isinstance(key_table, pa.RecordBatchReader):
             if batch_size is not None:
                 raise ValueError(

@@ -94,46 +83,54 @@
         elif isinstance(key_table, pa.Table):
             key_table = key_table.to_reader(max_chunksize=batch_size)
 
-        return self.core.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
+        return self.core.to_record_batches(key_range=key_range, key_table=key_table, batch_readahead=batch_readahead)
 
     def to_table(
         self,
+        *,
+        key_range: KeyRange | None = None,
         key_table: pa.Table | pa.RecordBatchReader | None = None,
     ) -> pa.Table:
         """Read into a single PyArrow Table.
 
         Args:
+            key_range: Optional key range to filter the scan.
+                If provided, the scan will only return rows within the key range.
+                Only one of key_range or key_table can be provided.
             key_table: a table of keys to "take" (including aux columns for cell-push-down).
                 If None, the scan will be executed without a key table.
         """
         # NOTE: Evaluates fully on Rust side which improves debuggability.
-        if DEV and not CI and key_table is None:
+        if DEV and not CI and key_table is None and key_range is None:
             rb = self.core.to_record_batch()
             return pa.Table.from_batches([rb])
 
-        return self.to_record_batches(key_table=key_table).read_all()
+        return self.to_record_batches(key_range=key_range, key_table=key_table).read_all()
 
     def to_dask(self) -> "dd.DataFrame":
         """Read into a Dask DataFrame.
 
         Requires the `dask` package to be installed.
+
+        IMPORTANT: Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
+        usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
+        encountering such issues, please reach out to support for assistance.
         """
         import dask.dataframe as dd
-        import pandas as pd
-        …
 
+        _read_shard = partial(
+            _read_shard_task,
+            settings_dict=self.spiral.config.model_dump(),
+            state_json=self.core.scan_state().to_json(),
+        )
         return dd.from_map(_read_shard, self.shards())
 
-    def to_pandas(self) -> "pd.DataFrame":
+    def to_pandas(self, *, key_range: KeyRange | None = None) -> "pd.DataFrame":
         """Read into a Pandas DataFrame.
 
         Requires the `pandas` package to be installed.
         """
-        return self.to_table().to_pandas()
+        return self.to_table(key_range=key_range).to_pandas()

@@ -188,16 +185,18 @@
 
         Returns:
             SpiralDataLoader with shards partitioned for this rank.
-        """
-        # Example usage:
-        #
-        # Auto-detect from PyTorch distributed:
-        # loader: SpiralDataLoader = scan.to_distributed_data_loader(batch_size=32)
-        #
-        # Explicit world configuration:
-        # world = World(rank=0, world_size=4)
-        # loader: SpiralDataLoader = scan.to_distributed_data_loader(world=world, batch_size=32)
 
+        Auto-detect from PyTorch distributed:
+        ```python
+        loader: SpiralDataLoader = scan.to_distributed_data_loader(batch_size=32)
+        ```
+
+        Explicit world configuration:
+        ```python
+        world = World(rank=0, world_size=4)
+        loader: SpiralDataLoader = scan.to_distributed_data_loader(world=world, batch_size=32)
+        ```
+        """
         from spiral.dataloader import SpiralDataLoader, World
 
         if world is None:

@@ -231,19 +230,21 @@
 
         Returns:
             New SpiralDataLoader instance configured to resume from the checkpoint.
+
+        Save checkpoint during training:
+        ```python
+        loader = scan.to_distributed_data_loader(batch_size=32, seed=42)
+        checkpoint = loader.state_dict()
+        ```
+
+        Resume later - uses same shards from checkpoint:
+        ```python
+        resumed_loader = scan.resume_data_loader(
+            checkpoint,
+            batch_size=32,
+            transform_fn=my_transform,
+        )
+        ```
         """
-        # Example usage:
-        #
-        # Save checkpoint during training:
-        # loader = scan.to_distributed_data_loader(batch_size=32, seed=42)
-        # checkpoint = loader.state_dict()
-        #
-        # Resume later - uses same shards from checkpoint:
-        # resumed_loader = scan.resume_data_loader(
-        #     checkpoint,
-        #     batch_size=32,
-        #     transform_fn=my_transform,
-        # )
         from spiral.dataloader import SpiralDataLoader
 
         return SpiralDataLoader.from_state_dict(self, state, **kwargs)

@@ -311,3 +312,17 @@
         from spiral.debug.metrics import display_metrics
 
         display_metrics(self.metrics)
+
+
+# NOTE(marko): This function must be picklable!
+def _read_shard_task(shard: Shard, *, settings_dict, state_json) -> "pd.DataFrame":
+    from spiral import Spiral
+    from spiral.core.table import ScanState
+    from spiral.settings import Settings
+
+    settings: Settings = Settings.model_validate(settings_dict)
+    sp = Spiral(config=settings)
+    state = ScanState.from_json(state_json)
+    task_scan = Scan(sp, sp.core.load_scan(state))
+
+    return task_scan.to_pandas(key_range=shard.key_range)
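The shard-oriented reading that `_read_shard_task` performs is also available directly: shards expose key ranges, and the read methods now accept `key_range` to restrict a scan to one slice of the key space. A hedged sketch, assuming a client `sp` and a table `t`:

```python
scan = sp.scan(t)

for shard in scan.shards():
    # Read only this shard's slice of the key space.
    df = scan.to_pandas(key_range=shard.key_range)
    ...  # process df

# key_range and key_table are mutually exclusive:
# scan.to_table(key_range=kr, key_table=keys)  # raises ValueError
```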
spiral/settings.py CHANGED

@@ -4,7 +4,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Annotated
 
 import typer
-from pydantic import Field, ValidatorFunctionWrapHandler, WrapValidator
+from pydantic import Field, PlainSerializer, ValidatorFunctionWrapHandler, WrapValidator
 from pydantic_settings import (
     BaseSettings,
     InitSettingsSource,

@@ -28,13 +28,16 @@ PACKAGE_NAME = "pyspiral"
 
 
 def validate_token(v, handler: ValidatorFunctionWrapHandler):
-    if isinstance(v, str):
-        …
-        return Token(v)
-    raise ValueError("Token value must be a string")
+    if not isinstance(v, str):
+        raise ValueError("Token value (SPIRAL__SPIRALDB__TOKEN) must be a string")
+    return Token(v)
 
 
-TokenType = Annotated[Token, WrapValidator(validate_token)]
+TokenType = Annotated[
+    Token,
+    WrapValidator(validate_token),
+    PlainSerializer(lambda token: token.expose_secret(), return_type=str),
+]
 
 
 class SpiralDBSettings(BaseSettings):
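The `PlainSerializer` addition matters because the new Dask paths round-trip settings through `model_dump()` and `model_validate()` on the workers; without it, the secret-wrapping `Token` would not survive serialization. A generic pydantic illustration of the pattern (not Spiral's actual `Settings`; `Secret` is a stand-in for `Token`):

```python
from typing import Annotated

from pydantic import BaseModel, ConfigDict, PlainSerializer, WrapValidator


class Secret:
    """Stand-in for a secret-wrapping type like Token."""

    def __init__(self, value: str) -> None:
        self._value = value

    def expose_secret(self) -> str:
        return self._value


def validate_secret(v, handler):
    if not isinstance(v, str):
        raise ValueError("must be a string")
    return Secret(v)


SecretType = Annotated[
    Secret,
    WrapValidator(validate_secret),
    # Serialize back to the raw string so model_dump() output can be
    # re-validated in another process.
    PlainSerializer(lambda s: s.expose_secret(), return_type=str),
]


class AppSettings(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    token: SecretType


cfg = AppSettings(token="abc")
assert AppSettings.model_validate(cfg.model_dump()).token.expose_secret() == "abc"
```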
spiral/table.py CHANGED

@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any
 
 from spiral.core.table import Table as CoreTable
 from spiral.core.table.spec import Schema
+from spiral.enrichment import Enrichment
 from spiral.expressions.base import Expr, ExprLike
 from spiral.settings import settings
 from spiral.snapshot import Snapshot

@@ -12,13 +13,11 @@ if TYPE_CHECKING:
     import duckdb
     import polars as pl
     import pyarrow.dataset as ds
-    import streaming
-    import torch.utils.data as torchdata  # noqa
 
     from spiral.client import Spiral
     from spiral.dataloader import SpiralDataLoader
     from spiral.key_space_index import KeySpaceIndex
-    from spiral.…
+    from spiral.streaming_ import SpiralStream

@@ -51,6 +50,14 @@
         """Returns the fully qualified identifier of the table."""
         return self._identifier or self.table_id
 
+    @property
+    def project(self) -> str | None:
+        """Returns the project of the table."""
+        if self._identifier is None:
+            return None
+        project, _, _ = self._identifier.split(".")
+        return project
+
     @property
     def dataset(self) -> str | None:
         """Returns the dataset of the table."""

@@ -111,24 +118,29 @@
             partition_size_bytes=partition_size_bytes,
         )
 
-    def …
-        """Write back the results of a scan to the table.
-        …
-        """
-        …
+    def enrich(
+        self,
+        *projections: ExprLike,
+        where: ExprLike | None = None,
+    ) -> Enrichment:
+        """Returns an Enrichment object that, when applied, produces new columns.
+
+        Enrichment can be applied in different ways, e.g. distributed.
+
+        :param projections: Projection expressions deriving new columns to write back.
+            Expressions can be over multiple Spiral tables, but all tables, including
+            this one, must share the same key schema.
+        :param where: Optional filter expression to apply when reading the input tables.
+        """
+        from spiral import expressions as se
+
+        # Combine table with all projections into a single struct.
+        # The table is included to ensure key columns are present in the scan output.
+        projection = se.merge(self, *projections)
+        if where is not None:
+            where = se.lift(where)
+
+        return Enrichment(self, projection, where)

@@ -275,7 +287,7 @@
         projection: Expr | None = None,
         cache_dir: str | None = None,
         shard_row_block_size: int | None = None,
-    ) -> "…":
+    ) -> "SpiralStream":
         """Returns a stream to be used with MosaicML's StreamingDataset.
 
         Requires `streaming` package to be installed.

@@ -310,4 +322,4 @@
             shards=shards,
             cache_dir=cache_dir,
             shard_row_block_size=shard_row_block_size,
-        )
\ No newline at end of file
+        )
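A hedged sketch of the new `enrich` entry point end to end, assuming a table `t` (the column names are hypothetical); the manual form below it mirrors what `Enrichment.apply` does per the new module:

```python
from spiral import expressions as se

# Declare a derived column and apply it in a streaming fashion.
t.enrich({"bytes": se.s3.get(t["uri"])}).apply()

# Roughly equivalent manual form: scan the merged projection and write
# it back inside a transaction.
scan = t.spiral.scan(se.merge(t, {"bytes": se.s3.get(t["uri"])}))
with t.txn() as txn:
    txn.writeback(scan)
```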
spiral/transaction.py CHANGED

@@ -1,3 +1,4 @@
+from spiral.core.table import KeyRange
 from spiral.core.table import Transaction as CoreTransaction
 from spiral.core.table.spec import Operation
 from spiral.expressions.base import ExprLike

@@ -19,6 +20,10 @@ class Transaction:
         """The status of the transaction."""
         return self._core.status
 
+    def is_empty(self) -> bool:
+        """Check if the transaction has no operations."""
+        return self._core.is_empty()
+
     def __enter__(self):
         return self
 

@@ -41,16 +46,26 @@
 
         self._core.write(record_batches, partition_size_bytes=partition_size_bytes)
 
-    def writeback(self, scan: Scan, *, partition_size_bytes: int | None = None):
+    def writeback(
+        self,
+        scan: Scan,
+        *,
+        key_range: KeyRange | None = None,
+        partition_size_bytes: int | None = None,
+        batch_readahead: int | None = None,
+    ):
         """Write back the results of a scan to the table.
 
         :param scan: The scan to write back.
             The scan does NOT need to be over the same table as the transaction,
             but it does need to have the same key schema.
+        :param key_range: Optional key range to limit the writeback to.
         :param partition_size_bytes: The maximum partition size in bytes.
+        :param batch_readahead: The number of batches to read ahead when evaluating the scan.
         """
-        self._core.writeback(scan.core, partition_size_bytes=partition_size_bytes)
+        self._core.writeback(
+            scan.core, key_range=key_range, partition_size_bytes=partition_size_bytes, batch_readahead=batch_readahead
+        )
 
     def drop_columns(self, column_paths: list[str]):
         """
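A hedged sketch of the extended signature, assuming a client `sp`, a destination table `dst`, and a source scan with a matching key schema:

```python
scan = sp.scan(projection)  # hypothetical projection expression
shard = scan.shards()[0]    # shards carry key ranges

with dst.txn() as txn:
    txn.writeback(
        scan,
        key_range=shard.key_range,       # new: restrict to one shard's keys
        partition_size_bytes=128 << 20,  # cap written partition size
        batch_readahead=4,               # new: prefetch while evaluating
    )
```

This is the shape `_enrichment_task` uses to let many workers write disjoint key ranges of the same scan and hand their operations back to one coordinating transaction.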
{pyspiral-0.6.12.dist-info → pyspiral-0.6.14.dist-info}/WHEEL: file without changes
{pyspiral-0.6.12.dist-info → pyspiral-0.6.14.dist-info}/entry_points.txt: file without changes