pyspiral-0.6.12-cp312-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → pyspiral-0.6.13-cp312-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pyspiral
- Version: 0.6.12
+ Version: 0.6.13
  Classifier: Intended Audience :: Science/Research
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python
@@ -31,15 +31,18 @@ Requires-Dist: typer>=0.16
  Requires-Dist: xxhash>=3.4.1
  Requires-Dist: polars>=1.31.0 ; extra == 'polars'
  Requires-Dist: duckdb>=1.3.2 ; extra == 'duckdb'
- Requires-Dist: datasets>=4.0.0 ; extra == 'datasets'
- Requires-Dist: pyiceberg>=0.9.1 ; extra == 'pyiceberg'
+ Requires-Dist: pyiceberg[s3fs]>=0.9.1 ; extra == 'iceberg'
+ Requires-Dist: datasets>=4.0.0 ; extra == 'huggingface'
  Requires-Dist: mosaicml-streaming>=0.13.0 ; extra == 'streaming'
  Requires-Dist: vortex-data>=0.52.1 ; extra == 'streaming'
+ Requires-Dist: dask>=2025.10.0 ; extra == 'dask'
+ Requires-Dist: distributed>=2025.10.0 ; extra == 'dask'
  Provides-Extra: polars
  Provides-Extra: duckdb
- Provides-Extra: datasets
- Provides-Extra: pyiceberg
+ Provides-Extra: iceberg
+ Provides-Extra: huggingface
  Provides-Extra: streaming
+ Provides-Extra: dask
  Summary: Python client for Spiral.
  Home-Page: https://spiraldb.com
  Author-email: SpiralDB <hello@spiraldb.com>
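
Note on the metadata change: the optional extras were renamed in this release — `datasets` is now `huggingface`, `pyiceberg` is now `iceberg` (and pulls in `pyiceberg[s3fs]`), and a new `dask` extra adds `dask` and `distributed`. A minimal, hedged way to check which extras an installed wheel advertises, using only the standard library:

```python
from importlib.metadata import metadata

# On 0.6.13 this should list "polars", "duckdb", "iceberg", "huggingface",
# "streaming", and "dask".
print(metadata("pyspiral").get_all("Provides-Extra"))
```
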
@@ -1,8 +1,8 @@
- pyspiral-0.6.12.dist-info/METADATA,sha256=ANXjtdzd8s_zdWLd-mTm0X07pWbjlpkjQP8X5yP4qpY,1843
- pyspiral-0.6.12.dist-info/WHEEL,sha256=0ecHyBdkJfSXYIVmWsPh7S-4h4fSrB4FlXhlnIu9c_A,130
- pyspiral-0.6.12.dist-info/entry_points.txt,sha256=uft7u-a6g40NLt4Q6BleWbK4NY0M8nZuYPpP8DV0EOk,45
- spiral/__init__.py,sha256=n4JNLrO3wyw_k_U_JKyNiGON0wEpfvqxDhDdB2P6dhM,1007
- spiral/_lib.abi3.so,sha256=XaAqnKj8sXWK4OjxsLtR-hAI2hAw523pL2aTyBrF_Is,61055320
+ pyspiral-0.6.13.dist-info/METADATA,sha256=AvViHjB1v9OqYTKONYf_DbfCB0HktAy-numkhQhuq20,1977
+ pyspiral-0.6.13.dist-info/WHEEL,sha256=0ecHyBdkJfSXYIVmWsPh7S-4h4fSrB4FlXhlnIu9c_A,130
+ pyspiral-0.6.13.dist-info/entry_points.txt,sha256=uft7u-a6g40NLt4Q6BleWbK4NY0M8nZuYPpP8DV0EOk,45
+ spiral/__init__.py,sha256=gAysTwG_oEeKVMdCOfOzDhl0bM2miiK8Ds2vvUihBWw,1153
+ spiral/_lib.abi3.so,sha256=ciM05gN0vs_C6SCqv5LmppQZOZkZcpSb8unZAFc9oXA,61221816
  spiral/adbc.py,sha256=7IxfWIeQN-fh0W5OdN_PP2x3pzQYg6ZUOLsHg3jktqw,14842
  spiral/api/__init__.py,sha256=ULBlVq3PnfNOO6T5naE_ULmmii-83--qTuN2PpAUQN0,2241
  spiral/api/admin.py,sha256=A1iVR1XYJSObZivPAD5UzmPuMgupXc9kaHNYYa_kwfs,585
@@ -23,8 +23,8 @@ spiral/cli/admin.py,sha256=-ubYqs8nKjnQStbQ68jpWx_9xh0TsaxI0wM1Hfko8_U,319
  spiral/cli/app.py,sha256=smzGj5a2RwhM9RQChmlEeKZLN4Fk60-bP7Lm5_Is1Rw,2760
  spiral/cli/console.py,sha256=6JHbAQV6MFWz3P-VzqPOjhHpkIQagsCdzTMvmuDKMkU,2580
  spiral/cli/fs.py,sha256=vaPcSc2YghhHeipxNitIdsHaBhFwlwkvPFqYsFSN9P0,2927
- spiral/cli/iceberg.py,sha256=Q14tcGcn1LixbFCYP0GhfYwFFXTmmi8tqBPYwalJEyE,3248
- spiral/cli/key_spaces.py,sha256=TF1tbRnrjemp4aMAbLc7o4_jPChIumaQGPuvfW0sR5o,2945
+ spiral/cli/iceberg.py,sha256=wdMyl0j821MLnXNZ6Kwm65ogh98C-pjMJm3Y6YqlnTI,3249
+ spiral/cli/key_spaces.py,sha256=Xaw7WH-Qw_j6AxisdIoKfjAgVRXLM9qBFzuCTjPAFLI,3516
  spiral/cli/login.py,sha256=2tw6uN5rEpiMMAmjQSB3-JUPf3C0Wc1eTGCDxhYtJps,731
  spiral/cli/orgs.py,sha256=fmOuLxpeIFfKqePRi292Gv9k-EF5pPn_tbKd2BLl2Ig,2869
  spiral/cli/printer.py,sha256=aosc763hDFgoXJGkiANmNyO3kAsecAS1JWgjEhn8GCM,1784
@@ -35,7 +35,7 @@ spiral/cli/telemetry.py,sha256=Uxo1Q1FkKJ6n6QNGOUmL3j_pRRWRx0qWIhoP-U9BuR0,589
  spiral/cli/text.py,sha256=DlWGe4JrkdERAiqyITNpk91Wqb63Re99rNYlIFsIamc,4031
  spiral/cli/types.py,sha256=XYzo1GgX7dBBItoBSrHI4vO5C2lLmS2sktb-2GnGH3E,1362
  spiral/cli/workloads.py,sha256=2_SLfQTFN6y73R9H0i9dk8VIOVagKxSxOpHXC56yptY,2015
- spiral/client.py,sha256=pw6vB85oLVbBudc_HRzmLCItcecsTjNM5SMu_kVOMCo,6568
+ spiral/client.py,sha256=zMp-xXGL4R1Py_rYrC5o3jFLam1oA74azi50dvMP-_o,6329
  spiral/core/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  spiral/core/_tools/__init__.pyi,sha256=b2KLfTOQ67pjfbYt07o0IGiTu5o2bZw69lllV8v0Dps,143
  spiral/core/authn/__init__.pyi,sha256=z_GWyIS62fuiYQrYO8hzw4W8oGaiciqS1u5qtAt54VY,769
@@ -49,26 +49,28 @@ spiral/core/expr/struct_/__init__.pyi,sha256=MXckd98eV_x3X0RhEWvlkA3DcDXRtLs5pNn
  spiral/core/expr/text/__init__.pyi,sha256=ed83n1xcsGY7_QDhMmJGnSQ20UrJFXcdv1AveSEcS1c,175
  spiral/core/expr/udf/__init__.pyi,sha256=zsZs081KVhY3-1JidqTkWMW81Qd_ScoTGZvasIhIK-4,358
  spiral/core/expr/video/__init__.pyi,sha256=nQJEcSsigZuRpMjkI_O4EEtMK_n2zRvorcL_KEeD5vU,95
- spiral/core/table/__init__.pyi,sha256=HN4ag8E1QDF_VgekJZqjhuQLhorU3ivjIOBHai2OEVc,3672
+ spiral/core/table/__init__.pyi,sha256=YBL12_JPTWz2mNbqlDqbT1exxVJYzwfXdHCi6Z37JxA,3841
  spiral/core/table/manifests/__init__.pyi,sha256=eVfDpmhYSjafIvvALqAkZe5baN3Y1HpKpxYEbjwd4gQ,1043
  spiral/core/table/metastore/__init__.pyi,sha256=rc3u9MwEKRvL2kxOc8lBorddFRnM8o_o1frqtae86a4,1697
- spiral/core/table/spec/__init__.pyi,sha256=PgacM_fZmkHuplj7IbYrj5KfFI3-VPYnyuzI2w7A70Y,5717
- spiral/dataloader.py,sha256=2haLoI6KLrzXfPozAgEa-eCOSDsNldJ1qwCmFpNMyTQ,10281
+ spiral/core/table/spec/__init__.pyi,sha256=twzX4vFmgBxInZWq_nyP6DR9OQjjOVrbZMn97kndeS8,5808
+ spiral/dataloader.py,sha256=W9siY4BF4p_rwTTSS4KgsaQsPLxxza6XmQhrdBzzMJ8,10592
  spiral/dataset.py,sha256=PMLoXnXuEUciP6-NXqTmQLXu0UIH7OcC4-iZtY_iuO8,7973
  spiral/datetime_.py,sha256=elXaUWtZuuLVcu9E0aXnvYRPB9XWqZbLDToozQYQYjU,950
  spiral/debug/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  spiral/debug/manifests.py,sha256=7f1O3ba9mrA5nXpOF9cEIQuUAteP5wiBkFy_diQJ7No,3216
  spiral/debug/metrics.py,sha256=XdRDcjggtsLNGCAjam6IxG9072pz_d2C8iLApNRFUtk,2044
  spiral/debug/scan.py,sha256=UEm_aRnql5pwDPTpZgakMLNjlzkKL4RurBFFqH_BLAQ,9526
- spiral/expressions/__init__.py,sha256=UNxK5qQNrl-BuHsjKcWDj35w5lJviLkGFUQj8OhLID0,7919
+ spiral/enrichment.py,sha256=e2yzNWTTG73uEkLTc4ccTNRQ94cBtM04eGzlJ2-kBOI,5851
+ spiral/expressions/__init__.py,sha256=Fp7Xx3exh9KJad92tgd_TGGIpYLQTHqWjW-pexzQibU,7981
  spiral/expressions/base.py,sha256=PvhJkcUSsPSIaxirHVzM9zlqyBXiaiia1HXohXdOmL4,5377
+ spiral/expressions/http.py,sha256=WfHVLqz_LjBr78mN3ARBRQqgBrkao7-S73JxjC4Xwvo,356
  spiral/expressions/list_.py,sha256=MMt5lf5H1M3O-x6N_PvqOLGq9NOk6Ukv0fPWwPC_uy4,1809
- spiral/expressions/s3.py,sha256=D-kuLifIEY314Q8rB2-ZP8U-IT0FywtbJDMuyusBKiQ,414
+ spiral/expressions/s3.py,sha256=bkd0HANerNKlOblp2z7JJOSWjF9Bw9lZe1A-KTrUEgk,378
  spiral/expressions/str_.py,sha256=tY8RXW3JWvr1-bEfCZtk5FAf11wKJnXPuA9EoeJ9tA4,1265
  spiral/expressions/struct.py,sha256=pGAnCDh6AK0BK1XfZ1qG4ce4ranIQEE1HQsgmzBcfwQ,2038
  spiral/expressions/text.py,sha256=-02gBWYoyNQ3qQ1--9HTa8IryUDojYQVIp8C7rgnOWQ,1893
  spiral/expressions/tiff.py,sha256=4dngO97bT1QY0By7-PxOQVmSwQC3PQAiixVhLJ-4HMQ,7986
- spiral/expressions/udf.py,sha256=yvZCuGK9S9Sa9I18h-apUxsDni2B7E9WEqPrxHBjUWE,1657
+ spiral/expressions/udf.py,sha256=XOxa7Kocb4Cg4q_qFvRT6hVnVzi22CQenqrvS-TL-VY,1936
  spiral/grpc_.py,sha256=f3czdP1Mxme42Y5--a5ogYq1TTiWn-J_MlGjwJ2mWwM,1015
  spiral/iceberg.py,sha256=JGq62Qnf296r9_hRAoH85GQq45-uSBjwXWw_CvPi6G4,930
  spiral/iterable_dataset.py,sha256=Eekg9ad8tcwXcloHWReBbvCSr5ZappRHn2ldKTvwqS0,4622
@@ -91,16 +93,16 @@ spiral/protogen/_/substrait/extensions/__init__.py,sha256=nhnEnho70GAT8WPj2xtwJU
  spiral/protogen/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  spiral/protogen/util.py,sha256=smnvVo6nYH3FfDm9jqhNLaXz4bbTBaQezHQDCTvZyiQ,1486
  spiral/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- spiral/scan.py,sha256=fXZq0NL9YApt-UwkGpaT0ETn-rK-1_tltq7nqsImZI4,11199
+ spiral/scan.py,sha256=csbk5ePbU-RlEVIF7isccF2zRBB8L8ZY_HEpalMjgLY,12340
  spiral/server.py,sha256=ztBmB5lBnUz-smQxR_tC8AI5SOhz17wH0MI3GuzDUdM,600
- spiral/settings.py,sha256=JRQSwjJyNaCqQdQLxiqB_O_LZRQXMLyshJBrI2LZHwM,3113
+ spiral/settings.py,sha256=sUhMMBCXaPvUYztN_gztD9TjeUYJwVeEcJrq4FLy6M0,3232
  spiral/snapshot.py,sha256=cTobi5jtiANxalGA-isokQHblNmXGtuUvgUGGNVybsI,1555
  spiral/streaming_/__init__.py,sha256=s7MlW2ERsuZmZGExLFL6RcZon2e0tNBocBg5ANgki7k,61
  spiral/streaming_/reader.py,sha256=tl_lC9xgh1-QFhsZn4xQT7It3PVTzHCEUT2BG2dWBRQ,4166
  spiral/streaming_/stream.py,sha256=DM1hBDHnWm1ZFKZ-hZ4zxeSXITcUI6kWzwdJZvywI8o,5915
  spiral/substrait_.py,sha256=AKeOD4KIXvz2J4TYxnIneOiHddtBIyOhuNxVO_uH0eg,12592
- spiral/table.py,sha256=dwQr1EAACbfxG8fISFqRrUEAE2P2y6xsx0vFK9Gwyfc,11662
+ spiral/table.py,sha256=prjDBcm6Qerdq3ypXzfbXb7ngAcO0j-Z9aTeZvzKoqs,12209
  spiral/text_index.py,sha256=FQ9rgIEGLSJryS9lFdMhKtPFey18BXoWbPXyvZPJJ04,442
- spiral/transaction.py,sha256=M_Tf-TijVBluuInWk6XSFNCR2dKN4S9EdsHM3QD20ng,2948
+ spiral/transaction.py,sha256=hQm6DfCklMDpIYJ9qA2wR45cCuUPGCiJy1tHGE3AsEY,3418
  spiral/types_.py,sha256=W_jyO7F6rpPiH69jhgSgV7OxQZbOlb1Ho3InpKUP6Eo,155
- pyspiral-0.6.12.dist-info/RECORD,,
+ pyspiral-0.6.13.dist-info/RECORD,,
spiral/__init__.py CHANGED
@@ -1,14 +1,18 @@
  """Python client for Spiral"""

+ import importlib
+
  # This is here to make sure we load the native extension first
  from spiral import _lib

  # Eagerly import the Spiral library
  assert _lib, "Spiral library"

+
  from spiral.client import Spiral  # noqa: E402
  from spiral.core.client import Shard, ShuffleConfig  # noqa: E402
  from spiral.dataloader import SpiralDataLoader, World  # noqa: E402
+ from spiral.enrichment import Enrichment  # noqa: E402
  from spiral.iceberg import Iceberg  # noqa: E402
  from spiral.key_space_index import KeySpaceIndex  # noqa: E402
  from spiral.project import Project  # noqa: E402
@@ -24,6 +28,7 @@ __all__ = [
  "Table",
  "Snapshot",
  "Transaction",
+ "Enrichment",
  "Scan",
  "Shard",
  "ShuffleConfig",
@@ -33,3 +38,5 @@ __all__ = [
  "World",
  "Iceberg",
  ]
+
+ __version__ = importlib.metadata.version("pyspiral")
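
The last hunk also starts exporting the installed distribution version at module level. A minimal check, assuming the 0.6.13 wheel is installed:

```python
import spiral

# Resolved from package metadata at import time (new in 0.6.13).
print(spiral.__version__)
```
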
spiral/_lib.abi3.so CHANGED
Binary file
spiral/cli/iceberg.py CHANGED
@@ -8,7 +8,7 @@ from typer import Argument
  from spiral.cli import CONSOLE, ERR_CONSOLE, AsyncTyper, state
  from spiral.cli.types import ProjectArg

- app = AsyncTyper(short_help="Apache Iceberg Catalog")
+ app = AsyncTyper(short_help="Apache Iceberg Catalog.")


  @app.command(help="List namespaces.")
spiral/cli/key_spaces.py CHANGED
@@ -87,3 +87,17 @@ def sync(
  index_id = get_index_id(project, name)
  response = state.spiral.api.key_space_indexes.sync_index(index_id, SyncIndexRequest(resources=resources))
  CONSOLE.print(f"Triggered sync job {response.worker_id} for index {index_id}.")
+
+
+ # TODO(marko): This will be removed.
+ @app.command(help="Run a sync and wait for it to complete.")
+ def sync_local(
+ project: ProjectArg,
+ name: Annotated[str | None, Option(help="Index name.")] = None,
+ ):
+ """Run a sync and wait for it to complete."""
+ index_id = get_index_id(project, name)
+ index = state.spiral.key_space_index(index_id)
+ snapshot = state.spiral.table(index.table_id).snapshot()
+ state.spiral.internal.update_key_space_index(index.core, snapshot.core)
+ CONSOLE.print(f"Index {index.name} is up to date as-of {snapshot.asof}.")
spiral/client.py CHANGED
@@ -10,7 +10,7 @@ from spiral.core.client import Internal
  from spiral.core.client import Spiral as CoreSpiral
  from spiral.datetime_ import timestamp_micros
  from spiral.expressions import ExprLike
- from spiral.scan import Scan, ScanState
+ from spiral.scan import Scan
  from spiral.settings import Settings, settings

  if TYPE_CHECKING:
@@ -121,6 +121,7 @@ class Spiral:
  where = se.lift(where)

  return Scan(
+ self,
  self.core.scan(
  projection.__expr__,
  filter=where.__expr__ if where else None,
@@ -128,14 +129,6 @@
  ),
  )

- def load_scan(self, scan_state: ScanState) -> Scan:
- """Load a scan from a serialized scan state.
-
- Args:
- scan_state: The serialized scan state.
- """
- return Scan(self.core.load_scan(scan_state.core))
-
  # TODO(marko): This should be query, and search should be query + scan.
  def search(
  self,
@@ -70,6 +70,7 @@ class Scan:
  def scan_state(self) -> ScanState: ...
  def to_record_batches(
  self,
+ key_range: KeyRange | None = None,
  key_table: pa.Table | pa.RecordBatch | None = None,
  batch_readahead: int | None = None,
  ) -> pa.RecordBatchReader: ...
@@ -101,10 +102,18 @@ class Transaction:
  status: str

  def write(self, table: pa.RecordBatchReader, *, partition_size_bytes: int | None = None): ...
- def writeback(self, scan: Scan, *, partition_size_bytes: int | None = None, batch_readahead: int | None = None): ...
+ def writeback(
+ self,
+ scan: Scan,
+ *,
+ key_range: KeyRange | None = None,
+ partition_size_bytes: int | None = None,
+ batch_readahead: int | None = None,
+ ): ...
  def drop_columns(self, column_paths: list[str]): ...
  def take(self) -> list[Operation]: ...
  def include(self, ops: list[Operation]): ...
  def commit(self): ...
  def abort(self): ...
+ def is_empty(self) -> bool: ...
  def metrics(self) -> dict[str, Any]: ...
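
These stub changes (the `key_range` parameter on `writeback` and the new `is_empty`) are what the high-level `Transaction` wrapper in `spiral/transaction.py` further down exposes, and the pattern the new Dask enrichment tasks rely on. A hedged sketch; the table identifier is hypothetical and `sp.scan(table)` is assumed to be a valid scan over that table:

```python
from spiral import Spiral

sp = Spiral()
table = sp.table("my-project.my-dataset.docs")  # hypothetical identifier
scan = sp.scan(table)

# Restrict the writeback to a single shard's key range.
shard = scan.shards()[0]

tx = table.txn()
tx.writeback(scan, key_range=shard.key_range, batch_readahead=4)

# Commit only if the writeback actually produced operations.
if not tx.is_empty():
    tx.commit()
```
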
@@ -64,7 +64,9 @@ class ColumnGroupMetadata:

  class Operation:
  # Base class for all operations in the WAL.
- ...
+ def to_json(self) -> str: ...
+ @staticmethod
+ def from_json(json: str) -> Operation: ...

  class LogEntry:
  ts: int
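
`Operation` becoming JSON-serializable is what lets write-ahead-log operations cross process boundaries; the new `spiral/enrichment.py` below uses exactly this in `EnrichmentTaskResult.__getstate__`/`__setstate__`. A hedged round-trip sketch:

```python
from spiral.core.table.spec import Operation

def roundtrip(op: Operation) -> Operation:
    # Serialize a WAL operation to JSON and rebuild it, mirroring how
    # enrichment results are pickled across Dask workers.
    return Operation.from_json(op.to_json())
```
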
spiral/dataloader.py CHANGED
@@ -121,6 +121,7 @@ class SpiralDataLoader:
  # TODO(os): accept vortex arrays here instead of Arrow
  transform_fn: Callable[[pa.RecordBatch], Any] | None = None,
  map_workers: int = 0,
+ infinite: bool = False,
  ):
  """Initialize SpiralDataLoader.

@@ -145,6 +146,9 @@
  map_workers: Number of worker processes for parallel transform_fn
  application. 0 means single-process (no parallelism). Use this for
  CPU-bound transforms like tokenization or audio decoding.
+ infinite: Whether to cycle through the dataset infinitely. If True,
+ the dataloader will repeat the dataset indefinitely. If False,
+ the dataloader will stop after going through the dataset once.
  """
  self.scan = scan
  self.shards = shards if shards is not None else scan.shards()
@@ -157,6 +161,7 @@
  self.batch_readahead = batch_readahead
  self.transform_fn = transform_fn
  self.map_workers = map_workers
+ self.infinite = infinite

  self._samples_yielded = 0

@@ -176,7 +181,7 @@
  shuffle=shuffle,
  max_batch_size=self.batch_size,
  batch_readahead=self.batch_readahead,
- infinite=False,
+ infinite=self.infinite,
  )

  if self.skip_samples > 0:
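
The new `infinite` flag makes the loader cycle through the dataset instead of stopping after one pass. A hedged sketch; the table identifier is hypothetical, and passing `batch_size` to the constructor plus iterating the loader directly are assumed from the surrounding docstrings:

```python
from spiral import Spiral
from spiral.dataloader import SpiralDataLoader

sp = Spiral()
scan = sp.scan(sp.table("my-project.my-dataset.docs"))  # hypothetical identifier

# Repeat the dataset indefinitely, e.g. for step-based training loops.
loader = SpiralDataLoader(scan, batch_size=32, infinite=True)

for step, batch in zip(range(10_000), loader):
    ...  # train on batch
```
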
spiral/enrichment.py ADDED
@@ -0,0 +1,153 @@
+ import dataclasses
+ import logging
+ from functools import partial
+ from typing import TYPE_CHECKING, Optional
+
+ from spiral.core.client import Shard
+ from spiral.core.table.spec import Operation
+ from spiral.expressions import Expr
+
+ if TYPE_CHECKING:
+ from spiral import KeySpaceIndex, Table
+
+ logger = logging.getLogger(__name__)
+
+
+ class Enrichment:
+ """
+ An enrichment is used to derive new columns from the existing once, such as fetching data from object storage
+ with `se.s3.get` or compute embeddings. With column groups design supporting 100s of thousands of columns,
+ horizontally expanding tables are a powerful primitive.
+
+ NOTE: Spiral aims to optimize enrichments where source and destination table are the same.
+ """
+
+ def __init__(
+ self,
+ table: "Table",
+ projection: Expr,
+ where: Expr | None,
+ ):
+ self._table = table
+ self._projection = projection
+ self._where = where
+
+ @property
+ def table(self) -> "Table":
+ """The table to write back into."""
+ return self._table
+
+ @property
+ def projection(self) -> Expr:
+ """The projection expression."""
+ return self._projection
+
+ @property
+ def where(self) -> Expr | None:
+ """The filter expression."""
+ return self._where
+
+ def apply(self, *, batch_readahead: int | None = None, partition_size_bytes: int | None = None) -> None:
+ """Apply the enrichment onto the table in a streaming fashion.
+
+ For large tables, consider using `apply_dask` for distributed execution.
+ """
+ scan = self._table.spiral.scan(self._projection, where=self._where)
+
+ with self._table.txn() as txn:
+ txn.writeback(
+ scan,
+ partition_size_bytes=partition_size_bytes,
+ batch_readahead=batch_readahead,
+ )
+
+ # TODO(marko): Need to figure out this sharding with key space index in places.
+ # We could compute on-demand instead of requiring a resource.
+ def apply_dask(
+ self, *, index: Optional["KeySpaceIndex"] = None, partition_size_bytes: int | None = None, **kwargs
+ ) -> None:
+ """Use distributed Dask to apply the enrichment. Requires `dask[distributed]` to be installed.
+
+ If "address" of an existing Dask cluster is not provided in `kwargs`, a local cluster will be created.
+
+ IMPORTANT: Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
+ usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
+ encountering such issues, consider splitting the enrichment into UDF-only derivation that will be
+ executed in a streaming fashion, followed by a Dask enrichment for the rest of the computation.
+ If that is not possible, please reach out to the support for assistance.
+
+ Args:
+ index: Optional key space index to use for sharding the enrichment.
+ If not provided, the table's default sharding will be used.
+ **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`
+ such as `address` to connect to an existing cluster.
+ """
+ try:
+ from dask.distributed import Client
+ except ImportError:
+ raise ImportError("dask is not installed, please install dask[distributed] to use this feature.")
+
+ # Connect before doing any work.
+ dask_client = Client(**kwargs)
+
+ # Start a transaction BEFORE the planning scan.
+ tx = self._table.txn()
+ plan_scan = self._table.spiral.scan(self._projection, where=self._where)
+
+ # Determine the "tasks". Use the index if provided.
+ shards = plan_scan.shards()
+ if index is not None:
+ # TODO(marko): This will use index's asof automatically.
+ shards = self._table.spiral.internal.compute_shards(index.core)
+
+ # Partially bind the enrichment function.
+ _compute = partial(
+ _enrichment_task,
+ settings_dict=self._table.spiral.config.model_dump(),
+ state_json=plan_scan.core.scan_state().to_json(),
+ output_table_id=self._table.table_id,
+ partition_size_bytes=partition_size_bytes,
+ )
+ enrichments = dask_client.map(_compute, shards)
+
+ logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {dask_client.dashboard_link}")
+ for result in dask_client.gather(enrichments):
+ result: EnrichmentTaskResult
+ tx.include(result.ops)
+
+ if tx.is_empty():
+ logger.warning("Transaction not committed. No rows were read for enrichment.")
+ return
+
+ tx.commit()
+
+
+ @dataclasses.dataclass
+ class EnrichmentTaskResult:
+ ops: list[Operation]
+
+ def __getstate__(self):
+ return {"ops": [op.to_json() for op in self.ops]}
+
+ def __setstate__(self, state):
+ self.ops = [Operation.from_json(op_json) for op_json in state["ops"]]
+
+
+ # NOTE(marko): This function must be picklable!
+ def _enrichment_task(
+ shard: Shard, *, settings_dict, state_json, output_table_id, partition_size_bytes: int | None
+ ) -> EnrichmentTaskResult:
+ # Returns operations that can be included in a transaction.
+ from spiral import Scan, Spiral
+ from spiral.core.table import ScanState
+ from spiral.settings import Settings
+
+ settings: Settings = Settings.model_validate(settings_dict)
+ sp = Spiral(config=settings)
+ state = ScanState.from_json(state_json)
+ task_scan = Scan(sp, sp.core.load_scan(state))
+ table = sp.table(output_table_id)
+
+ task_tx = table.txn()
+ task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
+ return EnrichmentTaskResult(ops=task_tx.take())
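
Together with the new `Table.enrich` (see `spiral/table.py` further down), the intended flow looks roughly like this hedged sketch; the table identifier, the `url` column, and the dict-style projection are hypothetical:

```python
from spiral import Spiral
from spiral import expressions as se

sp = Spiral()
table = sp.table("my-project.my-dataset.docs")  # hypothetical identifier

# Derive a new column by fetching the bytes behind a URL column.
enrichment = table.enrich({"content": se.http.get(table["url"])})

enrichment.apply()  # streaming, in-process
# enrichment.apply_dask(address="tcp://scheduler:8786")  # or on an existing Dask cluster
```
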
@@ -8,7 +8,9 @@ import pyarrow as pa

  from spiral import _lib, arrow_

+ from . import http as http
  from . import list_ as list
+ from . import s3 as s3
  from . import str_ as str
  from . import struct as struct
  from . import text as text
@@ -47,6 +49,7 @@ __all__ = [
  "xor",
  "text",
  "s3",
+ "http",
  "UDF",
  ]

@@ -0,0 +1,16 @@
+ from spiral import _lib
+ from spiral.expressions.base import Expr, ExprLike
+
+
+ def get(expr: ExprLike) -> Expr:
+ """Read data from the URL.
+
+ Args:
+ expr: URLs of the data that needs to be read.
+ """
+ from spiral import expressions as se
+
+ expr = se.lift(expr)
+
+ # This just works :)
+ return Expr(_lib.expr.s3.get(expr.__expr__))
spiral/expressions/s3.py CHANGED
@@ -11,8 +11,5 @@ def get(expr: ExprLike) -> Expr:
  from spiral import expressions as se

  expr = se.lift(expr)
- return Expr(
- _lib.expr.s3.get(
- expr.__expr__,
- )
- )
+
+ return Expr(_lib.expr.s3.get(expr.__expr__))
spiral/expressions/udf.py CHANGED
@@ -46,7 +46,12 @@ class UDF(abc.ABC):

  @abc.abstractmethod
  def return_type(self, scope: pa.DataType) -> pa.DataType:
- """Must return the return type of the UDF given the input scope type."""
+ """Must return the return type of the UDF given the input scope type.
+
+ IMPORTANT: All expressions in Spiral must return nullable (Arrow default) types,
+ including nested structs, meaning that all fields in structs must also be nullable,
+ and if those fields are structs, their fields must also be nullable, and so on.
+ """
  ...

  @abc.abstractmethod
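
The expanded docstring is the substantive change: whatever `return_type` reports must be nullable all the way down. A hedged illustration using plain `pyarrow`, no Spiral APIs involved:

```python
import pyarrow as pa

# Conforming: every field, at every nesting level, is nullable (Arrow's default).
ok_type = pa.struct([
    pa.field("embedding", pa.list_(pa.float32()), nullable=True),
    pa.field("meta", pa.struct([pa.field("model", pa.string(), nullable=True)]), nullable=True),
])

# What the docstring warns against: a non-nullable field in the returned type.
bad_type = pa.struct([pa.field("embedding", pa.list_(pa.float32()), nullable=False)])
```
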
spiral/scan.py CHANGED
@@ -1,10 +1,11 @@
+ from functools import partial
  from typing import TYPE_CHECKING, Any, Optional

  import pyarrow as pa

  from spiral.core.client import Shard, ShuffleConfig
+ from spiral.core.table import KeyRange
  from spiral.core.table import Scan as CoreScan
- from spiral.core.table import ScanState as CoreScanState
  from spiral.core.table.spec import Schema
  from spiral.settings import CI, DEV

@@ -16,37 +17,17 @@ if TYPE_CHECKING:
  import streaming  # noqa
  import torch.utils.data as torchdata  # noqa

+ from spiral.client import Spiral
  from spiral.dataloader import SpiralDataLoader, World  # noqa


- class ScanState:
- """
- Evaluated properties of the scan
- """
-
- __slots__ = ("core",)
-
- def __init__(self, core: CoreScanState):
- self.core = core
-
- def __getstate__(self):
- return self.core.to_json()
-
- def __setstate__(self, state):
- self.core = CoreScanState.from_json(state)
-
-
  class Scan:
  """Scan object."""

- def __init__(self, core: CoreScan):
+ def __init__(self, spiral: "Spiral", core: CoreScan):
+ self.spiral = spiral
  self.core = core

- @property
- def scan_state(self) -> ScanState:
- """Returns evaluated properties of the scan."""
- return ScanState(self.core.scan_state())
-
  @property
  def metrics(self) -> dict[str, Any]:
  """Returns metrics about the scan."""
@@ -72,6 +53,8 @@

  def to_record_batches(
  self,
+ *,
+ key_range: KeyRange | None = None,
  key_table: pa.Table | pa.RecordBatchReader | None = None,
  batch_size: int | None = None,
  batch_readahead: int | None = None,
@@ -79,6 +62,9 @@
  """Read as a stream of RecordBatches.

  Args:
+ key_range: Optional key range to filter the scan.
+ If provided, the scan will only return rows within the key range.
+ Only one of key_range or key_table can be provided.
  key_table: a table of keys to "take" (including aux columns for cell-push-down).
  If None, the scan will be executed without a key table.
  batch_size: the maximum number of rows per returned batch.
@@ -86,6 +72,9 @@
  RecordBatchReader, the batch_size argument must be None, and the existing batching is respected.
  batch_readahead: the number of batches to prefetch in the background.
  """
+ if key_range is not None and key_table is not None:
+ raise ValueError("Only one of key_range or key_table can be provided.")
+
  if isinstance(key_table, pa.RecordBatchReader):
  if batch_size is not None:
  raise ValueError(
@@ -94,46 +83,54 @@
  elif isinstance(key_table, pa.Table):
  key_table = key_table.to_reader(max_chunksize=batch_size)

- return self.core.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
+ return self.core.to_record_batches(key_range=key_range, key_table=key_table, batch_readahead=batch_readahead)

  def to_table(
  self,
+ *,
+ key_range: KeyRange | None = None,
  key_table: pa.Table | pa.RecordBatchReader | None = None,
  ) -> pa.Table:
  """Read into a single PyArrow Table.

  Args:
+ key_range: Optional key range to filter the scan.
+ If provided, the scan will only return rows within the key range.
+ Only one of key_range or key_table can be provided.
  key_table: a table of keys to "take" (including aux columns for cell-push-down).
  If None, the scan will be executed without a key table.
  """
  # NOTE: Evaluates fully on Rust side which improved debuggability.
- if DEV and not CI and key_table is None:
+ if DEV and not CI and key_table is None and key_range is None:
  rb = self.core.to_record_batch()
  return pa.Table.from_batches([rb])

- return self.to_record_batches(key_table=key_table).read_all()
+ return self.to_record_batches(key_range=key_range, key_table=key_table).read_all()

  def to_dask(self) -> "dd.DataFrame":
  """Read into a Dask DataFrame.

  Requires the `dask` package to be installed.
+
+ IMPORTANT: Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
+ usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
+ encountering such issues, please reach out to the support for assistance.
  """
  import dask.dataframe as dd
- import pandas as pd

- def _read_shard(shard: Shard) -> pd.DataFrame:
- # TODO(ngates): we need a way to preserve the existing asofs?
- raise NotImplementedError()
-
- # Fetch a set of partition ranges
+ _read_shard = partial(
+ _read_shard_task,
+ settings_dict=self.spiral.config.model_dump(),
+ state_json=self.core.scan_state().to_json(),
+ )
  return dd.from_map(_read_shard, self.shards())

- def to_pandas(self) -> "pd.DataFrame":
+ def to_pandas(self, *, key_range: KeyRange | None = None) -> "pd.DataFrame":
  """Read into a Pandas DataFrame.

  Requires the `pandas` package to be installed.
  """
- return self.to_table().to_pandas()
+ return self.to_table(key_range=key_range).to_pandas()

  def to_polars(self) -> "pl.DataFrame":
  """Read into a Polars DataFrame.
@@ -188,16 +185,18 @@

  Returns:
  SpiralDataLoader with shards partitioned for this rank.
- """
- # Example usage:
- #
- # Auto-detect from PyTorch distributed:
- # loader: SpiralDataLoader = scan.to_distributed_data_loader(batch_size=32)
- #
- # Explicit world configuration:
- # world = World(rank=0, world_size=4)
- # loader: SpiralDataLoader = scan.to_distributed_data_loader(world=world, batch_size=32)

+ Auto-detect from PyTorch distributed:
+ ```python
+ loader: SpiralDataLoader = scan.to_distributed_data_loader(batch_size=32)
+ ```
+
+ Explicit world configuration:
+ ```python
+ world = World(rank=0, world_size=4)
+ loader: SpiralDataLoader = scan.to_distributed_data_loader(world=world, batch_size=32)
+ ```
+ """
  from spiral.dataloader import SpiralDataLoader, World

  if world is None:
@@ -231,19 +230,21 @@

  Returns:
  New SpiralDataLoader instance configured to resume from the checkpoint.
+
+ Save checkpoint during training:
+ ```python
+ loader = scan.to_distributed_data_loader(batch_size=32, seed=42)
+ checkpoint = loader.state_dict()
+ ```
+
+ Resume later - uses same shards from checkpoint:
+ ```python
+ resumed_loader = scan.resume_data_loader(
+ checkpoint,
+ batch_size=32,
+ transform_fn=my_transform,
+ )
  """
- # Example usage:
-
- # Save checkpoint during training:
- # loader = scan.to_distributed_data_loader(batch_size=32, seed=42)
- # checkpoint = loader.state_dict()
-
- # Resume later - uses same shards from checkpoint:
- # resumed_loader = scan.resume_data_loader(
- # checkpoint,
- # batch_size=32,
- # transform_fn=my_transform,
- # )
  from spiral.dataloader import SpiralDataLoader

  return SpiralDataLoader.from_state_dict(self, state, **kwargs)
@@ -311,3 +312,17 @@
  from spiral.debug.metrics import display_metrics

  display_metrics(self.metrics)
+
+
+ # NOTE(marko): This function must be picklable!
+ def _read_shard_task(shard: Shard, *, settings_dict, state_json) -> "pd.DataFrame":
+ from spiral import Spiral
+ from spiral.core.table import ScanState
+ from spiral.settings import Settings
+
+ settings: Settings = Settings.model_validate(settings_dict)
+ sp = Spiral(config=settings)
+ state = ScanState.from_json(state_json)
+ task_scan = Scan(sp, sp.core.load_scan(state))
+
+ return task_scan.to_pandas(key_range=shard.key_range)
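
The thread running through these scan changes: a scan can now be evaluated for a single key range, which is how `to_dask` and the Dask enrichment fan shards out to workers. A hedged local sketch (hypothetical table identifier; `Shard.key_range` used the same way as in `_read_shard_task` above):

```python
from spiral import Spiral

sp = Spiral()
scan = sp.scan(sp.table("my-project.my-dataset.docs"))  # hypothetical identifier

shards = scan.shards()

# Evaluate the scan for the first shard's key range only.
df = scan.to_pandas(key_range=shards[0].key_range)
```
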
spiral/settings.py CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
  from typing import TYPE_CHECKING, Annotated

  import typer
- from pydantic import Field, ValidatorFunctionWrapHandler, WrapValidator
+ from pydantic import Field, PlainSerializer, ValidatorFunctionWrapHandler, WrapValidator
  from pydantic_settings import (
  BaseSettings,
  InitSettingsSource,
@@ -28,13 +28,16 @@ PACKAGE_NAME = "pyspiral"


  def validate_token(v, handler: ValidatorFunctionWrapHandler):
- if isinstance(v, str):
- return Token(v)
- else:
- raise ValueError("Token value must be a string")
+ if not isinstance(v, str):
+ raise ValueError("Token value (SPIRAL__SPIRALDB__TOKEN) must be a string")
+ return Token(v)


- TokenType = Annotated[Token, WrapValidator(validate_token)]
+ TokenType = Annotated[
+ Token,
+ WrapValidator(validate_token),
+ PlainSerializer(lambda token: token.expose_secret(), return_type=str),
+ ]


  class SpiralDBSettings(BaseSettings):
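
The `PlainSerializer` is what makes the settings round-trippable: `model_dump()` now emits the token as its underlying string, so a worker can rebuild a client from a plain dict. A hedged sketch mirroring `_read_shard_task` and `_enrichment_task`:

```python
from spiral import Spiral
from spiral.settings import Settings

sp = Spiral()
settings_dict = sp.config.model_dump()              # token serialized via PlainSerializer
restored = Settings.model_validate(settings_dict)   # what a remote worker reconstructs
worker_client = Spiral(config=restored)
```
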
spiral/table.py CHANGED
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any

  from spiral.core.table import Table as CoreTable
  from spiral.core.table.spec import Schema
+ from spiral.enrichment import Enrichment
  from spiral.expressions.base import Expr, ExprLike
  from spiral.settings import settings
  from spiral.snapshot import Snapshot
@@ -12,13 +13,11 @@ if TYPE_CHECKING:
  import duckdb
  import polars as pl
  import pyarrow.dataset as ds
- import streaming
- import torch.utils.data as torchdata  # noqa

  from spiral.client import Spiral
  from spiral.dataloader import SpiralDataLoader
  from spiral.key_space_index import KeySpaceIndex
- from spiral.scan import Scan
+ from spiral.streaming_ import SpiralStream


  class Table(Expr):
@@ -51,6 +50,14 @@
  """Returns the fully qualified identifier of the table."""
  return self._identifier or self.table_id

+ @property
+ def project(self) -> str | None:
+ """Returns the project of the table."""
+ if self._identifier is None:
+ return None
+ project, _, _ = self._identifier.split(".")
+ return project
+
  @property
  def dataset(self) -> str | None:
  """Returns the dataset of the table."""
@@ -111,24 +118,29 @@
  partition_size_bytes=partition_size_bytes,
  )

- def writeback(
+ def enrich(
  self,
- scan: "Scan",
- *,
- partition_size_bytes: int | None = None,
- ) -> None:
- """Write back the results of a scan to the table.
+ *projections: ExprLike,
+ where: ExprLike | None = None,
+ ) -> Enrichment:
+ """Returns an Enrichment object that, when applied, produces new columns.

- :param scan: The scan to write back.
- The scan does NOT need to be over the same table as transaction,
- but it does need to have the same key schema.
- :param partition_size_bytes: The maximum partition size in bytes.
+ Enrichment can be applied in different ways, e.g. distributed.
+
+ :param projections: Projection expressions deriving new columns to write back.
+ Expressions can be over multiple Spiral tables, but all tables including
+ this one must share the same key schema.
+ :param where: Optional filter expression to apply when reading the input tables.
  """
- with self.txn() as txn:
- txn.writeback(
- scan,
- partition_size_bytes=partition_size_bytes,
- )
+ from spiral import expressions as se
+
+ # Combine table with all projections into a single struct.
+ # The table is included to ensure key columns are present in the scan output.
+ projection = se.merge(self, *projections)
+ if where is not None:
+ where = se.lift(where)
+
+ return Enrichment(self, projection, where)

  def drop_columns(self, column_paths: list[str]) -> None:
  """
@@ -275,7 +287,7 @@
  projection: Expr | None = None,
  cache_dir: str | None = None,
  shard_row_block_size: int | None = None,
- ) -> "streaming.Stream":
+ ) -> "SpiralStream":
  """Returns a stream to be used with MosaicML's StreamingDataset.

  Requires `streaming` package to be installed.
@@ -310,4 +322,4 @@
  shards=shards,
  cache_dir=cache_dir,
  shard_row_block_size=shard_row_block_size,
- )  # type: ignore[return-value]
+ )
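
`Table.writeback` is gone; the equivalent in 0.6.13 goes through a transaction (which is also what `Enrichment.apply` wraps). A hedged migration sketch; `table` and `scan` stand in for whatever you previously wrote back, and the identifier is hypothetical:

```python
from spiral import Spiral

sp = Spiral()
table = sp.table("my-project.my-dataset.docs")  # hypothetical identifier
scan = sp.scan(table)                           # placeholder for the scan being written back

# 0.6.12 (removed): table.writeback(scan, partition_size_bytes=64 << 20)
# 0.6.13 equivalent:
with table.txn() as txn:
    txn.writeback(scan, partition_size_bytes=64 << 20)
```
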
spiral/transaction.py CHANGED
@@ -1,3 +1,4 @@
+ from spiral.core.table import KeyRange
  from spiral.core.table import Transaction as CoreTransaction
  from spiral.core.table.spec import Operation
  from spiral.expressions.base import ExprLike
@@ -19,6 +20,10 @@ class Transaction:
  """The status of the transaction."""
  return self._core.status

+ def is_empty(self) -> bool:
+ """Check if the transaction has no operations."""
+ return self._core.is_empty()
+
  def __enter__(self):
  return self

@@ -41,16 +46,26 @@

  self._core.write(record_batches, partition_size_bytes=partition_size_bytes)

- def writeback(self, scan: Scan, *, partition_size_bytes: int | None = None):
+ def writeback(
+ self,
+ scan: Scan,
+ *,
+ key_range: KeyRange | None = None,
+ partition_size_bytes: int | None = None,
+ batch_readahead: int | None = None,
+ ):
  """Write back the results of a scan to the table.

  :param scan: The scan to write back.
  The scan does NOT need to be over the same table as transaction,
  but it does need to have the same key schema.
+ :param key_range: Optional key range to limit the writeback to.
  :param partition_size_bytes: The maximum partition size in bytes.
- If not provided, the default partition size is used.
+ :param batch_readahead: The number of batches to read ahead when evaluating the scan.
  """
- self._core.writeback(scan.core, partition_size_bytes=partition_size_bytes)
+ self._core.writeback(
+ scan.core, key_range=key_range, partition_size_bytes=partition_size_bytes, batch_readahead=batch_readahead
+ )

  def drop_columns(self, column_paths: list[str]):
  """