pyspiral-0.6.12-cp312-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → pyspiral-0.6.13-cp312-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pyspiral
- Version: 0.6.12
+ Version: 0.6.13
  Classifier: Intended Audience :: Science/Research
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python
@@ -31,15 +31,18 @@ Requires-Dist: typer>=0.16
  Requires-Dist: xxhash>=3.4.1
  Requires-Dist: polars>=1.31.0 ; extra == 'polars'
  Requires-Dist: duckdb>=1.3.2 ; extra == 'duckdb'
- Requires-Dist: datasets>=4.0.0 ; extra == 'datasets'
- Requires-Dist: pyiceberg>=0.9.1 ; extra == 'pyiceberg'
+ Requires-Dist: pyiceberg[s3fs]>=0.9.1 ; extra == 'iceberg'
+ Requires-Dist: datasets>=4.0.0 ; extra == 'huggingface'
  Requires-Dist: mosaicml-streaming>=0.13.0 ; extra == 'streaming'
  Requires-Dist: vortex-data>=0.52.1 ; extra == 'streaming'
+ Requires-Dist: dask>=2025.10.0 ; extra == 'dask'
+ Requires-Dist: distributed>=2025.10.0 ; extra == 'dask'
  Provides-Extra: polars
  Provides-Extra: duckdb
- Provides-Extra: datasets
- Provides-Extra: pyiceberg
+ Provides-Extra: iceberg
+ Provides-Extra: huggingface
  Provides-Extra: streaming
+ Provides-Extra: dask
  Summary: Python client for Spiral.
  Home-Page: https://spiraldb.com
  Author-email: SpiralDB <hello@spiraldb.com>
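
Note on the metadata change: the optional extras were renamed in this release — `datasets` is now `huggingface`, `pyiceberg` is now `iceberg` (and pulls in `pyiceberg[s3fs]`), and a new `dask` extra adds `dask` and `distributed`. A minimal, hedged way to check which extras an installed wheel advertises, using only the standard library:

```python
from importlib.metadata import metadata

# On 0.6.13 this should list "polars", "duckdb", "iceberg", "huggingface",
# "streaming", and "dask".
print(metadata("pyspiral").get_all("Provides-Extra"))
```
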
@@ -1,8 +1,8 @@
- pyspiral-0.6.12.dist-info/METADATA,sha256=ANXjtdzd8s_zdWLd-mTm0X07pWbjlpkjQP8X5yP4qpY,1843
- pyspiral-0.6.12.dist-info/WHEEL,sha256=0ecHyBdkJfSXYIVmWsPh7S-4h4fSrB4FlXhlnIu9c_A,130
- pyspiral-0.6.12.dist-info/entry_points.txt,sha256=uft7u-a6g40NLt4Q6BleWbK4NY0M8nZuYPpP8DV0EOk,45
- spiral/__init__.py,sha256=n4JNLrO3wyw_k_U_JKyNiGON0wEpfvqxDhDdB2P6dhM,1007
- spiral/_lib.abi3.so,sha256=XaAqnKj8sXWK4OjxsLtR-hAI2hAw523pL2aTyBrF_Is,61055320
+ pyspiral-0.6.13.dist-info/METADATA,sha256=AvViHjB1v9OqYTKONYf_DbfCB0HktAy-numkhQhuq20,1977
+ pyspiral-0.6.13.dist-info/WHEEL,sha256=0ecHyBdkJfSXYIVmWsPh7S-4h4fSrB4FlXhlnIu9c_A,130
+ pyspiral-0.6.13.dist-info/entry_points.txt,sha256=uft7u-a6g40NLt4Q6BleWbK4NY0M8nZuYPpP8DV0EOk,45
+ spiral/__init__.py,sha256=gAysTwG_oEeKVMdCOfOzDhl0bM2miiK8Ds2vvUihBWw,1153
+ spiral/_lib.abi3.so,sha256=ciM05gN0vs_C6SCqv5LmppQZOZkZcpSb8unZAFc9oXA,61221816
  spiral/adbc.py,sha256=7IxfWIeQN-fh0W5OdN_PP2x3pzQYg6ZUOLsHg3jktqw,14842
  spiral/api/__init__.py,sha256=ULBlVq3PnfNOO6T5naE_ULmmii-83--qTuN2PpAUQN0,2241
  spiral/api/admin.py,sha256=A1iVR1XYJSObZivPAD5UzmPuMgupXc9kaHNYYa_kwfs,585
@@ -23,8 +23,8 @@ spiral/cli/admin.py,sha256=-ubYqs8nKjnQStbQ68jpWx_9xh0TsaxI0wM1Hfko8_U,319
  spiral/cli/app.py,sha256=smzGj5a2RwhM9RQChmlEeKZLN4Fk60-bP7Lm5_Is1Rw,2760
  spiral/cli/console.py,sha256=6JHbAQV6MFWz3P-VzqPOjhHpkIQagsCdzTMvmuDKMkU,2580
  spiral/cli/fs.py,sha256=vaPcSc2YghhHeipxNitIdsHaBhFwlwkvPFqYsFSN9P0,2927
- spiral/cli/iceberg.py,sha256=Q14tcGcn1LixbFCYP0GhfYwFFXTmmi8tqBPYwalJEyE,3248
- spiral/cli/key_spaces.py,sha256=TF1tbRnrjemp4aMAbLc7o4_jPChIumaQGPuvfW0sR5o,2945
+ spiral/cli/iceberg.py,sha256=wdMyl0j821MLnXNZ6Kwm65ogh98C-pjMJm3Y6YqlnTI,3249
+ spiral/cli/key_spaces.py,sha256=Xaw7WH-Qw_j6AxisdIoKfjAgVRXLM9qBFzuCTjPAFLI,3516
  spiral/cli/login.py,sha256=2tw6uN5rEpiMMAmjQSB3-JUPf3C0Wc1eTGCDxhYtJps,731
  spiral/cli/orgs.py,sha256=fmOuLxpeIFfKqePRi292Gv9k-EF5pPn_tbKd2BLl2Ig,2869
  spiral/cli/printer.py,sha256=aosc763hDFgoXJGkiANmNyO3kAsecAS1JWgjEhn8GCM,1784
@@ -35,7 +35,7 @@ spiral/cli/telemetry.py,sha256=Uxo1Q1FkKJ6n6QNGOUmL3j_pRRWRx0qWIhoP-U9BuR0,589
  spiral/cli/text.py,sha256=DlWGe4JrkdERAiqyITNpk91Wqb63Re99rNYlIFsIamc,4031
  spiral/cli/types.py,sha256=XYzo1GgX7dBBItoBSrHI4vO5C2lLmS2sktb-2GnGH3E,1362
  spiral/cli/workloads.py,sha256=2_SLfQTFN6y73R9H0i9dk8VIOVagKxSxOpHXC56yptY,2015
- spiral/client.py,sha256=pw6vB85oLVbBudc_HRzmLCItcecsTjNM5SMu_kVOMCo,6568
+ spiral/client.py,sha256=zMp-xXGL4R1Py_rYrC5o3jFLam1oA74azi50dvMP-_o,6329
  spiral/core/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  spiral/core/_tools/__init__.pyi,sha256=b2KLfTOQ67pjfbYt07o0IGiTu5o2bZw69lllV8v0Dps,143
  spiral/core/authn/__init__.pyi,sha256=z_GWyIS62fuiYQrYO8hzw4W8oGaiciqS1u5qtAt54VY,769
@@ -49,26 +49,28 @@ spiral/core/expr/struct_/__init__.pyi,sha256=MXckd98eV_x3X0RhEWvlkA3DcDXRtLs5pNn
  spiral/core/expr/text/__init__.pyi,sha256=ed83n1xcsGY7_QDhMmJGnSQ20UrJFXcdv1AveSEcS1c,175
  spiral/core/expr/udf/__init__.pyi,sha256=zsZs081KVhY3-1JidqTkWMW81Qd_ScoTGZvasIhIK-4,358
  spiral/core/expr/video/__init__.pyi,sha256=nQJEcSsigZuRpMjkI_O4EEtMK_n2zRvorcL_KEeD5vU,95
- spiral/core/table/__init__.pyi,sha256=HN4ag8E1QDF_VgekJZqjhuQLhorU3ivjIOBHai2OEVc,3672
+ spiral/core/table/__init__.pyi,sha256=YBL12_JPTWz2mNbqlDqbT1exxVJYzwfXdHCi6Z37JxA,3841
  spiral/core/table/manifests/__init__.pyi,sha256=eVfDpmhYSjafIvvALqAkZe5baN3Y1HpKpxYEbjwd4gQ,1043
  spiral/core/table/metastore/__init__.pyi,sha256=rc3u9MwEKRvL2kxOc8lBorddFRnM8o_o1frqtae86a4,1697
- spiral/core/table/spec/__init__.pyi,sha256=PgacM_fZmkHuplj7IbYrj5KfFI3-VPYnyuzI2w7A70Y,5717
- spiral/dataloader.py,sha256=2haLoI6KLrzXfPozAgEa-eCOSDsNldJ1qwCmFpNMyTQ,10281
+ spiral/core/table/spec/__init__.pyi,sha256=twzX4vFmgBxInZWq_nyP6DR9OQjjOVrbZMn97kndeS8,5808
+ spiral/dataloader.py,sha256=W9siY4BF4p_rwTTSS4KgsaQsPLxxza6XmQhrdBzzMJ8,10592
  spiral/dataset.py,sha256=PMLoXnXuEUciP6-NXqTmQLXu0UIH7OcC4-iZtY_iuO8,7973
  spiral/datetime_.py,sha256=elXaUWtZuuLVcu9E0aXnvYRPB9XWqZbLDToozQYQYjU,950
  spiral/debug/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  spiral/debug/manifests.py,sha256=7f1O3ba9mrA5nXpOF9cEIQuUAteP5wiBkFy_diQJ7No,3216
  spiral/debug/metrics.py,sha256=XdRDcjggtsLNGCAjam6IxG9072pz_d2C8iLApNRFUtk,2044
  spiral/debug/scan.py,sha256=UEm_aRnql5pwDPTpZgakMLNjlzkKL4RurBFFqH_BLAQ,9526
- spiral/expressions/__init__.py,sha256=UNxK5qQNrl-BuHsjKcWDj35w5lJviLkGFUQj8OhLID0,7919
+ spiral/enrichment.py,sha256=e2yzNWTTG73uEkLTc4ccTNRQ94cBtM04eGzlJ2-kBOI,5851
+ spiral/expressions/__init__.py,sha256=Fp7Xx3exh9KJad92tgd_TGGIpYLQTHqWjW-pexzQibU,7981
  spiral/expressions/base.py,sha256=PvhJkcUSsPSIaxirHVzM9zlqyBXiaiia1HXohXdOmL4,5377
+ spiral/expressions/http.py,sha256=WfHVLqz_LjBr78mN3ARBRQqgBrkao7-S73JxjC4Xwvo,356
  spiral/expressions/list_.py,sha256=MMt5lf5H1M3O-x6N_PvqOLGq9NOk6Ukv0fPWwPC_uy4,1809
- spiral/expressions/s3.py,sha256=D-kuLifIEY314Q8rB2-ZP8U-IT0FywtbJDMuyusBKiQ,414
+ spiral/expressions/s3.py,sha256=bkd0HANerNKlOblp2z7JJOSWjF9Bw9lZe1A-KTrUEgk,378
  spiral/expressions/str_.py,sha256=tY8RXW3JWvr1-bEfCZtk5FAf11wKJnXPuA9EoeJ9tA4,1265
  spiral/expressions/struct.py,sha256=pGAnCDh6AK0BK1XfZ1qG4ce4ranIQEE1HQsgmzBcfwQ,2038
  spiral/expressions/text.py,sha256=-02gBWYoyNQ3qQ1--9HTa8IryUDojYQVIp8C7rgnOWQ,1893
  spiral/expressions/tiff.py,sha256=4dngO97bT1QY0By7-PxOQVmSwQC3PQAiixVhLJ-4HMQ,7986
- spiral/expressions/udf.py,sha256=yvZCuGK9S9Sa9I18h-apUxsDni2B7E9WEqPrxHBjUWE,1657
+ spiral/expressions/udf.py,sha256=XOxa7Kocb4Cg4q_qFvRT6hVnVzi22CQenqrvS-TL-VY,1936
  spiral/grpc_.py,sha256=f3czdP1Mxme42Y5--a5ogYq1TTiWn-J_MlGjwJ2mWwM,1015
  spiral/iceberg.py,sha256=JGq62Qnf296r9_hRAoH85GQq45-uSBjwXWw_CvPi6G4,930
  spiral/iterable_dataset.py,sha256=Eekg9ad8tcwXcloHWReBbvCSr5ZappRHn2ldKTvwqS0,4622
@@ -91,16 +93,16 @@ spiral/protogen/_/substrait/extensions/__init__.py,sha256=nhnEnho70GAT8WPj2xtwJU
  spiral/protogen/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  spiral/protogen/util.py,sha256=smnvVo6nYH3FfDm9jqhNLaXz4bbTBaQezHQDCTvZyiQ,1486
  spiral/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- spiral/scan.py,sha256=fXZq0NL9YApt-UwkGpaT0ETn-rK-1_tltq7nqsImZI4,11199
+ spiral/scan.py,sha256=csbk5ePbU-RlEVIF7isccF2zRBB8L8ZY_HEpalMjgLY,12340
  spiral/server.py,sha256=ztBmB5lBnUz-smQxR_tC8AI5SOhz17wH0MI3GuzDUdM,600
- spiral/settings.py,sha256=JRQSwjJyNaCqQdQLxiqB_O_LZRQXMLyshJBrI2LZHwM,3113
+ spiral/settings.py,sha256=sUhMMBCXaPvUYztN_gztD9TjeUYJwVeEcJrq4FLy6M0,3232
  spiral/snapshot.py,sha256=cTobi5jtiANxalGA-isokQHblNmXGtuUvgUGGNVybsI,1555
  spiral/streaming_/__init__.py,sha256=s7MlW2ERsuZmZGExLFL6RcZon2e0tNBocBg5ANgki7k,61
  spiral/streaming_/reader.py,sha256=tl_lC9xgh1-QFhsZn4xQT7It3PVTzHCEUT2BG2dWBRQ,4166
  spiral/streaming_/stream.py,sha256=DM1hBDHnWm1ZFKZ-hZ4zxeSXITcUI6kWzwdJZvywI8o,5915
  spiral/substrait_.py,sha256=AKeOD4KIXvz2J4TYxnIneOiHddtBIyOhuNxVO_uH0eg,12592
- spiral/table.py,sha256=dwQr1EAACbfxG8fISFqRrUEAE2P2y6xsx0vFK9Gwyfc,11662
+ spiral/table.py,sha256=prjDBcm6Qerdq3ypXzfbXb7ngAcO0j-Z9aTeZvzKoqs,12209
  spiral/text_index.py,sha256=FQ9rgIEGLSJryS9lFdMhKtPFey18BXoWbPXyvZPJJ04,442
- spiral/transaction.py,sha256=M_Tf-TijVBluuInWk6XSFNCR2dKN4S9EdsHM3QD20ng,2948
+ spiral/transaction.py,sha256=hQm6DfCklMDpIYJ9qA2wR45cCuUPGCiJy1tHGE3AsEY,3418
  spiral/types_.py,sha256=W_jyO7F6rpPiH69jhgSgV7OxQZbOlb1Ho3InpKUP6Eo,155
- pyspiral-0.6.12.dist-info/RECORD,,
+ pyspiral-0.6.13.dist-info/RECORD,,
spiral/__init__.py CHANGED
@@ -1,14 +1,18 @@
  """Python client for Spiral"""

+ import importlib
+
  # This is here to make sure we load the native extension first
  from spiral import _lib

  # Eagerly import the Spiral library
  assert _lib, "Spiral library"

+
  from spiral.client import Spiral  # noqa: E402
  from spiral.core.client import Shard, ShuffleConfig  # noqa: E402
  from spiral.dataloader import SpiralDataLoader, World  # noqa: E402
+ from spiral.enrichment import Enrichment  # noqa: E402
  from spiral.iceberg import Iceberg  # noqa: E402
  from spiral.key_space_index import KeySpaceIndex  # noqa: E402
  from spiral.project import Project  # noqa: E402
@@ -24,6 +28,7 @@ __all__ = [
  "Table",
  "Snapshot",
  "Transaction",
+ "Enrichment",
  "Scan",
  "Shard",
  "ShuffleConfig",
@@ -33,3 +38,5 @@ __all__ = [
  "World",
  "Iceberg",
  ]
+
+ __version__ = importlib.metadata.version("pyspiral")
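
The last hunk also starts exporting the installed distribution version at module level. A minimal check, assuming the 0.6.13 wheel is installed:

```python
import spiral

# Resolved from package metadata at import time (new in 0.6.13).
print(spiral.__version__)
```
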
spiral/_lib.abi3.so CHANGED
Binary file
spiral/cli/iceberg.py CHANGED
@@ -8,7 +8,7 @@ from typer import Argument
  from spiral.cli import CONSOLE, ERR_CONSOLE, AsyncTyper, state
  from spiral.cli.types import ProjectArg

- app = AsyncTyper(short_help="Apache Iceberg Catalog")
+ app = AsyncTyper(short_help="Apache Iceberg Catalog.")


  @app.command(help="List namespaces.")
spiral/cli/key_spaces.py CHANGED
@@ -87,3 +87,17 @@ def sync(
  index_id = get_index_id(project, name)
  response = state.spiral.api.key_space_indexes.sync_index(index_id, SyncIndexRequest(resources=resources))
  CONSOLE.print(f"Triggered sync job {response.worker_id} for index {index_id}.")
+
+
+ # TODO(marko): This will be removed.
+ @app.command(help="Run a sync and wait for it to complete.")
+ def sync_local(
+ project: ProjectArg,
+ name: Annotated[str | None, Option(help="Index name.")] = None,
+ ):
+ """Run a sync and wait for it to complete."""
+ index_id = get_index_id(project, name)
+ index = state.spiral.key_space_index(index_id)
+ snapshot = state.spiral.table(index.table_id).snapshot()
+ state.spiral.internal.update_key_space_index(index.core, snapshot.core)
+ CONSOLE.print(f"Index {index.name} is up to date as-of {snapshot.asof}.")
spiral/client.py CHANGED
@@ -10,7 +10,7 @@ from spiral.core.client import Internal
  from spiral.core.client import Spiral as CoreSpiral
  from spiral.datetime_ import timestamp_micros
  from spiral.expressions import ExprLike
- from spiral.scan import Scan, ScanState
+ from spiral.scan import Scan
  from spiral.settings import Settings, settings

  if TYPE_CHECKING:
@@ -121,6 +121,7 @@ class Spiral:
  where = se.lift(where)

  return Scan(
+ self,
  self.core.scan(
  projection.__expr__,
  filter=where.__expr__ if where else None,
@@ -128,14 +129,6 @@
  ),
  )

- def load_scan(self, scan_state: ScanState) -> Scan:
- """Load a scan from a serialized scan state.
-
- Args:
- scan_state: The serialized scan state.
- """
- return Scan(self.core.load_scan(scan_state.core))
-
  # TODO(marko): This should be query, and search should be query + scan.
  def search(
  self,
@@ -70,6 +70,7 @@ class Scan:
  def scan_state(self) -> ScanState: ...
  def to_record_batches(
  self,
+ key_range: KeyRange | None = None,
  key_table: pa.Table | pa.RecordBatch | None = None,
  batch_readahead: int | None = None,
  ) -> pa.RecordBatchReader: ...
@@ -101,10 +102,18 @@ class Transaction:
  status: str

  def write(self, table: pa.RecordBatchReader, *, partition_size_bytes: int | None = None): ...
- def writeback(self, scan: Scan, *, partition_size_bytes: int | None = None, batch_readahead: int | None = None): ...
+ def writeback(
+ self,
+ scan: Scan,
+ *,
+ key_range: KeyRange | None = None,
+ partition_size_bytes: int | None = None,
+ batch_readahead: int | None = None,
+ ): ...
  def drop_columns(self, column_paths: list[str]): ...
  def take(self) -> list[Operation]: ...
  def include(self, ops: list[Operation]): ...
  def commit(self): ...
  def abort(self): ...
+ def is_empty(self) -> bool: ...
  def metrics(self) -> dict[str, Any]: ...
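
These stub changes (the `key_range` parameter on `writeback` and the new `is_empty`) are what the high-level `Transaction` wrapper in `spiral/transaction.py` further down exposes, and the pattern the new Dask enrichment tasks rely on. A hedged sketch; the table identifier is hypothetical and `sp.scan(table)` is assumed to be a valid scan over that table:

```python
from spiral import Spiral

sp = Spiral()
table = sp.table("my-project.my-dataset.docs")  # hypothetical identifier
scan = sp.scan(table)

# Restrict the writeback to a single shard's key range.
shard = scan.shards()[0]

tx = table.txn()
tx.writeback(scan, key_range=shard.key_range, batch_readahead=4)

# Commit only if the writeback actually produced operations.
if not tx.is_empty():
    tx.commit()
```
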
@@ -64,7 +64,9 @@ class ColumnGroupMetadata:

  class Operation:
  # Base class for all operations in the WAL.
- ...
+ def to_json(self) -> str: ...
+ @staticmethod
+ def from_json(json: str) -> Operation: ...

  class LogEntry:
  ts: int
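
`Operation` becoming JSON-serializable is what lets write-ahead-log operations cross process boundaries; the new `spiral/enrichment.py` below uses exactly this in `EnrichmentTaskResult.__getstate__`/`__setstate__`. A hedged round-trip sketch:

```python
from spiral.core.table.spec import Operation

def roundtrip(op: Operation) -> Operation:
    # Serialize a WAL operation to JSON and rebuild it, mirroring how
    # enrichment results are pickled across Dask workers.
    return Operation.from_json(op.to_json())
```
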
spiral/dataloader.py CHANGED
@@ -121,6 +121,7 @@ class SpiralDataLoader:
  # TODO(os): accept vortex arrays here instead of Arrow
  transform_fn: Callable[[pa.RecordBatch], Any] | None = None,
  map_workers: int = 0,
+ infinite: bool = False,
  ):
  """Initialize SpiralDataLoader.

@@ -145,6 +146,9 @@
  map_workers: Number of worker processes for parallel transform_fn
  application. 0 means single-process (no parallelism). Use this for
  CPU-bound transforms like tokenization or audio decoding.
+ infinite: Whether to cycle through the dataset infinitely. If True,
+ the dataloader will repeat the dataset indefinitely. If False,
+ the dataloader will stop after going through the dataset once.
  """
  self.scan = scan
  self.shards = shards if shards is not None else scan.shards()
@@ -157,6 +161,7 @@
  self.batch_readahead = batch_readahead
  self.transform_fn = transform_fn
  self.map_workers = map_workers
+ self.infinite = infinite

  self._samples_yielded = 0

@@ -176,7 +181,7 @@
  shuffle=shuffle,
  max_batch_size=self.batch_size,
  batch_readahead=self.batch_readahead,
- infinite=False,
+ infinite=self.infinite,
  )

  if self.skip_samples > 0:
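
The new `infinite` flag makes the loader cycle through the dataset instead of stopping after one pass. A hedged sketch; the table identifier is hypothetical, and passing `batch_size` to the constructor plus iterating the loader directly are assumed from the surrounding docstrings:

```python
from spiral import Spiral
from spiral.dataloader import SpiralDataLoader

sp = Spiral()
scan = sp.scan(sp.table("my-project.my-dataset.docs"))  # hypothetical identifier

# Repeat the dataset indefinitely, e.g. for step-based training loops.
loader = SpiralDataLoader(scan, batch_size=32, infinite=True)

for step, batch in zip(range(10_000), loader):
    ...  # train on batch
```
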
spiral/enrichment.py ADDED
@@ -0,0 +1,153 @@
+ import dataclasses
+ import logging
+ from functools import partial
+ from typing import TYPE_CHECKING, Optional
+
+ from spiral.core.client import Shard
+ from spiral.core.table.spec import Operation
+ from spiral.expressions import Expr
+
+ if TYPE_CHECKING:
+ from spiral import KeySpaceIndex, Table
+
+ logger = logging.getLogger(__name__)
+
+
+ class Enrichment:
+ """
+ An enrichment is used to derive new columns from the existing once, such as fetching data from object storage
+ with `se.s3.get` or compute embeddings. With column groups design supporting 100s of thousands of columns,
+ horizontally expanding tables are a powerful primitive.
+
+ NOTE: Spiral aims to optimize enrichments where source and destination table are the same.
+ """
+
+ def __init__(
+ self,
+ table: "Table",
+ projection: Expr,
+ where: Expr | None,
+ ):
+ self._table = table
+ self._projection = projection
+ self._where = where
+
+ @property
+ def table(self) -> "Table":
+ """The table to write back into."""
+ return self._table
+
+ @property
+ def projection(self) -> Expr:
+ """The projection expression."""
+ return self._projection
+
+ @property
+ def where(self) -> Expr | None:
+ """The filter expression."""
+ return self._where
+
+ def apply(self, *, batch_readahead: int | None = None, partition_size_bytes: int | None = None) -> None:
+ """Apply the enrichment onto the table in a streaming fashion.
+
+ For large tables, consider using `apply_dask` for distributed execution.
+ """
+ scan = self._table.spiral.scan(self._projection, where=self._where)
+
+ with self._table.txn() as txn:
+ txn.writeback(
+ scan,
+ partition_size_bytes=partition_size_bytes,
+ batch_readahead=batch_readahead,
+ )
+
+ # TODO(marko): Need to figure out this sharding with key space index in places.
+ # We could compute on-demand instead of requiring a resource.
+ def apply_dask(
+ self, *, index: Optional["KeySpaceIndex"] = None, partition_size_bytes: int | None = None, **kwargs
+ ) -> None:
+ """Use distributed Dask to apply the enrichment. Requires `dask[distributed]` to be installed.
+
+ If "address" of an existing Dask cluster is not provided in `kwargs`, a local cluster will be created.
+
+ IMPORTANT: Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
+ usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
+ encountering such issues, consider splitting the enrichment into UDF-only derivation that will be
+ executed in a streaming fashion, followed by a Dask enrichment for the rest of the computation.
+ If that is not possible, please reach out to the support for assistance.
+
+ Args:
+ index: Optional key space index to use for sharding the enrichment.
+ If not provided, the table's default sharding will be used.
+ **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`
+ such as `address` to connect to an existing cluster.
+ """
+ try:
+ from dask.distributed import Client
+ except ImportError:
+ raise ImportError("dask is not installed, please install dask[distributed] to use this feature.")
+
+ # Connect before doing any work.
+ dask_client = Client(**kwargs)
+
+ # Start a transaction BEFORE the planning scan.
+ tx = self._table.txn()
+ plan_scan = self._table.spiral.scan(self._projection, where=self._where)
+
+ # Determine the "tasks". Use the index if provided.
+ shards = plan_scan.shards()
+ if index is not None:
+ # TODO(marko): This will use index's asof automatically.
+ shards = self._table.spiral.internal.compute_shards(index.core)
+
+ # Partially bind the enrichment function.
+ _compute = partial(
+ _enrichment_task,
+ settings_dict=self._table.spiral.config.model_dump(),
+ state_json=plan_scan.core.scan_state().to_json(),
+ output_table_id=self._table.table_id,
+ partition_size_bytes=partition_size_bytes,
+ )
+ enrichments = dask_client.map(_compute, shards)
+
+ logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {dask_client.dashboard_link}")
+ for result in dask_client.gather(enrichments):
+ result: EnrichmentTaskResult
+ tx.include(result.ops)
+
+ if tx.is_empty():
+ logger.warning("Transaction not committed. No rows were read for enrichment.")
+ return
+
+ tx.commit()
+
+
+ @dataclasses.dataclass
+ class EnrichmentTaskResult:
+ ops: list[Operation]
+
+ def __getstate__(self):
+ return {"ops": [op.to_json() for op in self.ops]}
+
+ def __setstate__(self, state):
+ self.ops = [Operation.from_json(op_json) for op_json in state["ops"]]
+
+
+ # NOTE(marko): This function must be picklable!
+ def _enrichment_task(
+ shard: Shard, *, settings_dict, state_json, output_table_id, partition_size_bytes: int | None
+ ) -> EnrichmentTaskResult:
+ # Returns operations that can be included in a transaction.
+ from spiral import Scan, Spiral
+ from spiral.core.table import ScanState
+ from spiral.settings import Settings
+
+ settings: Settings = Settings.model_validate(settings_dict)
+ sp = Spiral(config=settings)
+ state = ScanState.from_json(state_json)
+ task_scan = Scan(sp, sp.core.load_scan(state))
+ table = sp.table(output_table_id)
+
+ task_tx = table.txn()
+ task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
+ return EnrichmentTaskResult(ops=task_tx.take())
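
Together with the new `Table.enrich` (see `spiral/table.py` further down), the intended flow looks roughly like this hedged sketch; the table identifier, the `url` column, and the dict-style projection are hypothetical:

```python
from spiral import Spiral
from spiral import expressions as se

sp = Spiral()
table = sp.table("my-project.my-dataset.docs")  # hypothetical identifier

# Derive a new column by fetching the bytes behind a URL column.
enrichment = table.enrich({"content": se.http.get(table["url"])})

enrichment.apply()  # streaming, in-process
# enrichment.apply_dask(address="tcp://scheduler:8786")  # or on an existing Dask cluster
```
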
@@ -8,7 +8,9 @@ import pyarrow as pa

  from spiral import _lib, arrow_

+ from . import http as http
  from . import list_ as list
+ from . import s3 as s3
  from . import str_ as str
  from . import struct as struct
  from . import text as text
@@ -47,6 +49,7 @@ __all__ = [
  "xor",
  "text",
  "s3",
+ "http",
  "UDF",
  ]

@@ -0,0 +1,16 @@
+ from spiral import _lib
+ from spiral.expressions.base import Expr, ExprLike
+
+
+ def get(expr: ExprLike) -> Expr:
+ """Read data from the URL.
+
+ Args:
+ expr: URLs of the data that needs to be read.
+ """
+ from spiral import expressions as se
+
+ expr = se.lift(expr)
+
+ # This just works :)
+ return Expr(_lib.expr.s3.get(expr.__expr__))
spiral/expressions/s3.py CHANGED
@@ -11,8 +11,5 @@ def get(expr: ExprLike) -> Expr:
  from spiral import expressions as se

  expr = se.lift(expr)
- return Expr(
- _lib.expr.s3.get(
- expr.__expr__,
- )
- )
+
+ return Expr(_lib.expr.s3.get(expr.__expr__))
spiral/expressions/udf.py CHANGED
@@ -46,7 +46,12 @@ class UDF(abc.ABC):

  @abc.abstractmethod
  def return_type(self, scope: pa.DataType) -> pa.DataType:
- """Must return the return type of the UDF given the input scope type."""
+ """Must return the return type of the UDF given the input scope type.
+
+ IMPORTANT: All expressions in Spiral must return nullable (Arrow default) types,
+ including nested structs, meaning that all fields in structs must also be nullable,
+ and if those fields are structs, their fields must also be nullable, and so on.
+ """
  ...

  @abc.abstractmethod
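
The expanded docstring is the substantive change: whatever `return_type` reports must be nullable all the way down. A hedged illustration using plain `pyarrow`, no Spiral APIs involved:

```python
import pyarrow as pa

# Conforming: every field, at every nesting level, is nullable (Arrow's default).
ok_type = pa.struct([
    pa.field("embedding", pa.list_(pa.float32()), nullable=True),
    pa.field("meta", pa.struct([pa.field("model", pa.string(), nullable=True)]), nullable=True),
])

# What the docstring warns against: a non-nullable field in the returned type.
bad_type = pa.struct([pa.field("embedding", pa.list_(pa.float32()), nullable=False)])
```
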
spiral/scan.py CHANGED
@@ -1,10 +1,11 @@
+ from functools import partial
  from typing import TYPE_CHECKING, Any, Optional

  import pyarrow as pa

  from spiral.core.client import Shard, ShuffleConfig
+ from spiral.core.table import KeyRange
  from spiral.core.table import Scan as CoreScan
- from spiral.core.table import ScanState as CoreScanState
  from spiral.core.table.spec import Schema
  from spiral.settings import CI, DEV

@@ -16,37 +17,17 @@ if TYPE_CHECKING:
  import streaming  # noqa
  import torch.utils.data as torchdata  # noqa

+ from spiral.client import Spiral
  from spiral.dataloader import SpiralDataLoader, World  # noqa


- class ScanState:
- """
- Evaluated properties of the scan
- """
-
- __slots__ = ("core",)
-
- def __init__(self, core: CoreScanState):
- self.core = core
-
- def __getstate__(self):
- return self.core.to_json()
-
- def __setstate__(self, state):
- self.core = CoreScanState.from_json(state)
-
-
  class Scan:
  """Scan object."""

- def __init__(self, core: CoreScan):
+ def __init__(self, spiral: "Spiral", core: CoreScan):
+ self.spiral = spiral
  self.core = core

- @property
- def scan_state(self) -> ScanState:
- """Returns evaluated properties of the scan."""
- return ScanState(self.core.scan_state())
-
  @property
  def metrics(self) -> dict[str, Any]:
  """Returns metrics about the scan."""
@@ -72,6 +53,8 @@

  def to_record_batches(
  self,
+ *,
+ key_range: KeyRange | None = None,
  key_table: pa.Table | pa.RecordBatchReader | None = None,
  batch_size: int | None = None,
  batch_readahead: int | None = None,
@@ -79,6 +62,9 @@
  """Read as a stream of RecordBatches.

  Args:
+ key_range: Optional key range to filter the scan.
+ If provided, the scan will only return rows within the key range.
+ Only one of key_range or key_table can be provided.
  key_table: a table of keys to "take" (including aux columns for cell-push-down).
  If None, the scan will be executed without a key table.
  batch_size: the maximum number of rows per returned batch.
@@ -86,6 +72,9 @@
  RecordBatchReader, the batch_size argument must be None, and the existing batching is respected.
  batch_readahead: the number of batches to prefetch in the background.
  """
+ if key_range is not None and key_table is not None:
+ raise ValueError("Only one of key_range or key_table can be provided.")
+
  if isinstance(key_table, pa.RecordBatchReader):
  if batch_size is not None:
  raise ValueError(
@@ -94,46 +83,54 @@
  elif isinstance(key_table, pa.Table):
  key_table = key_table.to_reader(max_chunksize=batch_size)

- return self.core.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
+ return self.core.to_record_batches(key_range=key_range, key_table=key_table, batch_readahead=batch_readahead)

  def to_table(
  self,
+ *,
+ key_range: KeyRange | None = None,
  key_table: pa.Table | pa.RecordBatchReader | None = None,
  ) -> pa.Table:
  """Read into a single PyArrow Table.

  Args:
+ key_range: Optional key range to filter the scan.
+ If provided, the scan will only return rows within the key range.
+ Only one of key_range or key_table can be provided.
  key_table: a table of keys to "take" (including aux columns for cell-push-down).
  If None, the scan will be executed without a key table.
  """
  # NOTE: Evaluates fully on Rust side which improved debuggability.
- if DEV and not CI and key_table is None:
+ if DEV and not CI and key_table is None and key_range is None:
  rb = self.core.to_record_batch()
  return pa.Table.from_batches([rb])

- return self.to_record_batches(key_table=key_table).read_all()
+ return self.to_record_batches(key_range=key_range, key_table=key_table).read_all()

  def to_dask(self) -> "dd.DataFrame":
  """Read into a Dask DataFrame.

  Requires the `dask` package to be installed.
+
+ IMPORTANT: Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
+ usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
+ encountering such issues, please reach out to the support for assistance.
  """
  import dask.dataframe as dd
- import pandas as pd

- def _read_shard(shard: Shard) -> pd.DataFrame:
- # TODO(ngates): we need a way to preserve the existing asofs?
- raise NotImplementedError()
-
- # Fetch a set of partition ranges
+ _read_shard = partial(
+ _read_shard_task,
+ settings_dict=self.spiral.config.model_dump(),
+ state_json=self.core.scan_state().to_json(),
+ )
  return dd.from_map(_read_shard, self.shards())

- def to_pandas(self) -> "pd.DataFrame":
+ def to_pandas(self, *, key_range: KeyRange | None = None) -> "pd.DataFrame":
  """Read into a Pandas DataFrame.

  Requires the `pandas` package to be installed.
  """
- return self.to_table().to_pandas()
+ return self.to_table(key_range=key_range).to_pandas()

  def to_polars(self) -> "pl.DataFrame":
  """Read into a Polars DataFrame.
@@ -188,16 +185,18 @@

  Returns:
  SpiralDataLoader with shards partitioned for this rank.
- """
- # Example usage:
- #
- # Auto-detect from PyTorch distributed:
- # loader: SpiralDataLoader = scan.to_distributed_data_loader(batch_size=32)
- #
- # Explicit world configuration:
- # world = World(rank=0, world_size=4)
- # loader: SpiralDataLoader = scan.to_distributed_data_loader(world=world, batch_size=32)

+ Auto-detect from PyTorch distributed:
+ ```python
+ loader: SpiralDataLoader = scan.to_distributed_data_loader(batch_size=32)
+ ```
+
+ Explicit world configuration:
+ ```python
+ world = World(rank=0, world_size=4)
+ loader: SpiralDataLoader = scan.to_distributed_data_loader(world=world, batch_size=32)
+ ```
+ """
  from spiral.dataloader import SpiralDataLoader, World

  if world is None:
@@ -231,19 +230,21 @@

  Returns:
  New SpiralDataLoader instance configured to resume from the checkpoint.
+
+ Save checkpoint during training:
+ ```python
+ loader = scan.to_distributed_data_loader(batch_size=32, seed=42)
+ checkpoint = loader.state_dict()
+ ```
+
+ Resume later - uses same shards from checkpoint:
+ ```python
+ resumed_loader = scan.resume_data_loader(
+ checkpoint,
+ batch_size=32,
+ transform_fn=my_transform,
+ )
  """
- # Example usage:
-
- # Save checkpoint during training:
- # loader = scan.to_distributed_data_loader(batch_size=32, seed=42)
- # checkpoint = loader.state_dict()
-
- # Resume later - uses same shards from checkpoint:
- # resumed_loader = scan.resume_data_loader(
- # checkpoint,
- # batch_size=32,
- # transform_fn=my_transform,
- # )
  from spiral.dataloader import SpiralDataLoader

  return SpiralDataLoader.from_state_dict(self, state, **kwargs)
@@ -311,3 +312,17 @@
  from spiral.debug.metrics import display_metrics

  display_metrics(self.metrics)
+
+
+ # NOTE(marko): This function must be picklable!
+ def _read_shard_task(shard: Shard, *, settings_dict, state_json) -> "pd.DataFrame":
+ from spiral import Spiral
+ from spiral.core.table import ScanState
+ from spiral.settings import Settings
+
+ settings: Settings = Settings.model_validate(settings_dict)
+ sp = Spiral(config=settings)
+ state = ScanState.from_json(state_json)
+ task_scan = Scan(sp, sp.core.load_scan(state))
+
+ return task_scan.to_pandas(key_range=shard.key_range)
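
The thread running through these scan changes: a scan can now be evaluated for a single key range, which is how `to_dask` and the Dask enrichment fan shards out to workers. A hedged local sketch (hypothetical table identifier; `Shard.key_range` used the same way as in `_read_shard_task` above):

```python
from spiral import Spiral

sp = Spiral()
scan = sp.scan(sp.table("my-project.my-dataset.docs"))  # hypothetical identifier

shards = scan.shards()

# Evaluate the scan for the first shard's key range only.
df = scan.to_pandas(key_range=shards[0].key_range)
```
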
spiral/settings.py CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
  from typing import TYPE_CHECKING, Annotated

  import typer
- from pydantic import Field, ValidatorFunctionWrapHandler, WrapValidator
+ from pydantic import Field, PlainSerializer, ValidatorFunctionWrapHandler, WrapValidator
  from pydantic_settings import (
  BaseSettings,
  InitSettingsSource,
@@ -28,13 +28,16 @@ PACKAGE_NAME = "pyspiral"


  def validate_token(v, handler: ValidatorFunctionWrapHandler):
- if isinstance(v, str):
- return Token(v)
- else:
- raise ValueError("Token value must be a string")
+ if not isinstance(v, str):
+ raise ValueError("Token value (SPIRAL__SPIRALDB__TOKEN) must be a string")
+ return Token(v)


- TokenType = Annotated[Token, WrapValidator(validate_token)]
+ TokenType = Annotated[
+ Token,
+ WrapValidator(validate_token),
+ PlainSerializer(lambda token: token.expose_secret(), return_type=str),
+ ]


  class SpiralDBSettings(BaseSettings):
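
The `PlainSerializer` is what makes the settings round-trippable: `model_dump()` now emits the token as its underlying string, so a worker can rebuild a client from a plain dict. A hedged sketch mirroring `_read_shard_task` and `_enrichment_task`:

```python
from spiral import Spiral
from spiral.settings import Settings

sp = Spiral()
settings_dict = sp.config.model_dump()              # token serialized via PlainSerializer
restored = Settings.model_validate(settings_dict)   # what a remote worker reconstructs
worker_client = Spiral(config=restored)
```
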
spiral/table.py CHANGED
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any

  from spiral.core.table import Table as CoreTable
  from spiral.core.table.spec import Schema
+ from spiral.enrichment import Enrichment
  from spiral.expressions.base import Expr, ExprLike
  from spiral.settings import settings
  from spiral.snapshot import Snapshot
@@ -12,13 +13,11 @@ if TYPE_CHECKING:
  import duckdb
  import polars as pl
  import pyarrow.dataset as ds
- import streaming
- import torch.utils.data as torchdata  # noqa

  from spiral.client import Spiral
  from spiral.dataloader import SpiralDataLoader
  from spiral.key_space_index import KeySpaceIndex
- from spiral.scan import Scan
+ from spiral.streaming_ import SpiralStream


  class Table(Expr):
@@ -51,6 +50,14 @@
  """Returns the fully qualified identifier of the table."""
  return self._identifier or self.table_id

+ @property
+ def project(self) -> str | None:
+ """Returns the project of the table."""
+ if self._identifier is None:
+ return None
+ project, _, _ = self._identifier.split(".")
+ return project
+
  @property
  def dataset(self) -> str | None:
  """Returns the dataset of the table."""
@@ -111,24 +118,29 @@
  partition_size_bytes=partition_size_bytes,
  )

- def writeback(
+ def enrich(
  self,
- scan: "Scan",
- *,
- partition_size_bytes: int | None = None,
- ) -> None:
- """Write back the results of a scan to the table.
+ *projections: ExprLike,
+ where: ExprLike | None = None,
+ ) -> Enrichment:
+ """Returns an Enrichment object that, when applied, produces new columns.

- :param scan: The scan to write back.
- The scan does NOT need to be over the same table as transaction,
- but it does need to have the same key schema.
- :param partition_size_bytes: The maximum partition size in bytes.
+ Enrichment can be applied in different ways, e.g. distributed.
+
+ :param projections: Projection expressions deriving new columns to write back.
+ Expressions can be over multiple Spiral tables, but all tables including
+ this one must share the same key schema.
+ :param where: Optional filter expression to apply when reading the input tables.
  """
- with self.txn() as txn:
- txn.writeback(
- scan,
- partition_size_bytes=partition_size_bytes,
- )
+ from spiral import expressions as se
+
+ # Combine table with all projections into a single struct.
+ # The table is included to ensure key columns are present in the scan output.
+ projection = se.merge(self, *projections)
+ if where is not None:
+ where = se.lift(where)
+
+ return Enrichment(self, projection, where)

  def drop_columns(self, column_paths: list[str]) -> None:
  """
@@ -275,7 +287,7 @@
  projection: Expr | None = None,
  cache_dir: str | None = None,
  shard_row_block_size: int | None = None,
- ) -> "streaming.Stream":
+ ) -> "SpiralStream":
  """Returns a stream to be used with MosaicML's StreamingDataset.

  Requires `streaming` package to be installed.
@@ -310,4 +322,4 @@
  shards=shards,
  cache_dir=cache_dir,
  shard_row_block_size=shard_row_block_size,
- )  # type: ignore[return-value]
+ )
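
`Table.writeback` is gone; the equivalent in 0.6.13 goes through a transaction (which is also what `Enrichment.apply` wraps). A hedged migration sketch; `table` and `scan` stand in for whatever you previously wrote back, and the identifier is hypothetical:

```python
from spiral import Spiral

sp = Spiral()
table = sp.table("my-project.my-dataset.docs")  # hypothetical identifier
scan = sp.scan(table)                           # placeholder for the scan being written back

# 0.6.12 (removed): table.writeback(scan, partition_size_bytes=64 << 20)
# 0.6.13 equivalent:
with table.txn() as txn:
    txn.writeback(scan, partition_size_bytes=64 << 20)
```
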
spiral/transaction.py CHANGED
@@ -1,3 +1,4 @@
+ from spiral.core.table import KeyRange
  from spiral.core.table import Transaction as CoreTransaction
  from spiral.core.table.spec import Operation
  from spiral.expressions.base import ExprLike
@@ -19,6 +20,10 @@ class Transaction:
  """The status of the transaction."""
  return self._core.status

+ def is_empty(self) -> bool:
+ """Check if the transaction has no operations."""
+ return self._core.is_empty()
+
  def __enter__(self):
  return self

@@ -41,16 +46,26 @@

  self._core.write(record_batches, partition_size_bytes=partition_size_bytes)

- def writeback(self, scan: Scan, *, partition_size_bytes: int | None = None):
+ def writeback(
+ self,
+ scan: Scan,
+ *,
+ key_range: KeyRange | None = None,
+ partition_size_bytes: int | None = None,
+ batch_readahead: int | None = None,
+ ):
  """Write back the results of a scan to the table.

  :param scan: The scan to write back.
  The scan does NOT need to be over the same table as transaction,
  but it does need to have the same key schema.
+ :param key_range: Optional key range to limit the writeback to.
  :param partition_size_bytes: The maximum partition size in bytes.
- If not provided, the default partition size is used.
+ :param batch_readahead: The number of batches to read ahead when evaluating the scan.
  """
- self._core.writeback(scan.core, partition_size_bytes=partition_size_bytes)
+ self._core.writeback(
+ scan.core, key_range=key_range, partition_size_bytes=partition_size_bytes, batch_readahead=batch_readahead
+ )

  def drop_columns(self, column_paths: list[str]):
  """