pyspiral 0.7.0__cp312-abi3-macosx_11_0_arm64.whl → 0.7.2__cp312-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyspiral might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyspiral
3
- Version: 0.7.0
3
+ Version: 0.7.2
4
4
  Classifier: Intended Audience :: Science/Research
5
5
  Classifier: Operating System :: OS Independent
6
6
  Classifier: Programming Language :: Python
@@ -1,8 +1,8 @@
1
- pyspiral-0.7.0.dist-info/METADATA,sha256=zP3jypGjswSz8nqMxMj8N6a4udG_LoZZLQAf8Xepg64,1874
2
- pyspiral-0.7.0.dist-info/WHEEL,sha256=KQvxBiy7GLcML6Ad3w_ZPrgSvER1uXd7aYb6wy6b44Y,103
3
- pyspiral-0.7.0.dist-info/entry_points.txt,sha256=R96Y3FpYX6XbQu9qMPfUTgiCcf4qM9OBQQZTDdBkZwA,74
1
+ pyspiral-0.7.2.dist-info/METADATA,sha256=f9Q9zUpltwDCVkaY3jVW_spoFHDt0PwAFYaRePg-joU,1874
2
+ pyspiral-0.7.2.dist-info/WHEEL,sha256=KQvxBiy7GLcML6Ad3w_ZPrgSvER1uXd7aYb6wy6b44Y,103
3
+ pyspiral-0.7.2.dist-info/entry_points.txt,sha256=R96Y3FpYX6XbQu9qMPfUTgiCcf4qM9OBQQZTDdBkZwA,74
4
4
  spiral/__init__.py,sha256=gAysTwG_oEeKVMdCOfOzDhl0bM2miiK8Ds2vvUihBWw,1153
5
- spiral/_lib.abi3.so,sha256=Kdun77wTqTZOUrdRMBewZfE-ti3AGBGW9uWzd16W83U,70381040
5
+ spiral/_lib.abi3.so,sha256=HD0QUi6SdaOO9F3MbS2jrcgXg8oE7pBrwPfhWOClOt4,70358864
6
6
  spiral/adbc.py,sha256=7IxfWIeQN-fh0W5OdN_PP2x3pzQYg6ZUOLsHg3jktqw,14842
7
7
  spiral/api/__init__.py,sha256=ULBlVq3PnfNOO6T5naE_ULmmii-83--qTuN2PpAUQN0,2241
8
8
  spiral/api/admin.py,sha256=A1iVR1XYJSObZivPAD5UzmPuMgupXc9kaHNYYa_kwfs,585
@@ -35,7 +35,7 @@ spiral/cli/telemetry.py,sha256=Uxo1Q1FkKJ6n6QNGOUmL3j_pRRWRx0qWIhoP-U9BuR0,589
35
35
  spiral/cli/text.py,sha256=DlWGe4JrkdERAiqyITNpk91Wqb63Re99rNYlIFsIamc,4031
36
36
  spiral/cli/types.py,sha256=XYzo1GgX7dBBItoBSrHI4vO5C2lLmS2sktb-2GnGH3E,1362
37
37
  spiral/cli/workloads.py,sha256=2_SLfQTFN6y73R9H0i9dk8VIOVagKxSxOpHXC56yptY,2015
38
- spiral/client.py,sha256=zMp-xXGL4R1Py_rYrC5o3jFLam1oA74azi50dvMP-_o,6329
38
+ spiral/client.py,sha256=53dVv8wxYMmozUfR8MVcUufKGqdVIdb0yZ0gchczBoQ,6426
39
39
  spiral/core/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
40
  spiral/core/_tools/__init__.pyi,sha256=b2KLfTOQ67pjfbYt07o0IGiTu5o2bZw69lllV8v0Dps,143
41
41
  spiral/core/authn/__init__.pyi,sha256=z_GWyIS62fuiYQrYO8hzw4W8oGaiciqS1u5qtAt54VY,769
@@ -60,13 +60,13 @@ spiral/debug/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
60
  spiral/debug/manifests.py,sha256=7f1O3ba9mrA5nXpOF9cEIQuUAteP5wiBkFy_diQJ7No,3216
61
61
  spiral/debug/metrics.py,sha256=XdRDcjggtsLNGCAjam6IxG9072pz_d2C8iLApNRFUtk,2044
62
62
  spiral/debug/scan.py,sha256=UEm_aRnql5pwDPTpZgakMLNjlzkKL4RurBFFqH_BLAQ,9526
63
- spiral/enrichment.py,sha256=aXkKgV24w_0XmhTel_SOshJ2xCgkmXsFGZvaE7yXQ8k,6075
63
+ spiral/enrichment.py,sha256=iKZn4tLsRQZPtaY-WdJSqPZ3H5UMfVyavcwTKO_3aCw,6980
64
64
  spiral/expressions/__init__.py,sha256=vMNFeeozkWph3dBpEkHPThUhZdT9ZZzxHe71HnkWlDU,8020
65
65
  spiral/expressions/base.py,sha256=PvhJkcUSsPSIaxirHVzM9zlqyBXiaiia1HXohXdOmL4,5377
66
- spiral/expressions/file.py,sha256=HRzGjc3goIlUlKjysoirexDaflNdnj9OoZ6j2uTKZnA,388
67
- spiral/expressions/http.py,sha256=WfHVLqz_LjBr78mN3ARBRQqgBrkao7-S73JxjC4Xwvo,356
66
+ spiral/expressions/file.py,sha256=7D9jIENJcoT0KFharBLkzK9dZgO4DYn5K_KCt0twefg,518
67
+ spiral/expressions/http.py,sha256=OOHh0WBxg3vwza_m74-rkoQWSclRMI60aPAbQ6yKZi0,486
68
68
  spiral/expressions/list_.py,sha256=MMt5lf5H1M3O-x6N_PvqOLGq9NOk6Ukv0fPWwPC_uy4,1809
69
- spiral/expressions/s3.py,sha256=d6Z_nnBRXopoPWnwGxlgjwyeYlrIMmonX-83PkE8LPo,375
69
+ spiral/expressions/s3.py,sha256=PhQhMP-d8PLsSRtGCZbytnm7lI9VbDAbuSs2LBM4G7Q,505
70
70
  spiral/expressions/str_.py,sha256=tY8RXW3JWvr1-bEfCZtk5FAf11wKJnXPuA9EoeJ9tA4,1265
71
71
  spiral/expressions/struct.py,sha256=pGAnCDh6AK0BK1XfZ1qG4ce4ranIQEE1HQsgmzBcfwQ,2038
72
72
  spiral/expressions/text.py,sha256=-02gBWYoyNQ3qQ1--9HTa8IryUDojYQVIp8C7rgnOWQ,1893
@@ -102,8 +102,8 @@ spiral/streaming_/__init__.py,sha256=s7MlW2ERsuZmZGExLFL6RcZon2e0tNBocBg5ANgki7k
102
102
  spiral/streaming_/reader.py,sha256=tl_lC9xgh1-QFhsZn4xQT7It3PVTzHCEUT2BG2dWBRQ,4166
103
103
  spiral/streaming_/stream.py,sha256=DM1hBDHnWm1ZFKZ-hZ4zxeSXITcUI6kWzwdJZvywI8o,5915
104
104
  spiral/substrait_.py,sha256=AKeOD4KIXvz2J4TYxnIneOiHddtBIyOhuNxVO_uH0eg,12592
105
- spiral/table.py,sha256=prjDBcm6Qerdq3ypXzfbXb7ngAcO0j-Z9aTeZvzKoqs,12209
105
+ spiral/table.py,sha256=ep8ZYtl6POebkPViR2FrekhFazNmAbOAESoLUODlup8,12242
106
106
  spiral/text_index.py,sha256=FQ9rgIEGLSJryS9lFdMhKtPFey18BXoWbPXyvZPJJ04,442
107
107
  spiral/transaction.py,sha256=KQhx3DvQyxG2C8md-YGsF_PgBRfayI0r_7ebMItDHdI,3938
108
108
  spiral/types_.py,sha256=W_jyO7F6rpPiH69jhgSgV7OxQZbOlb1Ho3InpKUP6Eo,155
109
- pyspiral-0.7.0.dist-info/RECORD,,
109
+ pyspiral-0.7.2.dist-info/RECORD,,
spiral/_lib.abi3.so CHANGED
Binary file
spiral/client.py CHANGED
@@ -116,6 +116,8 @@ class Spiral:
116
116
  asof = timestamp_micros(asof)
117
117
 
118
118
  # Combine all projections into a single struct.
119
+ if not projections:
120
+ raise ValueError("At least one projection is required.")
119
121
  projection = se.merge(*projections)
120
122
  if where is not None:
121
123
  where = se.lift(where)
spiral/enrichment.py CHANGED
@@ -4,10 +4,13 @@ from functools import partial
4
4
  from typing import TYPE_CHECKING, Optional
5
5
 
6
6
  from spiral.core.client import Shard
7
+ from spiral.core.table import Scan
7
8
  from spiral.core.table.spec import Operation
8
9
  from spiral.expressions import Expr
9
10
 
10
11
  if TYPE_CHECKING:
12
+ import dask.distributed
13
+
11
14
  from spiral import KeySpaceIndex, Table
12
15
 
13
16
  logger = logging.getLogger(__name__)
@@ -47,19 +50,37 @@ class Enrichment:
47
50
  """The filter expression."""
48
51
  return self._where
49
52
 
50
- def apply(self, *, batch_readahead: int | None = None, partition_size_bytes: int | None = None) -> None:
53
+ def _scan(self) -> Scan:
54
+ return self._table.spiral.scan(self._projection, where=self._where)
55
+
56
+ def apply(
57
+ self, *, batch_readahead: int | None = None, partition_size_bytes: int | None = None, tx_dump: str | None = None
58
+ ) -> None:
51
59
  """Apply the enrichment onto the table in a streaming fashion.
52
60
 
53
61
  For large tables, consider using `apply_dask` for distributed execution.
62
+
63
+ Args:
64
+ index: Optional key space index to use for sharding the enrichment.
65
+ If not provided, the table's default sharding will be used.
66
+ partition_size_bytes: The maximum partition size in bytes.
67
+ If not provided, the default partition size is used.
68
+ tx_dump: Optional path to dump the transaction JSON for debugging.
54
69
  """
55
- scan = self._table.spiral.scan(self._projection, where=self._where)
56
70
 
57
- with self._table.txn() as txn:
58
- txn.writeback(
59
- scan,
60
- partition_size_bytes=partition_size_bytes,
61
- batch_readahead=batch_readahead,
62
- )
71
+ txn = self._table.txn()
72
+
73
+ txn.writeback(
74
+ self._scan(),
75
+ partition_size_bytes=partition_size_bytes,
76
+ batch_readahead=batch_readahead,
77
+ )
78
+
79
+ if txn.is_empty():
80
+ logger.warning("Transaction not committed. No rows were read for enrichment.")
81
+ return
82
+
83
+ txn.commit(tx_dump=tx_dump)
63
84
 
64
85
  # TODO(marko): Need to figure out this sharding with key space index in places.
65
86
  # We could compute on-demand instead of requiring a resource.
@@ -69,6 +90,7 @@ class Enrichment:
69
90
  index: Optional["KeySpaceIndex"] = None,
70
91
  partition_size_bytes: int | None = None,
71
92
  tx_dump: str | None = None,
93
+ client: Optional["dask.distributed.Client"] = None,
72
94
  **kwargs,
73
95
  ) -> None:
74
96
  """Use distributed Dask to apply the enrichment. Requires `dask[distributed]` to be installed.
@@ -86,20 +108,23 @@ class Enrichment:
86
108
  If not provided, the table's default sharding will be used.
87
109
  partition_size_bytes: The maximum partition size in bytes.
88
110
  If not provided, the default partition size is used.
111
+ tx_dump: Optional path to dump the transaction JSON for debugging.
112
+ client: Optional Dask distributed client. If not provided, a new client will be created
89
113
  **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`
90
114
  such as `address` to connect to an existing cluster.
91
115
  """
92
- try:
93
- from dask.distributed import Client
94
- except ImportError:
95
- raise ImportError("dask is not installed, please install dask[distributed] to use this feature.")
116
+ if client is None:
117
+ try:
118
+ from dask.distributed import Client
119
+ except ImportError:
120
+ raise ImportError("dask is not installed, please install dask[distributed] to use this feature.")
96
121
 
97
- # Connect before doing any work.
98
- dask_client = Client(**kwargs)
122
+ # Connect before doing any work.
123
+ client = Client(**kwargs)
99
124
 
100
125
  # Start a transaction BEFORE the planning scan.
101
126
  tx = self._table.txn()
102
- plan_scan = self._table.spiral.scan(self._projection, where=self._where)
127
+ plan_scan = self._scan()
103
128
 
104
129
  # Determine the "tasks". Use the index if provided.
105
130
  shards = plan_scan.shards()
@@ -115,10 +140,10 @@ class Enrichment:
115
140
  output_table_id=self._table.table_id,
116
141
  partition_size_bytes=partition_size_bytes,
117
142
  )
118
- enrichments = dask_client.map(_compute, shards)
143
+ enrichments = client.map(_compute, shards)
119
144
 
120
- logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {dask_client.dashboard_link}")
121
- for result in dask_client.gather(enrichments):
145
+ logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {client.dashboard_link}")
146
+ for result in client.gather(enrichments):
122
147
  result: EnrichmentTaskResult
123
148
  tx.include(result.ops)
124
149
 
spiral/expressions/file.py CHANGED
@@ -2,15 +2,16 @@ from spiral import _lib
2
2
  from spiral.expressions.base import Expr, ExprLike
3
3
 
4
4
 
5
- def get(expr: ExprLike) -> Expr:
5
+ def get(expr: ExprLike, abort_on_error: bool = False) -> Expr:
6
6
  """Read data from the local filesystem by the file:// URL.
7
7
 
8
8
  Args:
9
9
  expr: URLs of the data that needs to be read.
10
+ abort_on_error: Should the expression abort on errors or just collect them.
10
11
  """
11
12
  from spiral import expressions as se
12
13
 
13
14
  expr = se.lift(expr)
14
15
 
15
16
  # This just works :)
16
- return Expr(_lib.expr.s3.get(expr.__expr__))
17
+ return Expr(_lib.expr.s3.get(expr.__expr__, abort_on_error))
spiral/expressions/http.py CHANGED
@@ -2,15 +2,16 @@ from spiral import _lib
2
2
  from spiral.expressions.base import Expr, ExprLike
3
3
 
4
4
 
5
- def get(expr: ExprLike) -> Expr:
5
+ def get(expr: ExprLike, abort_on_error: bool = False) -> Expr:
6
6
  """Read data from the URL.
7
7
 
8
8
  Args:
9
9
  expr: URLs of the data that needs to be read.
10
+ abort_on_error: Should the expression abort on errors or just collect them.
10
11
  """
11
12
  from spiral import expressions as se
12
13
 
13
14
  expr = se.lift(expr)
14
15
 
15
16
  # This just works :)
16
- return Expr(_lib.expr.s3.get(expr.__expr__))
17
+ return Expr(_lib.expr.s3.get(expr.__expr__, abort_on_error))
spiral/expressions/s3.py CHANGED
@@ -2,14 +2,15 @@ from spiral import _lib
2
2
  from spiral.expressions.base import Expr, ExprLike
3
3
 
4
4
 
5
- def get(expr: ExprLike) -> Expr:
5
+ def get(expr: ExprLike, abort_on_error: bool = False) -> Expr:
6
6
  """Read data from object storage by the s3:// URL.
7
7
 
8
8
  Args:
9
9
  expr: URLs of the data that needs to be read from object storage.
10
+ abort_on_error: Should the expression abort on errors or just collect them.
10
11
  """
11
12
  from spiral import expressions as se
12
13
 
13
14
  expr = se.lift(expr)
14
15
 
15
- return Expr(_lib.expr.s3.get(expr.__expr__))
16
+ return Expr(_lib.expr.s3.get(expr.__expr__, abort_on_error))
spiral/table.py CHANGED
@@ -134,9 +134,9 @@ class Table(Expr):
134
134
  """
135
135
  from spiral import expressions as se
136
136
 
137
- # Combine table with all projections into a single struct.
138
- # The table is included to ensure key columns are present in the scan output.
139
- projection = se.merge(self, *projections)
137
+ # TODO(marko): This shouldn't need to happen. We should be able to read keys from writeback scan.
138
+ # Include key columns in the projection.
139
+ projection = se.merge(self.select(*self.key_schema.names), *projections)
140
140
  if where is not None:
141
141
  where = se.lift(where)
142
142