pyspiral 0.7.1__cp312-abi3-manylinux_2_28_x86_64.whl → 0.7.2__cp312-abi3-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyspiral might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyspiral
3
- Version: 0.7.1
3
+ Version: 0.7.2
4
4
  Classifier: Intended Audience :: Science/Research
5
5
  Classifier: Operating System :: OS Independent
6
6
  Classifier: Programming Language :: Python
@@ -1,8 +1,8 @@
1
- pyspiral-0.7.1.dist-info/METADATA,sha256=WQ0LCz8jtz9qHpU6Zim1I_134EwD3Ve8V4bW0nU_t-0,1874
2
- pyspiral-0.7.1.dist-info/WHEEL,sha256=ydlpo1_yEJ2g1Axq3LoOd_OfioJa2swc2j5IDCa4uho,107
3
- pyspiral-0.7.1.dist-info/entry_points.txt,sha256=R96Y3FpYX6XbQu9qMPfUTgiCcf4qM9OBQQZTDdBkZwA,74
1
+ pyspiral-0.7.2.dist-info/METADATA,sha256=f9Q9zUpltwDCVkaY3jVW_spoFHDt0PwAFYaRePg-joU,1874
2
+ pyspiral-0.7.2.dist-info/WHEEL,sha256=ydlpo1_yEJ2g1Axq3LoOd_OfioJa2swc2j5IDCa4uho,107
3
+ pyspiral-0.7.2.dist-info/entry_points.txt,sha256=R96Y3FpYX6XbQu9qMPfUTgiCcf4qM9OBQQZTDdBkZwA,74
4
4
  spiral/__init__.py,sha256=gAysTwG_oEeKVMdCOfOzDhl0bM2miiK8Ds2vvUihBWw,1153
5
- spiral/_lib.abi3.so,sha256=WqIyQu2RIN27T4K326ZBRMIVXysE6p-GW2ENnNueOm0,67478232
5
+ spiral/_lib.abi3.so,sha256=Zgzetu00z_NPMToA5146UIUniIBSmokc1ER53GRxCcA,67496792
6
6
  spiral/adbc.py,sha256=7IxfWIeQN-fh0W5OdN_PP2x3pzQYg6ZUOLsHg3jktqw,14842
7
7
  spiral/api/__init__.py,sha256=ULBlVq3PnfNOO6T5naE_ULmmii-83--qTuN2PpAUQN0,2241
8
8
  spiral/api/admin.py,sha256=A1iVR1XYJSObZivPAD5UzmPuMgupXc9kaHNYYa_kwfs,585
@@ -60,13 +60,13 @@ spiral/debug/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
60
  spiral/debug/manifests.py,sha256=7f1O3ba9mrA5nXpOF9cEIQuUAteP5wiBkFy_diQJ7No,3216
61
61
  spiral/debug/metrics.py,sha256=XdRDcjggtsLNGCAjam6IxG9072pz_d2C8iLApNRFUtk,2044
62
62
  spiral/debug/scan.py,sha256=UEm_aRnql5pwDPTpZgakMLNjlzkKL4RurBFFqH_BLAQ,9526
63
- spiral/enrichment.py,sha256=4j5W68YEJOABw5CXrt0A5gMILdR8KaXPjeMLlAQ4Gi4,6072
63
+ spiral/enrichment.py,sha256=iKZn4tLsRQZPtaY-WdJSqPZ3H5UMfVyavcwTKO_3aCw,6980
64
64
  spiral/expressions/__init__.py,sha256=vMNFeeozkWph3dBpEkHPThUhZdT9ZZzxHe71HnkWlDU,8020
65
65
  spiral/expressions/base.py,sha256=PvhJkcUSsPSIaxirHVzM9zlqyBXiaiia1HXohXdOmL4,5377
66
- spiral/expressions/file.py,sha256=HRzGjc3goIlUlKjysoirexDaflNdnj9OoZ6j2uTKZnA,388
67
- spiral/expressions/http.py,sha256=WfHVLqz_LjBr78mN3ARBRQqgBrkao7-S73JxjC4Xwvo,356
66
+ spiral/expressions/file.py,sha256=7D9jIENJcoT0KFharBLkzK9dZgO4DYn5K_KCt0twefg,518
67
+ spiral/expressions/http.py,sha256=OOHh0WBxg3vwza_m74-rkoQWSclRMI60aPAbQ6yKZi0,486
68
68
  spiral/expressions/list_.py,sha256=MMt5lf5H1M3O-x6N_PvqOLGq9NOk6Ukv0fPWwPC_uy4,1809
69
- spiral/expressions/s3.py,sha256=d6Z_nnBRXopoPWnwGxlgjwyeYlrIMmonX-83PkE8LPo,375
69
+ spiral/expressions/s3.py,sha256=PhQhMP-d8PLsSRtGCZbytnm7lI9VbDAbuSs2LBM4G7Q,505
70
70
  spiral/expressions/str_.py,sha256=tY8RXW3JWvr1-bEfCZtk5FAf11wKJnXPuA9EoeJ9tA4,1265
71
71
  spiral/expressions/struct.py,sha256=pGAnCDh6AK0BK1XfZ1qG4ce4ranIQEE1HQsgmzBcfwQ,2038
72
72
  spiral/expressions/text.py,sha256=-02gBWYoyNQ3qQ1--9HTa8IryUDojYQVIp8C7rgnOWQ,1893
@@ -106,4 +106,4 @@ spiral/table.py,sha256=ep8ZYtl6POebkPViR2FrekhFazNmAbOAESoLUODlup8,12242
106
106
  spiral/text_index.py,sha256=FQ9rgIEGLSJryS9lFdMhKtPFey18BXoWbPXyvZPJJ04,442
107
107
  spiral/transaction.py,sha256=KQhx3DvQyxG2C8md-YGsF_PgBRfayI0r_7ebMItDHdI,3938
108
108
  spiral/types_.py,sha256=W_jyO7F6rpPiH69jhgSgV7OxQZbOlb1Ho3InpKUP6Eo,155
109
- pyspiral-0.7.1.dist-info/RECORD,,
109
+ pyspiral-0.7.2.dist-info/RECORD,,
spiral/_lib.abi3.so CHANGED
Binary file
spiral/enrichment.py CHANGED
@@ -4,11 +4,14 @@ from functools import partial
4
4
  from typing import TYPE_CHECKING, Optional
5
5
 
6
6
  from spiral.core.client import Shard
7
+ from spiral.core.table import Scan
7
8
  from spiral.core.table.spec import Operation
8
9
  from spiral.expressions import Expr
9
10
 
10
11
  if TYPE_CHECKING:
11
- from spiral import KeySpaceIndex, Scan, Table
12
+ import dask.distributed
13
+
14
+ from spiral import KeySpaceIndex, Table
12
15
 
13
16
  logger = logging.getLogger(__name__)
14
17
 
@@ -47,20 +50,37 @@ class Enrichment:
47
50
  """The filter expression."""
48
51
  return self._where
49
52
 
50
- def _scan(self) -> "Scan":
53
+ def _scan(self) -> Scan:
51
54
  return self._table.spiral.scan(self._projection, where=self._where)
52
55
 
53
- def apply(self, *, batch_readahead: int | None = None, partition_size_bytes: int | None = None) -> None:
56
+ def apply(
57
+ self, *, batch_readahead: int | None = None, partition_size_bytes: int | None = None, tx_dump: str | None = None
58
+ ) -> None:
54
59
  """Apply the enrichment onto the table in a streaming fashion.
55
60
 
56
61
  For large tables, consider using `apply_dask` for distributed execution.
62
+
63
+ Args:
64
+ index: Optional key space index to use for sharding the enrichment.
65
+ If not provided, the table's default sharding will be used.
66
+ partition_size_bytes: The maximum partition size in bytes.
67
+ If not provided, the default partition size is used.
68
+ tx_dump: Optional path to dump the transaction JSON for debugging.
57
69
  """
58
- with self._table.txn() as txn:
59
- txn.writeback(
60
- self._scan(),
61
- partition_size_bytes=partition_size_bytes,
62
- batch_readahead=batch_readahead,
63
- )
70
+
71
+ txn = self._table.txn()
72
+
73
+ txn.writeback(
74
+ self._scan(),
75
+ partition_size_bytes=partition_size_bytes,
76
+ batch_readahead=batch_readahead,
77
+ )
78
+
79
+ if txn.is_empty():
80
+ logger.warning("Transaction not committed. No rows were read for enrichment.")
81
+ return
82
+
83
+ txn.commit(tx_dump=tx_dump)
64
84
 
65
85
  # TODO(marko): Need to figure out this sharding with key space index in places.
66
86
  # We could compute on-demand instead of requiring a resource.
@@ -70,6 +90,7 @@ class Enrichment:
70
90
  index: Optional["KeySpaceIndex"] = None,
71
91
  partition_size_bytes: int | None = None,
72
92
  tx_dump: str | None = None,
93
+ client: Optional["dask.distributed.Client"] = None,
73
94
  **kwargs,
74
95
  ) -> None:
75
96
  """Use distributed Dask to apply the enrichment. Requires `dask[distributed]` to be installed.
@@ -87,16 +108,19 @@ class Enrichment:
87
108
  If not provided, the table's default sharding will be used.
88
109
  partition_size_bytes: The maximum partition size in bytes.
89
110
  If not provided, the default partition size is used.
111
+ tx_dump: Optional path to dump the transaction JSON for debugging.
112
+ client: Optional Dask distributed client. If not provided, a new client will be created
90
113
  **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`
91
114
  such as `address` to connect to an existing cluster.
92
115
  """
93
- try:
94
- from dask.distributed import Client
95
- except ImportError:
96
- raise ImportError("dask is not installed, please install dask[distributed] to use this feature.")
116
+ if client is None:
117
+ try:
118
+ from dask.distributed import Client
119
+ except ImportError:
120
+ raise ImportError("dask is not installed, please install dask[distributed] to use this feature.")
97
121
 
98
- # Connect before doing any work.
99
- dask_client = Client(**kwargs)
122
+ # Connect before doing any work.
123
+ client = Client(**kwargs)
100
124
 
101
125
  # Start a transaction BEFORE the planning scan.
102
126
  tx = self._table.txn()
@@ -116,10 +140,10 @@ class Enrichment:
116
140
  output_table_id=self._table.table_id,
117
141
  partition_size_bytes=partition_size_bytes,
118
142
  )
119
- enrichments = dask_client.map(_compute, shards)
143
+ enrichments = client.map(_compute, shards)
120
144
 
121
- logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {dask_client.dashboard_link}")
122
- for result in dask_client.gather(enrichments):
145
+ logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {client.dashboard_link}")
146
+ for result in client.gather(enrichments):
123
147
  result: EnrichmentTaskResult
124
148
  tx.include(result.ops)
125
149
 
spiral/expressions/file.py CHANGED
@@ -2,15 +2,16 @@ from spiral import _lib
2
2
  from spiral.expressions.base import Expr, ExprLike
3
3
 
4
4
 
5
- def get(expr: ExprLike) -> Expr:
5
+ def get(expr: ExprLike, abort_on_error: bool = False) -> Expr:
6
6
  """Read data from the local filesystem by the file:// URL.
7
7
 
8
8
  Args:
9
9
  expr: URLs of the data that needs to be read.
10
+ abort_on_error: Should the expression abort on errors or just collect them.
10
11
  """
11
12
  from spiral import expressions as se
12
13
 
13
14
  expr = se.lift(expr)
14
15
 
15
16
  # This just works :)
16
- return Expr(_lib.expr.s3.get(expr.__expr__))
17
+ return Expr(_lib.expr.s3.get(expr.__expr__, abort_on_error))
spiral/expressions/http.py CHANGED
@@ -2,15 +2,16 @@ from spiral import _lib
2
2
  from spiral.expressions.base import Expr, ExprLike
3
3
 
4
4
 
5
- def get(expr: ExprLike) -> Expr:
5
+ def get(expr: ExprLike, abort_on_error: bool = False) -> Expr:
6
6
  """Read data from the URL.
7
7
 
8
8
  Args:
9
9
  expr: URLs of the data that needs to be read.
10
+ abort_on_error: Should the expression abort on errors or just collect them.
10
11
  """
11
12
  from spiral import expressions as se
12
13
 
13
14
  expr = se.lift(expr)
14
15
 
15
16
  # This just works :)
16
- return Expr(_lib.expr.s3.get(expr.__expr__))
17
+ return Expr(_lib.expr.s3.get(expr.__expr__, abort_on_error))
spiral/expressions/s3.py CHANGED
@@ -2,14 +2,15 @@ from spiral import _lib
2
2
  from spiral.expressions.base import Expr, ExprLike
3
3
 
4
4
 
5
- def get(expr: ExprLike) -> Expr:
5
+ def get(expr: ExprLike, abort_on_error: bool = False) -> Expr:
6
6
  """Read data from object storage by the s3:// URL.
7
7
 
8
8
  Args:
9
9
  expr: URLs of the data that needs to be read from object storage.
10
+ abort_on_error: Should the expression abort on errors or just collect them.
10
11
  """
11
12
  from spiral import expressions as se
12
13
 
13
14
  expr = se.lift(expr)
14
15
 
15
- return Expr(_lib.expr.s3.get(expr.__expr__))
16
+ return Expr(_lib.expr.s3.get(expr.__expr__, abort_on_error))