pyspiral 0.7.1__cp312-abi3-manylinux_2_28_aarch64.whl → 0.7.2__cp312-abi3-manylinux_2_28_aarch64.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyspiral might be problematic. Click here for more details.
- {pyspiral-0.7.1.dist-info → pyspiral-0.7.2.dist-info}/METADATA +1 -1
- {pyspiral-0.7.1.dist-info → pyspiral-0.7.2.dist-info}/RECORD +9 -9
- spiral/_lib.abi3.so +0 -0
- spiral/enrichment.py +42 -18
- spiral/expressions/file.py +3 -2
- spiral/expressions/http.py +3 -2
- spiral/expressions/s3.py +3 -2
- {pyspiral-0.7.1.dist-info → pyspiral-0.7.2.dist-info}/WHEEL +0 -0
- {pyspiral-0.7.1.dist-info → pyspiral-0.7.2.dist-info}/entry_points.txt +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
pyspiral-0.7.
|
|
2
|
-
pyspiral-0.7.
|
|
3
|
-
pyspiral-0.7.
|
|
1
|
+
pyspiral-0.7.2.dist-info/METADATA,sha256=f9Q9zUpltwDCVkaY3jVW_spoFHDt0PwAFYaRePg-joU,1874
|
|
2
|
+
pyspiral-0.7.2.dist-info/WHEEL,sha256=I5JYpyYzeAl2SOerY_wvkm-HJti0rDQc6zMeJs35MpM,108
|
|
3
|
+
pyspiral-0.7.2.dist-info/entry_points.txt,sha256=R96Y3FpYX6XbQu9qMPfUTgiCcf4qM9OBQQZTDdBkZwA,74
|
|
4
4
|
spiral/__init__.py,sha256=gAysTwG_oEeKVMdCOfOzDhl0bM2miiK8Ds2vvUihBWw,1153
|
|
5
|
-
spiral/_lib.abi3.so,sha256=
|
|
5
|
+
spiral/_lib.abi3.so,sha256=tHS470YRan_BmVwysmIRBdGo31icPbcNpj41dgsZDUs,61326408
|
|
6
6
|
spiral/adbc.py,sha256=7IxfWIeQN-fh0W5OdN_PP2x3pzQYg6ZUOLsHg3jktqw,14842
|
|
7
7
|
spiral/api/__init__.py,sha256=ULBlVq3PnfNOO6T5naE_ULmmii-83--qTuN2PpAUQN0,2241
|
|
8
8
|
spiral/api/admin.py,sha256=A1iVR1XYJSObZivPAD5UzmPuMgupXc9kaHNYYa_kwfs,585
|
|
@@ -60,13 +60,13 @@ spiral/debug/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
60
60
|
spiral/debug/manifests.py,sha256=7f1O3ba9mrA5nXpOF9cEIQuUAteP5wiBkFy_diQJ7No,3216
|
|
61
61
|
spiral/debug/metrics.py,sha256=XdRDcjggtsLNGCAjam6IxG9072pz_d2C8iLApNRFUtk,2044
|
|
62
62
|
spiral/debug/scan.py,sha256=UEm_aRnql5pwDPTpZgakMLNjlzkKL4RurBFFqH_BLAQ,9526
|
|
63
|
-
spiral/enrichment.py,sha256=
|
|
63
|
+
spiral/enrichment.py,sha256=iKZn4tLsRQZPtaY-WdJSqPZ3H5UMfVyavcwTKO_3aCw,6980
|
|
64
64
|
spiral/expressions/__init__.py,sha256=vMNFeeozkWph3dBpEkHPThUhZdT9ZZzxHe71HnkWlDU,8020
|
|
65
65
|
spiral/expressions/base.py,sha256=PvhJkcUSsPSIaxirHVzM9zlqyBXiaiia1HXohXdOmL4,5377
|
|
66
|
-
spiral/expressions/file.py,sha256=
|
|
67
|
-
spiral/expressions/http.py,sha256=
|
|
66
|
+
spiral/expressions/file.py,sha256=7D9jIENJcoT0KFharBLkzK9dZgO4DYn5K_KCt0twefg,518
|
|
67
|
+
spiral/expressions/http.py,sha256=OOHh0WBxg3vwza_m74-rkoQWSclRMI60aPAbQ6yKZi0,486
|
|
68
68
|
spiral/expressions/list_.py,sha256=MMt5lf5H1M3O-x6N_PvqOLGq9NOk6Ukv0fPWwPC_uy4,1809
|
|
69
|
-
spiral/expressions/s3.py,sha256=
|
|
69
|
+
spiral/expressions/s3.py,sha256=PhQhMP-d8PLsSRtGCZbytnm7lI9VbDAbuSs2LBM4G7Q,505
|
|
70
70
|
spiral/expressions/str_.py,sha256=tY8RXW3JWvr1-bEfCZtk5FAf11wKJnXPuA9EoeJ9tA4,1265
|
|
71
71
|
spiral/expressions/struct.py,sha256=pGAnCDh6AK0BK1XfZ1qG4ce4ranIQEE1HQsgmzBcfwQ,2038
|
|
72
72
|
spiral/expressions/text.py,sha256=-02gBWYoyNQ3qQ1--9HTa8IryUDojYQVIp8C7rgnOWQ,1893
|
|
@@ -106,4 +106,4 @@ spiral/table.py,sha256=ep8ZYtl6POebkPViR2FrekhFazNmAbOAESoLUODlup8,12242
|
|
|
106
106
|
spiral/text_index.py,sha256=FQ9rgIEGLSJryS9lFdMhKtPFey18BXoWbPXyvZPJJ04,442
|
|
107
107
|
spiral/transaction.py,sha256=KQhx3DvQyxG2C8md-YGsF_PgBRfayI0r_7ebMItDHdI,3938
|
|
108
108
|
spiral/types_.py,sha256=W_jyO7F6rpPiH69jhgSgV7OxQZbOlb1Ho3InpKUP6Eo,155
|
|
109
|
-
pyspiral-0.7.
|
|
109
|
+
pyspiral-0.7.2.dist-info/RECORD,,
|
spiral/_lib.abi3.so
CHANGED
|
Binary file
|
spiral/enrichment.py
CHANGED
|
@@ -4,11 +4,14 @@ from functools import partial
|
|
|
4
4
|
from typing import TYPE_CHECKING, Optional
|
|
5
5
|
|
|
6
6
|
from spiral.core.client import Shard
|
|
7
|
+
from spiral.core.table import Scan
|
|
7
8
|
from spiral.core.table.spec import Operation
|
|
8
9
|
from spiral.expressions import Expr
|
|
9
10
|
|
|
10
11
|
if TYPE_CHECKING:
|
|
11
|
-
|
|
12
|
+
import dask.distributed
|
|
13
|
+
|
|
14
|
+
from spiral import KeySpaceIndex, Table
|
|
12
15
|
|
|
13
16
|
logger = logging.getLogger(__name__)
|
|
14
17
|
|
|
@@ -47,20 +50,37 @@ class Enrichment:
|
|
|
47
50
|
"""The filter expression."""
|
|
48
51
|
return self._where
|
|
49
52
|
|
|
50
|
-
def _scan(self) ->
|
|
53
|
+
def _scan(self) -> Scan:
|
|
51
54
|
return self._table.spiral.scan(self._projection, where=self._where)
|
|
52
55
|
|
|
53
|
-
def apply(
|
|
56
|
+
def apply(
|
|
57
|
+
self, *, batch_readahead: int | None = None, partition_size_bytes: int | None = None, tx_dump: str | None = None
|
|
58
|
+
) -> None:
|
|
54
59
|
"""Apply the enrichment onto the table in a streaming fashion.
|
|
55
60
|
|
|
56
61
|
For large tables, consider using `apply_dask` for distributed execution.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
index: Optional key space index to use for sharding the enrichment.
|
|
65
|
+
If not provided, the table's default sharding will be used.
|
|
66
|
+
partition_size_bytes: The maximum partition size in bytes.
|
|
67
|
+
If not provided, the default partition size is used.
|
|
68
|
+
tx_dump: Optional path to dump the transaction JSON for debugging.
|
|
57
69
|
"""
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
70
|
+
|
|
71
|
+
txn = self._table.txn()
|
|
72
|
+
|
|
73
|
+
txn.writeback(
|
|
74
|
+
self._scan(),
|
|
75
|
+
partition_size_bytes=partition_size_bytes,
|
|
76
|
+
batch_readahead=batch_readahead,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
if txn.is_empty():
|
|
80
|
+
logger.warning("Transaction not committed. No rows were read for enrichment.")
|
|
81
|
+
return
|
|
82
|
+
|
|
83
|
+
txn.commit(tx_dump=tx_dump)
|
|
64
84
|
|
|
65
85
|
# TODO(marko): Need to figure out this sharding with key space index in places.
|
|
66
86
|
# We could compute on-demand instead of requiring a resource.
|
|
@@ -70,6 +90,7 @@ class Enrichment:
|
|
|
70
90
|
index: Optional["KeySpaceIndex"] = None,
|
|
71
91
|
partition_size_bytes: int | None = None,
|
|
72
92
|
tx_dump: str | None = None,
|
|
93
|
+
client: Optional["dask.distributed.Client"] = None,
|
|
73
94
|
**kwargs,
|
|
74
95
|
) -> None:
|
|
75
96
|
"""Use distributed Dask to apply the enrichment. Requires `dask[distributed]` to be installed.
|
|
@@ -87,16 +108,19 @@ class Enrichment:
|
|
|
87
108
|
If not provided, the table's default sharding will be used.
|
|
88
109
|
partition_size_bytes: The maximum partition size in bytes.
|
|
89
110
|
If not provided, the default partition size is used.
|
|
111
|
+
tx_dump: Optional path to dump the transaction JSON for debugging.
|
|
112
|
+
client: Optional Dask distributed client. If not provided, a new client will be created
|
|
90
113
|
**kwargs: Additional keyword arguments to pass to `dask.distributed.Client`
|
|
91
114
|
such as `address` to connect to an existing cluster.
|
|
92
115
|
"""
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
116
|
+
if client is None:
|
|
117
|
+
try:
|
|
118
|
+
from dask.distributed import Client
|
|
119
|
+
except ImportError:
|
|
120
|
+
raise ImportError("dask is not installed, please install dask[distributed] to use this feature.")
|
|
97
121
|
|
|
98
|
-
|
|
99
|
-
|
|
122
|
+
# Connect before doing any work.
|
|
123
|
+
client = Client(**kwargs)
|
|
100
124
|
|
|
101
125
|
# Start a transaction BEFORE the planning scan.
|
|
102
126
|
tx = self._table.txn()
|
|
@@ -116,10 +140,10 @@ class Enrichment:
|
|
|
116
140
|
output_table_id=self._table.table_id,
|
|
117
141
|
partition_size_bytes=partition_size_bytes,
|
|
118
142
|
)
|
|
119
|
-
enrichments =
|
|
143
|
+
enrichments = client.map(_compute, shards)
|
|
120
144
|
|
|
121
|
-
logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {
|
|
122
|
-
for result in
|
|
145
|
+
logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {client.dashboard_link}")
|
|
146
|
+
for result in client.gather(enrichments):
|
|
123
147
|
result: EnrichmentTaskResult
|
|
124
148
|
tx.include(result.ops)
|
|
125
149
|
|
spiral/expressions/file.py
CHANGED
|
@@ -2,15 +2,16 @@ from spiral import _lib
|
|
|
2
2
|
from spiral.expressions.base import Expr, ExprLike
|
|
3
3
|
|
|
4
4
|
|
|
5
|
-
def get(expr: ExprLike) -> Expr:
|
|
5
|
+
def get(expr: ExprLike, abort_on_error: bool = False) -> Expr:
|
|
6
6
|
"""Read data from the local filesystem by the file:// URL.
|
|
7
7
|
|
|
8
8
|
Args:
|
|
9
9
|
expr: URLs of the data that needs to be read.
|
|
10
|
+
abort_on_error: Should the expression abort on errors or just collect them.
|
|
10
11
|
"""
|
|
11
12
|
from spiral import expressions as se
|
|
12
13
|
|
|
13
14
|
expr = se.lift(expr)
|
|
14
15
|
|
|
15
16
|
# This just works :)
|
|
16
|
-
return Expr(_lib.expr.s3.get(expr.__expr__))
|
|
17
|
+
return Expr(_lib.expr.s3.get(expr.__expr__, abort_on_error))
|
spiral/expressions/http.py
CHANGED
|
@@ -2,15 +2,16 @@ from spiral import _lib
|
|
|
2
2
|
from spiral.expressions.base import Expr, ExprLike
|
|
3
3
|
|
|
4
4
|
|
|
5
|
-
def get(expr: ExprLike) -> Expr:
|
|
5
|
+
def get(expr: ExprLike, abort_on_error: bool = False) -> Expr:
|
|
6
6
|
"""Read data from the URL.
|
|
7
7
|
|
|
8
8
|
Args:
|
|
9
9
|
expr: URLs of the data that needs to be read.
|
|
10
|
+
abort_on_error: Should the expression abort on errors or just collect them.
|
|
10
11
|
"""
|
|
11
12
|
from spiral import expressions as se
|
|
12
13
|
|
|
13
14
|
expr = se.lift(expr)
|
|
14
15
|
|
|
15
16
|
# This just works :)
|
|
16
|
-
return Expr(_lib.expr.s3.get(expr.__expr__))
|
|
17
|
+
return Expr(_lib.expr.s3.get(expr.__expr__, abort_on_error))
|
spiral/expressions/s3.py
CHANGED
|
@@ -2,14 +2,15 @@ from spiral import _lib
|
|
|
2
2
|
from spiral.expressions.base import Expr, ExprLike
|
|
3
3
|
|
|
4
4
|
|
|
5
|
-
def get(expr: ExprLike) -> Expr:
|
|
5
|
+
def get(expr: ExprLike, abort_on_error: bool = False) -> Expr:
|
|
6
6
|
"""Read data from object storage by the s3:// URL.
|
|
7
7
|
|
|
8
8
|
Args:
|
|
9
9
|
expr: URLs of the data that needs to be read from object storage.
|
|
10
|
+
abort_on_error: Should the expression abort on errors or just collect them.
|
|
10
11
|
"""
|
|
11
12
|
from spiral import expressions as se
|
|
12
13
|
|
|
13
14
|
expr = se.lift(expr)
|
|
14
15
|
|
|
15
|
-
return Expr(_lib.expr.s3.get(expr.__expr__))
|
|
16
|
+
return Expr(_lib.expr.s3.get(expr.__expr__, abort_on_error))
|
|
File without changes
|
|
File without changes
|