pyspiral 0.6.14__cp312-abi3-manylinux_2_28_x86_64.whl → 0.6.15__cp312-abi3-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyspiral might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyspiral
3
- Version: 0.6.14
3
+ Version: 0.6.15
4
4
  Classifier: Intended Audience :: Science/Research
5
5
  Classifier: Operating System :: OS Independent
6
6
  Classifier: Programming Language :: Python
@@ -1,8 +1,8 @@
1
- pyspiral-0.6.14.dist-info/METADATA,sha256=WGdXph89n9RYh7SmWR_6GH9vAn-5suV4sAQAf1E8QCM,1875
2
- pyspiral-0.6.14.dist-info/WHEEL,sha256=ydlpo1_yEJ2g1Axq3LoOd_OfioJa2swc2j5IDCa4uho,107
3
- pyspiral-0.6.14.dist-info/entry_points.txt,sha256=uft7u-a6g40NLt4Q6BleWbK4NY0M8nZuYPpP8DV0EOk,45
1
+ pyspiral-0.6.15.dist-info/METADATA,sha256=1wRDpjCt2Im2qJm0yh94g-_VYl-19528b6-Vc_ad0D4,1875
2
+ pyspiral-0.6.15.dist-info/WHEEL,sha256=ydlpo1_yEJ2g1Axq3LoOd_OfioJa2swc2j5IDCa4uho,107
3
+ pyspiral-0.6.15.dist-info/entry_points.txt,sha256=uft7u-a6g40NLt4Q6BleWbK4NY0M8nZuYPpP8DV0EOk,45
4
4
  spiral/__init__.py,sha256=gAysTwG_oEeKVMdCOfOzDhl0bM2miiK8Ds2vvUihBWw,1153
5
- spiral/_lib.abi3.so,sha256=RY0P0UG_ejwYXZQcr3pkECfKKzRfNBnfHDBIcve6lrk,67382808
5
+ spiral/_lib.abi3.so,sha256=8HKaTcnoTj9_TI4HxYNFZIvUszRpRO9r3_YAQ_pXVno,67411480
6
6
  spiral/adbc.py,sha256=7IxfWIeQN-fh0W5OdN_PP2x3pzQYg6ZUOLsHg3jktqw,14842
7
7
  spiral/api/__init__.py,sha256=ULBlVq3PnfNOO6T5naE_ULmmii-83--qTuN2PpAUQN0,2241
8
8
  spiral/api/admin.py,sha256=A1iVR1XYJSObZivPAD5UzmPuMgupXc9kaHNYYa_kwfs,585
@@ -60,12 +60,13 @@ spiral/debug/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
60
  spiral/debug/manifests.py,sha256=7f1O3ba9mrA5nXpOF9cEIQuUAteP5wiBkFy_diQJ7No,3216
61
61
  spiral/debug/metrics.py,sha256=XdRDcjggtsLNGCAjam6IxG9072pz_d2C8iLApNRFUtk,2044
62
62
  spiral/debug/scan.py,sha256=UEm_aRnql5pwDPTpZgakMLNjlzkKL4RurBFFqH_BLAQ,9526
63
- spiral/enrichment.py,sha256=e2yzNWTTG73uEkLTc4ccTNRQ94cBtM04eGzlJ2-kBOI,5851
64
- spiral/expressions/__init__.py,sha256=Fp7Xx3exh9KJad92tgd_TGGIpYLQTHqWjW-pexzQibU,7981
63
+ spiral/enrichment.py,sha256=YDaXcJPtmJzpLrYmn2pdllVcRIkXlb578KKgkIb38Eo,6518
64
+ spiral/expressions/__init__.py,sha256=vMNFeeozkWph3dBpEkHPThUhZdT9ZZzxHe71HnkWlDU,8020
65
65
  spiral/expressions/base.py,sha256=PvhJkcUSsPSIaxirHVzM9zlqyBXiaiia1HXohXdOmL4,5377
66
+ spiral/expressions/file.py,sha256=HRzGjc3goIlUlKjysoirexDaflNdnj9OoZ6j2uTKZnA,388
66
67
  spiral/expressions/http.py,sha256=WfHVLqz_LjBr78mN3ARBRQqgBrkao7-S73JxjC4Xwvo,356
67
68
  spiral/expressions/list_.py,sha256=MMt5lf5H1M3O-x6N_PvqOLGq9NOk6Ukv0fPWwPC_uy4,1809
68
- spiral/expressions/s3.py,sha256=bkd0HANerNKlOblp2z7JJOSWjF9Bw9lZe1A-KTrUEgk,378
69
+ spiral/expressions/s3.py,sha256=d6Z_nnBRXopoPWnwGxlgjwyeYlrIMmonX-83PkE8LPo,375
69
70
  spiral/expressions/str_.py,sha256=tY8RXW3JWvr1-bEfCZtk5FAf11wKJnXPuA9EoeJ9tA4,1265
70
71
  spiral/expressions/struct.py,sha256=pGAnCDh6AK0BK1XfZ1qG4ce4ranIQEE1HQsgmzBcfwQ,2038
71
72
  spiral/expressions/text.py,sha256=-02gBWYoyNQ3qQ1--9HTa8IryUDojYQVIp8C7rgnOWQ,1893
@@ -105,4 +106,4 @@ spiral/table.py,sha256=prjDBcm6Qerdq3ypXzfbXb7ngAcO0j-Z9aTeZvzKoqs,12209
105
106
  spiral/text_index.py,sha256=FQ9rgIEGLSJryS9lFdMhKtPFey18BXoWbPXyvZPJJ04,442
106
107
  spiral/transaction.py,sha256=hQm6DfCklMDpIYJ9qA2wR45cCuUPGCiJy1tHGE3AsEY,3418
107
108
  spiral/types_.py,sha256=W_jyO7F6rpPiH69jhgSgV7OxQZbOlb1Ho3InpKUP6Eo,155
108
- pyspiral-0.6.14.dist-info/RECORD,,
109
+ pyspiral-0.6.15.dist-info/RECORD,,
spiral/_lib.abi3.so CHANGED
Binary file
spiral/enrichment.py CHANGED
@@ -64,7 +64,12 @@ class Enrichment:
64
64
  # TODO(marko): Need to figure out this sharding with key space index in places.
65
65
  # We could compute on-demand instead of requiring a resource.
66
66
  def apply_dask(
67
- self, *, index: Optional["KeySpaceIndex"] = None, partition_size_bytes: int | None = None, **kwargs
67
+ self,
68
+ *,
69
+ index: Optional["KeySpaceIndex"] = None,
70
+ partition_size_bytes: int | None = None,
71
+ tx_dump: str | None = None,
72
+ **kwargs,
68
73
  ) -> None:
69
74
  """Use distributed Dask to apply the enrichment. Requires `dask[distributed]` to be installed.
70
75
 
@@ -79,6 +84,9 @@ class Enrichment:
79
84
  Args:
80
85
  index: Optional key space index to use for sharding the enrichment.
81
86
  If not provided, the table's default sharding will be used.
87
+ partition_size_bytes: The maximum partition size in bytes.
88
+ If not provided, the default partition size is used.
89
+ tx_dump: Optional path to dump the transaction operations as a JSON file for debugging.
82
90
  **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`
83
91
  such as `address` to connect to an existing cluster.
84
92
  """
@@ -92,6 +100,7 @@ class Enrichment:
92
100
 
93
101
  # Start a transaction BEFORE the planning scan.
94
102
  tx = self._table.txn()
103
+ backup_ops = []
95
104
  plan_scan = self._table.spiral.scan(self._projection, where=self._where)
96
105
 
97
106
  # Determine the "tasks". Use the index if provided.
@@ -114,11 +123,18 @@ class Enrichment:
114
123
  for result in dask_client.gather(enrichments):
115
124
  result: EnrichmentTaskResult
116
125
  tx.include(result.ops)
126
+ backup_ops.extend(result.ops)
117
127
 
118
128
  if tx.is_empty():
119
129
  logger.warning("Transaction not committed. No rows were read for enrichment.")
120
130
  return
121
131
 
132
+ # TODO(marko): We can remove this when I have more trust in very large tx commits.
133
+ if tx_dump is not None:
134
+ with open(tx_dump, "w") as f:
135
+ f.writelines([op.to_json() for op in backup_ops])
136
+ logger.info(f"Transaction dumped to {tx_dump}")
137
+
122
138
  tx.commit()
123
139
 
124
140
 
spiral/expressions/__init__.py CHANGED
@@ -8,6 +8,7 @@ import pyarrow as pa
8
8
 
9
9
  from spiral import _lib, arrow_
10
10
 
11
+ from . import file as file
11
12
  from . import http as http
12
13
  from . import list_ as list
13
14
  from . import s3 as s3
@@ -50,6 +51,7 @@ __all__ = [
50
51
  "text",
51
52
  "s3",
52
53
  "http",
54
+ "file",
53
55
  "UDF",
54
56
  ]
55
57
 
spiral/expressions/file.py ADDED
@@ -0,0 +1,16 @@
1
+ from spiral import _lib
2
+ from spiral.expressions.base import Expr, ExprLike
3
+
4
+
5
+ def get(expr: ExprLike) -> Expr:
6
+ """Read data from the local filesystem by the file:// URL.
7
+
8
+ Args:
9
+ expr: URLs of the data that needs to be read.
10
+ """
11
+ from spiral import expressions as se
12
+
13
+ expr = se.lift(expr)
14
+
15
+ # This just works :)
16
+ return Expr(_lib.expr.s3.get(expr.__expr__))
spiral/expressions/s3.py CHANGED
@@ -3,7 +3,7 @@ from spiral.expressions.base import Expr, ExprLike
3
3
 
4
4
 
5
5
  def get(expr: ExprLike) -> Expr:
6
- """Read data from object storage by the object's URL.
6
+ """Read data from object storage by the s3:// URL.
7
7
 
8
8
  Args:
9
9
  expr: URLs of the data that needs to be read from object storage.