xarray_sql 0.2.3__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/Cargo.lock +1 -1
  2. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/Cargo.toml +1 -1
  3. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/PKG-INFO +52 -53
  4. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/README.md +51 -52
  5. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/docs/examples.md +33 -1
  6. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/pyproject.toml +1 -0
  7. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/tests/test_df.py +91 -0
  8. xarray_sql-0.3.0/tests/test_ds.py +571 -0
  9. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/tests/test_reader.py +38 -0
  10. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/tests/test_sql.py +51 -0
  11. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/xarray_sql/df.py +80 -19
  12. xarray_sql-0.3.0/xarray_sql/ds.py +838 -0
  13. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/xarray_sql/reader.py +31 -4
  14. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/xarray_sql/sql.py +69 -7
  15. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/.gitignore +0 -0
  16. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/LICENSE +0 -0
  17. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/docs/assets/logo.svg +0 -0
  18. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/docs/contributing.md +0 -0
  19. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/docs/index.md +0 -0
  20. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/docs/reference/xarray_sql.md +0 -0
  21. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/src/lib.rs +0 -0
  22. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/tests/__init__.py +0 -0
  23. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/tests/conftest.py +0 -0
  24. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/tests/test_cft.py +0 -0
  25. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/xarray_sql/__init__.py +1 -1
  26. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/xarray_sql/cftime.py +0 -0
  27. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/xarray_sql/core.py +0 -0
  28. {xarray_sql-0.2.3 → xarray_sql-0.3.0}/zensical.toml +0 -0
@@ -3375,7 +3375,7 @@ checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb"
3375
3375
 
3376
3376
  [[package]]
3377
3377
  name = "xarray_sql"
3378
- version = "0.2.3"
3378
+ version = "0.3.0"
3379
3379
  dependencies = [
3380
3380
  "arrow",
3381
3381
  "async-stream",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "xarray_sql"
3
- version = "0.2.3"
3
+ version = "0.3.0"
4
4
  authors = ["Alex Merose"]
5
5
  edition = "2021"
6
6
  exclude = [
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xarray_sql
3
- Version: 0.2.3
3
+ Version: 0.3.0
4
4
  Classifier: Development Status :: 4 - Beta
5
5
  Classifier: Intended Audience :: Science/Research
6
6
  Classifier: Intended Audience :: Developers
@@ -63,22 +63,21 @@ import xarray as xr
63
63
  import xarray_sql as xql
64
64
 
65
65
 
66
- # Open a year of ARCO-ERA5 — all 273 variables. Selecting a year up front
67
- # keeps Dask's partition setup cheap before any chunks are read from GCS.
68
- ds = (
69
- xr.open_zarr('gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
70
- chunks=dict(time=1),
71
- storage_options={'token': 'anon'}) # Anonymous read from the public GCS bucket — no auth required.
72
- .sel(time='2020')
66
+ # Open ARCO-ERA5 — a weather dataset with 273 variables since 1940.
67
+ # Turning off dask means we don't have to wait to construct a task graph.
68
+ ds = xr.open_zarr(
69
+ 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
70
+ chunks=None, # Turn dask off
71
+ storage_options={'token': 'anon'} # Anonymous read from the public GCS bucket — no auth required.
73
72
  )
74
73
 
75
74
  ctx = xql.XarrayContext()
76
- ctx.from_dataset('era5', ds, table_names={
75
+ # Make sure to pass `chunks`!
76
+ ctx.from_dataset('era5', ds, chunks=dict(time=6), table_names={
77
77
  ('time', 'latitude', 'longitude'): 'surface',
78
78
  ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
79
79
  })
80
- # Registration: ~0.5s for a full year of hourly ERA5, all variables.
81
-
80
+ # Registration takes ~10s on my machine.
82
81
 
83
82
  # Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
84
83
  # pushes column projection down to Zarr, so SELECT only fetches what you ask
@@ -100,52 +99,50 @@ ctx.sql('''
100
99
  # 0 8.640069
101
100
 
102
101
  # Average temperature per pressure level, globally.
103
- ctx.sql('''
102
+ result = ctx.sql('''
104
103
  SELECT level, AVG(temperature) - 273.15 AS avg_c
105
104
  FROM era5.atmosphere
106
105
  WHERE time BETWEEN TIMESTAMP '2020-01-01'
107
106
  AND TIMESTAMP '2020-01-01 05:00:00'
108
107
  GROUP BY level
109
108
  ORDER BY level DESC
110
- ''').to_pandas()
111
- # level avg_c
112
- # 0 1000 6.621012 ← surface
113
- # 1 975 5.185638
114
- # 2 950 4.028429
115
- # 3 925 3.082812
116
- # 4 900 2.210917
117
- # 5 875 1.395018
118
- # 6 850 0.634267
119
- # 7 825 -0.210372
120
- # 8 800 -1.181075
121
- # 9 775 -2.306465
122
- # 10 750 -3.535534
123
- # 11 700 -6.241685
124
- # 12 650 -9.236364
125
- # 13 600 -12.580938
126
- # 14 550 -16.335386
127
- # 15 500 -20.643604
128
- # 16 450 -25.573401
129
- # 17 400 -31.156920
130
- # 18 350 -37.400552
131
- # 19 300 -43.852607
132
- # 20 250 -49.322132
133
- # 21 225 -51.569113
134
- # 22 200 -53.693248
135
- # 23 175 -55.890484
136
- # 24 150 -58.382290
137
- # 25 125 -61.091916
138
- # 26 100 -63.624885 ← tropopause
139
- # 27 70 -63.182300
140
- # 28 50 -60.124845
141
- # 29 30 -55.986327
142
- # 30 20 -52.433089
143
- # 31 10 -44.140750
144
- # 32 7 -38.707350
145
- # 33 5 -32.621999
146
- # 34 3 -21.509175
147
- # 35 2 -13.355764
148
- # 36 1 -9.020513 ← top of atmosphere
109
+ ''')
110
+ # DataFrame()
111
+ # +-------+----------------------+
112
+ # | level | avg_c |
113
+ # +-------+----------------------+
114
+ # | 1000 | 6.6210120796502565 |
115
+ # | 975 | 5.185637919348153 |
116
+ # | 950 | 4.028428657263021 |
117
+ # | 925 | 3.0828117974912743 |
118
+ # | 900 | 2.2109172992531967 |
119
+ # | 875 | 1.395017610194202 |
120
+ # | 850 | 0.6342670572626616 |
121
+ # | 825 | -0.21037158786759846 |
122
+ # | 800 | -1.1810754318269687 |
123
+ # | 775 | -2.3064649711534457 |
124
+ # +-------+----------------------+
125
+
126
+ ctx.sql('''
127
+ SELECT latitude, longitude, AVG("2m_temperature") - 273.15 AS avg_c
128
+ FROM era5.surface
129
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
130
+ AND TIMESTAMP '2020-01-01 05:00:00'
131
+ GROUP BY latitude, longitude
132
+ ORDER BY latitude DESC, longitude
133
+ ''').to_dataset(dims=['latitude', 'longitude'], template=ds)
134
+ # <xarray.Dataset> Size: 8MB
135
+ # Dimensions: (latitude: 721, longitude: 1440)
136
+ # Coordinates:
137
+ # * latitude (latitude) float32 3kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
138
+ # * longitude (longitude) float32 6kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
139
+ # Data variables:
140
+ # avg_c (latitude, longitude) float64 8MB -26.84 -26.84 ... -27.38 -27.38
141
+ # Attributes:
142
+ # last_updated: 2026-06-20 02:33:34.265980+00:00
143
+ # valid_time_start: 1940-01-01
144
+ # valid_time_stop: 2025-12-31
145
+ # valid_time_stop_era5t: 2026-06-14
149
146
  ```
150
147
 
151
148
  _(A runnable version of this example lives at
@@ -225,14 +222,14 @@ _2025 update_: Something like this is being built across a few projects! The one
225
222
  _2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
226
223
 
227
224
  - [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
228
- - [DuckDB-Zarr](https://github.com/hobbes-bot/duckdb-zarr)
225
+ - [DuckDB-Zarr](https://github.com/alxmrs/duckdb-zarr)
229
226
 
230
227
  ## Roadmap
231
228
 
232
229
  - [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
233
230
  - [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
234
231
  - [x] Support core datafusion optimizations to scan less data, like [104](https://github.com/alxmrs/xarray-sql/issues/104), ...
235
- - [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
232
+ - [x] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
236
233
  - [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
237
234
  - [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
238
235
  - [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
@@ -254,6 +251,8 @@ I want to give a special thanks to the following folks and institutions:
254
251
  who are working to make this library better.
255
252
  - Andrew Huang for the sense of taste he brings to the project and consummate code
256
253
  changes.
254
+ - Aman Kumar for spending a considerable amount of his GSoC internship
255
+ contributing to this project.
257
256
 
258
257
 
259
258
  ## License
@@ -20,22 +20,21 @@ import xarray as xr
20
20
  import xarray_sql as xql
21
21
 
22
22
 
23
- # Open a year of ARCO-ERA5 — all 273 variables. Selecting a year up front
24
- # keeps Dask's partition setup cheap before any chunks are read from GCS.
25
- ds = (
26
- xr.open_zarr('gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
27
- chunks=dict(time=1),
28
- storage_options={'token': 'anon'}) # Anonymous read from the public GCS bucket — no auth required.
29
- .sel(time='2020')
23
+ # Open ARCO-ERA5 — a weather dataset with 273 variables since 1940.
24
+ # Turning off dask means we don't have to wait to construct a task graph.
25
+ ds = xr.open_zarr(
26
+ 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
27
+ chunks=None, # Turn dask off
28
+ storage_options={'token': 'anon'} # Anonymous read from the public GCS bucket — no auth required.
30
29
  )
31
30
 
32
31
  ctx = xql.XarrayContext()
33
- ctx.from_dataset('era5', ds, table_names={
32
+ # Make sure to pass `chunks`!
33
+ ctx.from_dataset('era5', ds, chunks=dict(time=6), table_names={
34
34
  ('time', 'latitude', 'longitude'): 'surface',
35
35
  ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
36
36
  })
37
- # Registration: ~0.5s for a full year of hourly ERA5, all variables.
38
-
37
+ # Registration takes ~10s on my machine.
39
38
 
40
39
  # Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
41
40
  # pushes column projection down to Zarr, so SELECT only fetches what you ask
@@ -57,52 +56,50 @@ ctx.sql('''
57
56
  # 0 8.640069
58
57
 
59
58
  # Average temperature per pressure level, globally.
60
- ctx.sql('''
59
+ result = ctx.sql('''
61
60
  SELECT level, AVG(temperature) - 273.15 AS avg_c
62
61
  FROM era5.atmosphere
63
62
  WHERE time BETWEEN TIMESTAMP '2020-01-01'
64
63
  AND TIMESTAMP '2020-01-01 05:00:00'
65
64
  GROUP BY level
66
65
  ORDER BY level DESC
67
- ''').to_pandas()
68
- # level avg_c
69
- # 0 1000 6.621012 ← surface
70
- # 1 975 5.185638
71
- # 2 950 4.028429
72
- # 3 925 3.082812
73
- # 4 900 2.210917
74
- # 5 875 1.395018
75
- # 6 850 0.634267
76
- # 7 825 -0.210372
77
- # 8 800 -1.181075
78
- # 9 775 -2.306465
79
- # 10 750 -3.535534
80
- # 11 700 -6.241685
81
- # 12 650 -9.236364
82
- # 13 600 -12.580938
83
- # 14 550 -16.335386
84
- # 15 500 -20.643604
85
- # 16 450 -25.573401
86
- # 17 400 -31.156920
87
- # 18 350 -37.400552
88
- # 19 300 -43.852607
89
- # 20 250 -49.322132
90
- # 21 225 -51.569113
91
- # 22 200 -53.693248
92
- # 23 175 -55.890484
93
- # 24 150 -58.382290
94
- # 25 125 -61.091916
95
- # 26 100 -63.624885 ← tropopause
96
- # 27 70 -63.182300
97
- # 28 50 -60.124845
98
- # 29 30 -55.986327
99
- # 30 20 -52.433089
100
- # 31 10 -44.140750
101
- # 32 7 -38.707350
102
- # 33 5 -32.621999
103
- # 34 3 -21.509175
104
- # 35 2 -13.355764
105
- # 36 1 -9.020513 ← top of atmosphere
66
+ ''')
67
+ # DataFrame()
68
+ # +-------+----------------------+
69
+ # | level | avg_c |
70
+ # +-------+----------------------+
71
+ # | 1000 | 6.6210120796502565 |
72
+ # | 975 | 5.185637919348153 |
73
+ # | 950 | 4.028428657263021 |
74
+ # | 925 | 3.0828117974912743 |
75
+ # | 900 | 2.2109172992531967 |
76
+ # | 875 | 1.395017610194202 |
77
+ # | 850 | 0.6342670572626616 |
78
+ # | 825 | -0.21037158786759846 |
79
+ # | 800 | -1.1810754318269687 |
80
+ # | 775 | -2.3064649711534457 |
81
+ # +-------+----------------------+
82
+
83
+ ctx.sql('''
84
+ SELECT latitude, longitude, AVG("2m_temperature") - 273.15 AS avg_c
85
+ FROM era5.surface
86
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
87
+ AND TIMESTAMP '2020-01-01 05:00:00'
88
+ GROUP BY latitude, longitude
89
+ ORDER BY latitude DESC, longitude
90
+ ''').to_dataset(dims=['latitude', 'longitude'], template=ds)
91
+ # <xarray.Dataset> Size: 8MB
92
+ # Dimensions: (latitude: 721, longitude: 1440)
93
+ # Coordinates:
94
+ # * latitude (latitude) float32 3kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
95
+ # * longitude (longitude) float32 6kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
96
+ # Data variables:
97
+ # avg_c (latitude, longitude) float64 8MB -26.84 -26.84 ... -27.38 -27.38
98
+ # Attributes:
99
+ # last_updated: 2026-06-20 02:33:34.265980+00:00
100
+ # valid_time_start: 1940-01-01
101
+ # valid_time_stop: 2025-12-31
102
+ # valid_time_stop_era5t: 2026-06-14
106
103
  ```
107
104
 
108
105
  _(A runnable version of this example lives at
@@ -182,14 +179,14 @@ _2025 update_: Something like this is being built across a few projects! The one
182
179
  _2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
183
180
 
184
181
  - [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
185
- - [DuckDB-Zarr](https://github.com/hobbes-bot/duckdb-zarr)
182
+ - [DuckDB-Zarr](https://github.com/alxmrs/duckdb-zarr)
186
183
 
187
184
  ## Roadmap
188
185
 
189
186
  - [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
190
187
  - [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
191
188
  - [x] Support core datafusion optimizations to scan less data, like [104](https://github.com/alxmrs/xarray-sql/issues/104), ...
192
- - [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
189
+ - [x] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
193
190
  - [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
194
191
  - [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
195
192
  - [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
@@ -211,6 +208,8 @@ I want to give a special thanks to the following folks and institutions:
211
208
  who are working to make this library better.
212
209
  - Andrew Huang for the sense of taste he brings to the project and consummate code
213
210
  changes.
211
+ - Aman Kumar for spending a considerable amount of his GSoC internship
212
+ contributing to this project.
214
213
 
215
214
 
216
215
  ## License
@@ -87,7 +87,39 @@ If you omit `table_names`, each table is named by joining its dimension names
87
87
  with underscores, e.g. `era5.time_latitude_longitude` and
88
88
  `era5.time_level_latitude_longitude`.
89
89
 
90
- A runnable version of this example lives at
90
+ ## GOES satellite imagery (scalar variables)
91
+
92
+ Real-world stores often mix gridded data with scalar (0-dimensional) metadata.
93
+ GOES satellite imagery, for example, pairs `(y, x)` image bands with dozens of
94
+ scalar variables such as `goes_imager_projection`. `from_dataset` groups all the
95
+ scalars into a single one-row table named `scalar`:
96
+
97
+ ```python
98
+ import fsspec
99
+ import xarray as xr
100
+ from xarray_sql import XarrayContext
101
+
102
+ # A real GOES-16 ABI cloud-and-moisture file from NOAA's public bucket:
103
+ # (y, x) image bands alongside dozens of scalar metadata variables.
104
+ url = (
105
+ 'https://noaa-goes16.s3.amazonaws.com/ABI-L2-MCMIPM/2024/001/00/'
106
+ 'OR_ABI-L2-MCMIPM1-M6_G16_s20240010000281_e20240010000350_c20240010000426.nc'
107
+ )
108
+ ds = xr.open_dataset(fsspec.open_local(f'simplecache::{url}')).chunk(
109
+ {'y': 250, 'x': 250}
110
+ )
111
+
112
+ ctx = XarrayContext()
113
+ ctx.from_dataset('goes', ds)
114
+
115
+ # The gridded bands and the scalar metadata are separate tables.
116
+ ctx.sql('SELECT COUNT(*) AS n FROM goes.y_x').to_pandas()['n'][0] # -> 250000
117
+ ctx.sql('SELECT * FROM goes.scalar').to_pandas().shape # -> (1, 89)
118
+ ```
119
+
120
+ Override the default name like any other group with `table_names={(): 'metadata'}`.
121
+
122
+ A runnable version of the ERA5 example lives at
91
123
  [`perf_tests/era5_temp_profile.py`](../perf_tests/era5_temp_profile.py).
92
124
 
93
125
  [arco-era5]: https://github.com/google-research/arco-era5
@@ -90,6 +90,7 @@ module = [
90
90
  "pyarrow.*",
91
91
  "datafusion.*",
92
92
  "xarray.*",
93
+ "pandas.*",
93
94
  ]
94
95
  ignore_missing_imports = true
95
96
 
@@ -3,12 +3,14 @@ import tracemalloc
3
3
  import numpy as np
4
4
  import pandas as pd
5
5
  import pyarrow as pa
6
+ import pytest
6
7
  import xarray as xr
7
8
 
8
9
  from xarray_sql.df import (
9
10
  DEFAULT_BATCH_SIZE,
10
11
  _parse_schema,
11
12
  block_slices,
13
+ compute_chunks,
12
14
  dataset_to_record_batch,
13
15
  explode,
14
16
  from_map,
@@ -58,6 +60,37 @@ def test_explode_data_equal_one_last(air):
58
60
  assert air.isel(iselection).equals(ds)
59
61
 
60
62
 
63
+ def test_block_slices_scalar_dataset_yields_single_block():
64
+ # A dimensionless dataset (e.g. scalar metadata variables) has exactly
65
+ # one block: the whole, empty selection.
66
+ ds = xr.Dataset({"projection": ((), 0)})
67
+ assert list(block_slices(ds)) == [{}]
68
+
69
+
70
+ def test_block_slices_scalar_ignores_irrelevant_chunks():
71
+ ds = xr.Dataset({"projection": ((), 0)})
72
+ assert list(block_slices(ds, chunks={"time": 4})) == [{}]
73
+
74
+
75
+ def test_block_slices_filters_chunk_keys_to_dataset_dims(air_small):
76
+ # A chunk key for a dimension the dataset doesn't have is ignored,
77
+ # rather than raising.
78
+ base = list(block_slices(air_small, chunks={"time": 4, "lat": 3, "lon": 4}))
79
+ extra = list(
80
+ block_slices(
81
+ air_small, chunks={"time": 4, "lat": 3, "lon": 4, "absent": 2}
82
+ )
83
+ )
84
+ assert len(extra) == len(base)
85
+
86
+
87
+ def test_block_slices_dimensional_unchunked_raises():
88
+ # A dataset with dimensions but no chunking is still a user error.
89
+ ds = xr.Dataset({"v": (["x"], np.arange(3))}, coords={"x": np.arange(3)})
90
+ with pytest.raises(AssertionError):
91
+ list(block_slices(ds))
92
+
93
+
61
94
  def test_from_map_basic():
62
95
  def make_df(x):
63
96
  return pd.DataFrame({"value": [x, x * 2], "index": [0, 1]})
@@ -441,3 +474,61 @@ def test_read_xarray_table_memory_bounds(large_ds):
441
474
  )
442
475
  finally:
443
476
  tracemalloc.stop()
477
+
478
+
479
+ # ---------------------------------------------------------------------------
480
+ # compute_chunks: arithmetic replacement for ds.chunk(...).chunks.
481
+ # Dask serves as the source of truth.
482
+ # ---------------------------------------------------------------------------
483
+
484
+
485
+ def _dask_chunks(ds: xr.Dataset, chunks: dict) -> dict:
486
+ rechunked = ds.copy(data=None, deep=False).chunk(chunks)
487
+ return {str(k): tuple(v) for k, v in rechunked.chunks.items()}
488
+
489
+
490
+ def _normalise(result: dict) -> dict:
491
+ return {str(k): tuple(v) for k, v in result.items()}
492
+
493
+
494
+ def _simple_ds(shape: tuple[int, ...], dims: tuple[str, ...]) -> xr.Dataset:
495
+ return xr.Dataset(
496
+ {"v": (dims, np.zeros(shape))},
497
+ coords={d: np.arange(s) for d, s in zip(dims, shape)},
498
+ )
499
+
500
+
501
+ @pytest.mark.parametrize(
502
+ "ds,chunks",
503
+ [
504
+ # Even divide on a single dim.
505
+ (_simple_ds((10,), ("x",)), {"x": 5}),
506
+ # Uneven divide: trailing remainder chunk.
507
+ (_simple_ds((10,), ("x",)), {"x": 3}),
508
+ # Requested chunk size larger than the dim → single chunk.
509
+ (_simple_ds((5,), ("x",)), {"x": 100}),
510
+ # Multi-dim spec with a dim left unspecified (kept as one chunk).
511
+ (_simple_ds((4, 6), ("x", "y")), {"x": 2}),
512
+ # Multi-dim spec rechunking every dim.
513
+ (_simple_ds((7, 11, 13), ("a", "b", "c")), {"a": 3, "b": 4, "c": 5}),
514
+ ],
515
+ )
516
+ def test_compute_chunks_matches_dask(ds, chunks):
517
+ assert _normalise(compute_chunks(ds, chunks)) == _dask_chunks(ds, chunks)
518
+
519
+
520
+ def test_compute_chunks_preserves_existing_dask_chunking():
521
+ # When the dataset is already dask-backed, rechunking one dim must
522
+ # leave other dims' existing chunk tuples alone.
523
+ ds = _simple_ds((4, 5), ("x", "y")).chunk({"x": 1, "y": 2})
524
+ chunks = {"x": 2}
525
+ assert _normalise(compute_chunks(ds, chunks)) == _dask_chunks(ds, chunks)
526
+
527
+
528
+ def test_compute_chunks_tuples_sum_to_dim_size():
529
+ # Dask-independent invariant: every per-dim chunk tuple must fully
530
+ # cover its dimension.
531
+ ds = _simple_ds((7, 11, 13), ("a", "b", "c"))
532
+ result = compute_chunks(ds, {"a": 3, "b": 4, "c": 5})
533
+ for dim, tup in result.items():
534
+ assert sum(tup) == ds.sizes[dim]