xarray_sql 0.2.3__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/Cargo.lock +1 -1
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/Cargo.toml +1 -1
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/PKG-INFO +52 -53
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/README.md +51 -52
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/docs/examples.md +33 -1
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/pyproject.toml +1 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/tests/test_df.py +91 -0
- xarray_sql-0.3.0/tests/test_ds.py +571 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/tests/test_reader.py +38 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/tests/test_sql.py +51 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/xarray_sql/df.py +80 -19
- xarray_sql-0.3.0/xarray_sql/ds.py +838 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/xarray_sql/reader.py +31 -4
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/xarray_sql/sql.py +69 -7
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/.gitignore +0 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/LICENSE +0 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/docs/assets/logo.svg +0 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/docs/contributing.md +0 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/docs/index.md +0 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/docs/reference/xarray_sql.md +0 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/src/lib.rs +0 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/tests/__init__.py +0 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/tests/conftest.py +0 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/tests/test_cft.py +0 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/xarray_sql/__init__.py +1 -1
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/xarray_sql/cftime.py +0 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/xarray_sql/core.py +0 -0
- {xarray_sql-0.2.3 → xarray_sql-0.3.0}/zensical.toml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xarray_sql
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Classifier: Development Status :: 4 - Beta
|
|
5
5
|
Classifier: Intended Audience :: Science/Research
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -63,22 +63,21 @@ import xarray as xr
|
|
|
63
63
|
import xarray_sql as xql
|
|
64
64
|
|
|
65
65
|
|
|
66
|
-
# Open
|
|
67
|
-
#
|
|
68
|
-
ds = (
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
.sel(time='2020')
|
|
66
|
+
# Open ARCO-ERA5 — a weather dataset with 273 variables since 1940.
|
|
67
|
+
# Turning off dask means we don't have to wait to construct a task graph.
|
|
68
|
+
ds = xr.open_zarr(
|
|
69
|
+
'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
|
|
70
|
+
chunks=None, # Turn dask off
|
|
71
|
+
storage_options={'token': 'anon'} # Anonymous read from the public GCS bucket — no auth required.
|
|
73
72
|
)
|
|
74
73
|
|
|
75
74
|
ctx = xql.XarrayContext()
|
|
76
|
-
|
|
75
|
+
# Make sure to pass `chunks`!
|
|
76
|
+
ctx.from_dataset('era5', ds, chunks=dict(time=6), table_names={
|
|
77
77
|
('time', 'latitude', 'longitude'): 'surface',
|
|
78
78
|
('time', 'level', 'latitude', 'longitude'): 'atmosphere',
|
|
79
79
|
})
|
|
80
|
-
# Registration
|
|
81
|
-
|
|
80
|
+
# Registration takes ~10s on my machine.
|
|
82
81
|
|
|
83
82
|
# Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
|
|
84
83
|
# pushes column projection down to Zarr, so SELECT only fetches what you ask
|
|
@@ -100,52 +99,50 @@ ctx.sql('''
|
|
|
100
99
|
# 0 8.640069
|
|
101
100
|
|
|
102
101
|
# Average temperature per pressure level, globally.
|
|
103
|
-
ctx.sql('''
|
|
102
|
+
result = ctx.sql('''
|
|
104
103
|
SELECT level, AVG(temperature) - 273.15 AS avg_c
|
|
105
104
|
FROM era5.atmosphere
|
|
106
105
|
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
107
106
|
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
108
107
|
GROUP BY level
|
|
109
108
|
ORDER BY level DESC
|
|
110
|
-
''')
|
|
111
|
-
#
|
|
112
|
-
#
|
|
113
|
-
#
|
|
114
|
-
#
|
|
115
|
-
#
|
|
116
|
-
#
|
|
117
|
-
#
|
|
118
|
-
#
|
|
119
|
-
#
|
|
120
|
-
#
|
|
121
|
-
#
|
|
122
|
-
#
|
|
123
|
-
#
|
|
124
|
-
#
|
|
125
|
-
#
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
#
|
|
136
|
-
#
|
|
137
|
-
#
|
|
138
|
-
#
|
|
139
|
-
#
|
|
140
|
-
#
|
|
141
|
-
#
|
|
142
|
-
#
|
|
143
|
-
#
|
|
144
|
-
#
|
|
145
|
-
#
|
|
146
|
-
#
|
|
147
|
-
# 35 2 -13.355764
|
|
148
|
-
# 36 1 -9.020513 ← top of atmosphere
|
|
109
|
+
''')
|
|
110
|
+
# DataFrame()
|
|
111
|
+
# +-------+----------------------+
|
|
112
|
+
# | level | avg_c |
|
|
113
|
+
# +-------+----------------------+
|
|
114
|
+
# | 1000 | 6.6210120796502565 |
|
|
115
|
+
# | 975 | 5.185637919348153 |
|
|
116
|
+
# | 950 | 4.028428657263021 |
|
|
117
|
+
# | 925 | 3.0828117974912743 |
|
|
118
|
+
# | 900 | 2.2109172992531967 |
|
|
119
|
+
# | 875 | 1.395017610194202 |
|
|
120
|
+
# | 850 | 0.6342670572626616 |
|
|
121
|
+
# | 825 | -0.21037158786759846 |
|
|
122
|
+
# | 800 | -1.1810754318269687 |
|
|
123
|
+
# | 775 | -2.3064649711534457 |
|
|
124
|
+
# +-------+----------------------+
|
|
125
|
+
|
|
126
|
+
ctx.sql('''
|
|
127
|
+
SELECT latitude, longitude, AVG("2m_temperature") - 273.15 AS avg_c
|
|
128
|
+
FROM era5.surface
|
|
129
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
130
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
131
|
+
GROUP BY latitude, longitude
|
|
132
|
+
ORDER BY latitude DESC, longitude
|
|
133
|
+
''').to_dataset(dims=['latitude', 'longitude'], template=ds)
|
|
134
|
+
# <xarray.Dataset> Size: 8MB
|
|
135
|
+
# Dimensions: (latitude: 721, longitude: 1440)
|
|
136
|
+
# Coordinates:
|
|
137
|
+
# * latitude (latitude) float32 3kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
|
|
138
|
+
# * longitude (longitude) float32 6kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
|
|
139
|
+
# Data variables:
|
|
140
|
+
# avg_c (latitude, longitude) float64 8MB -26.84 -26.84 ... -27.38 -27.38
|
|
141
|
+
# Attributes:
|
|
142
|
+
# last_updated: 2026-06-20 02:33:34.265980+00:00
|
|
143
|
+
# valid_time_start: 1940-01-01
|
|
144
|
+
# valid_time_stop: 2025-12-31
|
|
145
|
+
# valid_time_stop_era5t: 2026-06-14
|
|
149
146
|
```
|
|
150
147
|
|
|
151
148
|
_(A runnable version of this example lives at
|
|
@@ -225,14 +222,14 @@ _2025 update_: Something like this is being built across a few projects! The one
|
|
|
225
222
|
_2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
|
|
226
223
|
|
|
227
224
|
- [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
|
|
228
|
-
- [DuckDB-Zarr](https://github.com/
|
|
225
|
+
- [DuckDB-Zarr](https://github.com/alxmrs/duckdb-zarr)
|
|
229
226
|
|
|
230
227
|
## Roadmap
|
|
231
228
|
|
|
232
229
|
- [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
|
|
233
230
|
- [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
|
|
234
231
|
- [x] Support core datafusion optimizations to scan less data, like [104](https://github.com/alxmrs/xarray-sql/issues/104), ...
|
|
235
|
-
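- [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).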
|
|
232
|
+
- [x] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
|
|
236
233
|
- [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
|
|
237
234
|
- [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
|
|
238
235
|
- [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
|
|
@@ -254,6 +251,8 @@ I want to give a special thanks to the following folks and institutions:
|
|
|
254
251
|
who are working to make this library better.
|
|
255
252
|
- Andrew Huang for the sense of taste he brings to the project and consummate code
|
|
256
253
|
changes.
|
|
254
|
+
- Aman Kumar for spending a considerable amount of his GSoC internship
|
|
255
|
+
contributing to this project.
|
|
257
256
|
|
|
258
257
|
|
|
259
258
|
## License
|
|
@@ -20,22 +20,21 @@ import xarray as xr
|
|
|
20
20
|
import xarray_sql as xql
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
# Open
|
|
24
|
-
#
|
|
25
|
-
ds = (
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
.sel(time='2020')
|
|
23
|
+
# Open ARCO-ERA5 — a weather dataset with 273 variables since 1940.
|
|
24
|
+
# Turning off dask means we don't have to wait to construct a task graph.
|
|
25
|
+
ds = xr.open_zarr(
|
|
26
|
+
'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
|
|
27
|
+
chunks=None, # Turn dask off
|
|
28
|
+
storage_options={'token': 'anon'} # Anonymous read from the public GCS bucket — no auth required.
|
|
30
29
|
)
|
|
31
30
|
|
|
32
31
|
ctx = xql.XarrayContext()
|
|
33
|
-
|
|
32
|
+
# Make sure to pass `chunks`!
|
|
33
|
+
ctx.from_dataset('era5', ds, chunks=dict(time=6), table_names={
|
|
34
34
|
('time', 'latitude', 'longitude'): 'surface',
|
|
35
35
|
('time', 'level', 'latitude', 'longitude'): 'atmosphere',
|
|
36
36
|
})
|
|
37
|
-
# Registration
|
|
38
|
-
|
|
37
|
+
# Registration takes ~10s on my machine.
|
|
39
38
|
|
|
40
39
|
# Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
|
|
41
40
|
# pushes column projection down to Zarr, so SELECT only fetches what you ask
|
|
@@ -57,52 +56,50 @@ ctx.sql('''
|
|
|
57
56
|
# 0 8.640069
|
|
58
57
|
|
|
59
58
|
# Average temperature per pressure level, globally.
|
|
60
|
-
ctx.sql('''
|
|
59
|
+
result = ctx.sql('''
|
|
61
60
|
SELECT level, AVG(temperature) - 273.15 AS avg_c
|
|
62
61
|
FROM era5.atmosphere
|
|
63
62
|
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
64
63
|
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
65
64
|
GROUP BY level
|
|
66
65
|
ORDER BY level DESC
|
|
67
|
-
''')
|
|
68
|
-
#
|
|
69
|
-
#
|
|
70
|
-
#
|
|
71
|
-
#
|
|
72
|
-
#
|
|
73
|
-
#
|
|
74
|
-
#
|
|
75
|
-
#
|
|
76
|
-
#
|
|
77
|
-
#
|
|
78
|
-
#
|
|
79
|
-
#
|
|
80
|
-
#
|
|
81
|
-
#
|
|
82
|
-
#
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
#
|
|
93
|
-
#
|
|
94
|
-
#
|
|
95
|
-
#
|
|
96
|
-
#
|
|
97
|
-
#
|
|
98
|
-
#
|
|
99
|
-
#
|
|
100
|
-
#
|
|
101
|
-
#
|
|
102
|
-
#
|
|
103
|
-
#
|
|
104
|
-
# 35 2 -13.355764
|
|
105
|
-
# 36 1 -9.020513 ← top of atmosphere
|
|
66
|
+
''')
|
|
67
|
+
# DataFrame()
|
|
68
|
+
# +-------+----------------------+
|
|
69
|
+
# | level | avg_c |
|
|
70
|
+
# +-------+----------------------+
|
|
71
|
+
# | 1000 | 6.6210120796502565 |
|
|
72
|
+
# | 975 | 5.185637919348153 |
|
|
73
|
+
# | 950 | 4.028428657263021 |
|
|
74
|
+
# | 925 | 3.0828117974912743 |
|
|
75
|
+
# | 900 | 2.2109172992531967 |
|
|
76
|
+
# | 875 | 1.395017610194202 |
|
|
77
|
+
# | 850 | 0.6342670572626616 |
|
|
78
|
+
# | 825 | -0.21037158786759846 |
|
|
79
|
+
# | 800 | -1.1810754318269687 |
|
|
80
|
+
# | 775 | -2.3064649711534457 |
|
|
81
|
+
# +-------+----------------------+
|
|
82
|
+
|
|
83
|
+
ctx.sql('''
|
|
84
|
+
SELECT latitude, longitude, AVG("2m_temperature") - 273.15 AS avg_c
|
|
85
|
+
FROM era5.surface
|
|
86
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
87
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
88
|
+
GROUP BY latitude, longitude
|
|
89
|
+
ORDER BY latitude DESC, longitude
|
|
90
|
+
''').to_dataset(dims=['latitude', 'longitude'], template=ds)
|
|
91
|
+
# <xarray.Dataset> Size: 8MB
|
|
92
|
+
# Dimensions: (latitude: 721, longitude: 1440)
|
|
93
|
+
# Coordinates:
|
|
94
|
+
# * latitude (latitude) float32 3kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
|
|
95
|
+
# * longitude (longitude) float32 6kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
|
|
96
|
+
# Data variables:
|
|
97
|
+
# avg_c (latitude, longitude) float64 8MB -26.84 -26.84 ... -27.38 -27.38
|
|
98
|
+
# Attributes:
|
|
99
|
+
# last_updated: 2026-06-20 02:33:34.265980+00:00
|
|
100
|
+
# valid_time_start: 1940-01-01
|
|
101
|
+
# valid_time_stop: 2025-12-31
|
|
102
|
+
# valid_time_stop_era5t: 2026-06-14
|
|
106
103
|
```
|
|
107
104
|
|
|
108
105
|
_(A runnable version of this example lives at
|
|
@@ -182,14 +179,14 @@ _2025 update_: Something like this is being built across a few projects! The one
|
|
|
182
179
|
_2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
|
|
183
180
|
|
|
184
181
|
- [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
|
|
185
|
-
- [DuckDB-Zarr](https://github.com/
|
|
182
|
+
- [DuckDB-Zarr](https://github.com/alxmrs/duckdb-zarr)
|
|
186
183
|
|
|
187
184
|
## Roadmap
|
|
188
185
|
|
|
189
186
|
- [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
|
|
190
187
|
- [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
|
|
191
188
|
- [x] Support core datafusion optimizations to scan less data, like [104](https://github.com/alxmrs/xarray-sql/issues/104), ...
|
|
192
|
-
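- [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).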
|
|
189
|
+
- [x] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
|
|
193
190
|
- [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
|
|
194
191
|
- [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
|
|
195
192
|
- [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
|
|
@@ -211,6 +208,8 @@ I want to give a special thanks to the following folks and institutions:
|
|
|
211
208
|
who are working to make this library better.
|
|
212
209
|
- Andrew Huang for the sense of taste he brings to the project and consummate code
|
|
213
210
|
changes.
|
|
211
|
+
- Aman Kumar for spending a considerable amount of his GSoC internship
|
|
212
|
+
contributing to this project.
|
|
214
213
|
|
|
215
214
|
|
|
216
215
|
## License
|
|
@@ -87,7 +87,39 @@ If you omit `table_names`, each table is named by joining its dimension names
|
|
|
87
87
|
with underscores, e.g. `era5.time_latitude_longitude` and
|
|
88
88
|
`era5.time_level_latitude_longitude`.
|
|
89
89
|
|
|
90
|
-
|
|
90
|
+
## GOES satellite imagery (scalar variables)
|
|
91
|
+
|
|
92
|
+
Real-world stores often mix gridded data with scalar (0-dimensional) metadata.
|
|
93
|
+
GOES satellite imagery, for example, pairs `(y, x)` image bands with dozens of
|
|
94
|
+
scalar variables such as `goes_imager_projection`. `from_dataset` groups all the
|
|
95
|
+
scalars into a single one-row table named `scalar`:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
import fsspec
|
|
99
|
+
import xarray as xr
|
|
100
|
+
from xarray_sql import XarrayContext
|
|
101
|
+
|
|
102
|
+
# A real GOES-16 ABI cloud-and-moisture file from NOAA's public bucket:
|
|
103
|
+
# (y, x) image bands alongside dozens of scalar metadata variables.
|
|
104
|
+
url = (
|
|
105
|
+
'https://noaa-goes16.s3.amazonaws.com/ABI-L2-MCMIPM/2024/001/00/'
|
|
106
|
+
'OR_ABI-L2-MCMIPM1-M6_G16_s20240010000281_e20240010000350_c20240010000426.nc'
|
|
107
|
+
)
|
|
108
|
+
ds = xr.open_dataset(fsspec.open_local(f'simplecache::{url}')).chunk(
|
|
109
|
+
{'y': 250, 'x': 250}
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
ctx = XarrayContext()
|
|
113
|
+
ctx.from_dataset('goes', ds)
|
|
114
|
+
|
|
115
|
+
# The gridded bands and the scalar metadata are separate tables.
|
|
116
|
+
ctx.sql('SELECT COUNT(*) AS n FROM goes.y_x').to_pandas()['n'][0] # -> 250000
|
|
117
|
+
ctx.sql('SELECT * FROM goes.scalar').to_pandas().shape # -> (1, 89)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Override the default name like any other group with `table_names={(): 'metadata'}`.
|
|
121
|
+
|
|
122
|
+
A runnable version of the ERA5 example lives at
|
|
91
123
|
[`perf_tests/era5_temp_profile.py`](../perf_tests/era5_temp_profile.py).
|
|
92
124
|
|
|
93
125
|
[arco-era5]: https://github.com/google-research/arco-era5
|
|
@@ -3,12 +3,14 @@ import tracemalloc
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
import pandas as pd
|
|
5
5
|
import pyarrow as pa
|
|
6
|
+
import pytest
|
|
6
7
|
import xarray as xr
|
|
7
8
|
|
|
8
9
|
from xarray_sql.df import (
|
|
9
10
|
DEFAULT_BATCH_SIZE,
|
|
10
11
|
_parse_schema,
|
|
11
12
|
block_slices,
|
|
13
|
+
compute_chunks,
|
|
12
14
|
dataset_to_record_batch,
|
|
13
15
|
explode,
|
|
14
16
|
from_map,
|
|
@@ -58,6 +60,37 @@ def test_explode_data_equal_one_last(air):
|
|
|
58
60
|
assert air.isel(iselection).equals(ds)
|
|
59
61
|
|
|
60
62
|
|
|
63
|
+
def test_block_slices_scalar_dataset_yields_single_block():
|
|
64
|
+
# A dimensionless dataset (e.g. scalar metadata variables) has exactly
|
|
65
|
+
# one block: the whole, empty selection.
|
|
66
|
+
ds = xr.Dataset({"projection": ((), 0)})
|
|
67
|
+
assert list(block_slices(ds)) == [{}]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_block_slices_scalar_ignores_irrelevant_chunks():
|
|
71
|
+
ds = xr.Dataset({"projection": ((), 0)})
|
|
72
|
+
assert list(block_slices(ds, chunks={"time": 4})) == [{}]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def test_block_slices_filters_chunk_keys_to_dataset_dims(air_small):
|
|
76
|
+
# A chunk key for a dimension the dataset doesn't have is ignored,
|
|
77
|
+
# rather than raising.
|
|
78
|
+
base = list(block_slices(air_small, chunks={"time": 4, "lat": 3, "lon": 4}))
|
|
79
|
+
extra = list(
|
|
80
|
+
block_slices(
|
|
81
|
+
air_small, chunks={"time": 4, "lat": 3, "lon": 4, "absent": 2}
|
|
82
|
+
)
|
|
83
|
+
)
|
|
84
|
+
assert len(extra) == len(base)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def test_block_slices_dimensional_unchunked_raises():
|
|
88
|
+
# A dataset with dimensions but no chunking is still a user error.
|
|
89
|
+
ds = xr.Dataset({"v": (["x"], np.arange(3))}, coords={"x": np.arange(3)})
|
|
90
|
+
with pytest.raises(AssertionError):
|
|
91
|
+
list(block_slices(ds))
|
|
92
|
+
|
|
93
|
+
|
|
61
94
|
def test_from_map_basic():
|
|
62
95
|
def make_df(x):
|
|
63
96
|
return pd.DataFrame({"value": [x, x * 2], "index": [0, 1]})
|
|
@@ -441,3 +474,61 @@ def test_read_xarray_table_memory_bounds(large_ds):
|
|
|
441
474
|
)
|
|
442
475
|
finally:
|
|
443
476
|
tracemalloc.stop()
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
# ---------------------------------------------------------------------------
|
|
480
|
+
# compute_chunks: arithmetic replacement for ds.chunk(...).chunks.
|
|
481
|
+
# Dask serves as the source of truth.
|
|
482
|
+
# ---------------------------------------------------------------------------
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def _dask_chunks(ds: xr.Dataset, chunks: dict) -> dict:
|
|
486
|
+
rechunked = ds.copy(data=None, deep=False).chunk(chunks)
|
|
487
|
+
return {str(k): tuple(v) for k, v in rechunked.chunks.items()}
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
def _normalise(result: dict) -> dict:
|
|
491
|
+
return {str(k): tuple(v) for k, v in result.items()}
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def _simple_ds(shape: tuple[int, ...], dims: tuple[str, ...]) -> xr.Dataset:
|
|
495
|
+
return xr.Dataset(
|
|
496
|
+
{"v": (dims, np.zeros(shape))},
|
|
497
|
+
coords={d: np.arange(s) for d, s in zip(dims, shape)},
|
|
498
|
+
)
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
@pytest.mark.parametrize(
|
|
502
|
+
"ds,chunks",
|
|
503
|
+
[
|
|
504
|
+
# Even divide on a single dim.
|
|
505
|
+
(_simple_ds((10,), ("x",)), {"x": 5}),
|
|
506
|
+
# Uneven divide: trailing remainder chunk.
|
|
507
|
+
(_simple_ds((10,), ("x",)), {"x": 3}),
|
|
508
|
+
# Requested chunk size larger than the dim → single chunk.
|
|
509
|
+
(_simple_ds((5,), ("x",)), {"x": 100}),
|
|
510
|
+
# Multi-dim spec with a dim left unspecified (kept as one chunk).
|
|
511
|
+
(_simple_ds((4, 6), ("x", "y")), {"x": 2}),
|
|
512
|
+
# Multi-dim spec rechunking every dim.
|
|
513
|
+
(_simple_ds((7, 11, 13), ("a", "b", "c")), {"a": 3, "b": 4, "c": 5}),
|
|
514
|
+
],
|
|
515
|
+
)
|
|
516
|
+
def test_compute_chunks_matches_dask(ds, chunks):
|
|
517
|
+
assert _normalise(compute_chunks(ds, chunks)) == _dask_chunks(ds, chunks)
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
def test_compute_chunks_preserves_existing_dask_chunking():
|
|
521
|
+
# When the dataset is already dask-backed, rechunking one dim must
|
|
522
|
+
# leave other dims' existing chunk tuples alone.
|
|
523
|
+
ds = _simple_ds((4, 5), ("x", "y")).chunk({"x": 1, "y": 2})
|
|
524
|
+
chunks = {"x": 2}
|
|
525
|
+
assert _normalise(compute_chunks(ds, chunks)) == _dask_chunks(ds, chunks)
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def test_compute_chunks_tuples_sum_to_dim_size():
|
|
529
|
+
# Dask-independent invariant: every per-dim chunk tuple must fully
|
|
530
|
+
# cover its dimension.
|
|
531
|
+
ds = _simple_ds((7, 11, 13), ("a", "b", "c"))
|
|
532
|
+
result = compute_chunks(ds, {"a": 3, "b": 4, "c": 5})
|
|
533
|
+
for dim, tup in result.items():
|
|
534
|
+
assert sum(tup) == ds.sizes[dim]
|