xarray_sql 0.2.2__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/Cargo.lock +1 -1
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/Cargo.toml +1 -1
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/PKG-INFO +86 -42
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/README.md +85 -41
- xarray_sql-0.3.0/docs/examples.md +125 -0
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/pyproject.toml +8 -5
- xarray_sql-0.3.0/tests/conftest.py +150 -0
- xarray_sql-0.3.0/tests/test_cft.py +170 -0
- xarray_sql-0.3.0/tests/test_df.py +534 -0
- xarray_sql-0.3.0/tests/test_ds.py +571 -0
- xarray_sql-0.3.0/tests/test_reader.py +1415 -0
- xarray_sql-0.3.0/tests/test_sql.py +490 -0
- xarray_sql-0.3.0/xarray_sql/cftime.py +248 -0
- xarray_sql-0.3.0/xarray_sql/core.py +49 -0
- xarray_sql-0.3.0/xarray_sql/df.py +508 -0
- xarray_sql-0.3.0/xarray_sql/ds.py +838 -0
- xarray_sql-0.3.0/xarray_sql/reader.py +332 -0
- xarray_sql-0.3.0/xarray_sql/sql.py +191 -0
- xarray_sql-0.2.2/docs/examples.md +0 -23
- xarray_sql-0.2.2/tests/conftest.py +0 -144
- xarray_sql-0.2.2/tests/test_cft.py +0 -176
- xarray_sql-0.2.2/tests/test_df.py +0 -428
- xarray_sql-0.2.2/tests/test_reader.py +0 -1372
- xarray_sql-0.2.2/tests/test_sql.py +0 -318
- xarray_sql-0.2.2/xarray_sql/cftime.py +0 -248
- xarray_sql-0.2.2/xarray_sql/core.py +0 -49
- xarray_sql-0.2.2/xarray_sql/df.py +0 -445
- xarray_sql-0.2.2/xarray_sql/reader.py +0 -299
- xarray_sql-0.2.2/xarray_sql/sql.py +0 -63
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/.gitignore +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/LICENSE +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/docs/assets/logo.svg +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/docs/contributing.md +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/docs/index.md +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/docs/reference/xarray_sql.md +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/src/lib.rs +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/tests/__init__.py +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/xarray_sql/__init__.py +1 -1
- {xarray_sql-0.2.2 → xarray_sql-0.3.0}/zensical.toml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xarray_sql
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Classifier: Development Status :: 4 - Beta
|
|
5
5
|
Classifier: Intended Audience :: Science/Research
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -62,52 +62,94 @@ This is an experiment to provide a SQL interface for array datasets.
|
|
|
62
62
|
import xarray as xr
|
|
63
63
|
import xarray_sql as xql
|
|
64
64
|
|
|
65
|
-
ds = xr.tutorial.open_dataset('air_temperature')
|
|
66
65
|
|
|
67
|
-
#
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
66
|
+
# Open ARCO-ERA5 — a weather dataset with 273 variables since 1940.
|
|
67
|
+
# Turning off dask means we don't have to wait to construct a task graph.
|
|
68
|
+
ds = xr.open_zarr(
|
|
69
|
+
'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
|
|
70
|
+
chunks=None, # Turn dask off
|
|
71
|
+
storage_options={'token': 'anon'} # Anonymous read from the public GCS bucket — no auth required.
|
|
72
|
+
)
|
|
71
73
|
|
|
74
|
+
ctx = xql.XarrayContext()
|
|
75
|
+
# Make sure to pass `chunks`!
|
|
76
|
+
ctx.from_dataset('era5', ds, chunks=dict(time=6), table_names={
|
|
77
|
+
('time', 'latitude', 'longitude'): 'surface',
|
|
78
|
+
('time', 'level', 'latitude', 'longitude'): 'atmosphere',
|
|
79
|
+
})
|
|
80
|
+
# Registration takes ~10s on my machine.
|
|
81
|
+
|
|
82
|
+
# Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
|
|
83
|
+
# pushes column projection down to Zarr, so SELECT only fetches what you ask
|
|
84
|
+
# for — but `SELECT * FROM era5.surface` would try to pull every variable
|
|
85
|
+
# across the full record since 1940 (terabytes from GCS).
|
|
86
|
+
# ---> Always SELECT specific columns. <---
|
|
87
|
+
|
|
88
|
+
# Average 2m-temperature over NYC on the morning of 2020-01-01. The library
|
|
89
|
+
# pushes WHERE clauses on dimension columns down to partition pruning.
|
|
90
|
+
ctx.sql('''
|
|
91
|
+
SELECT AVG("2m_temperature") - 273.15 AS avg_c
|
|
92
|
+
FROM era5.surface
|
|
93
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
94
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
95
|
+
AND latitude BETWEEN 39 AND 40
|
|
96
|
+
AND longitude BETWEEN 286 AND 287 -- ERA5 uses 0-360 longitudes
|
|
97
|
+
''').to_pandas()
|
|
98
|
+
# avg_c
|
|
99
|
+
# 0 8.640069
|
|
100
|
+
|
|
101
|
+
# Average temperature per pressure level, globally.
|
|
72
102
|
result = ctx.sql('''
|
|
73
|
-
SELECT
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
GROUP BY
|
|
78
|
-
|
|
103
|
+
SELECT level, AVG(temperature) - 273.15 AS avg_c
|
|
104
|
+
FROM era5.atmosphere
|
|
105
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
106
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
107
|
+
GROUP BY level
|
|
108
|
+
ORDER BY level DESC
|
|
79
109
|
''')
|
|
80
110
|
# DataFrame()
|
|
81
|
-
#
|
|
82
|
-
# |
|
|
83
|
-
#
|
|
84
|
-
# |
|
|
85
|
-
# |
|
|
86
|
-
# |
|
|
87
|
-
# |
|
|
88
|
-
# |
|
|
89
|
-
# |
|
|
90
|
-
# |
|
|
91
|
-
# |
|
|
92
|
-
# |
|
|
93
|
-
# |
|
|
94
|
-
#
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
#
|
|
105
|
-
#
|
|
106
|
-
#
|
|
111
|
+
# +-------+----------------------+
|
|
112
|
+
# | level | avg_c |
|
|
113
|
+
# +-------+----------------------+
|
|
114
|
+
# | 1000 | 6.6210120796502565 |
|
|
115
|
+
# | 975 | 5.185637919348153 |
|
|
116
|
+
# | 950 | 4.028428657263021 |
|
|
117
|
+
# | 925 | 3.0828117974912743 |
|
|
118
|
+
# | 900 | 2.2109172992531967 |
|
|
119
|
+
# | 875 | 1.395017610194202 |
|
|
120
|
+
# | 850 | 0.6342670572626616 |
|
|
121
|
+
# | 825 | -0.21037158786759846 |
|
|
122
|
+
# | 800 | -1.1810754318269687 |
|
|
123
|
+
# | 775 | -2.3064649711534457 |
|
|
124
|
+
# +-------+----------------------+
|
|
125
|
+
|
|
126
|
+
ctx.sql('''
|
|
127
|
+
SELECT latitude, longitude, AVG("2m_temperature") - 273.15 AS avg_c
|
|
128
|
+
FROM era5.surface
|
|
129
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
130
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
131
|
+
GROUP BY latitude, longitude
|
|
132
|
+
ORDER BY latitude DESC, longitude
|
|
133
|
+
''').to_dataset(dims=['latitude', 'longitude'], template=ds)
|
|
134
|
+
# <xarray.Dataset> Size: 8MB
|
|
135
|
+
# Dimensions: (latitude: 721, longitude: 1440)
|
|
136
|
+
# Coordinates:
|
|
137
|
+
# * latitude (latitude) float32 3kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
|
|
138
|
+
# * longitude (longitude) float32 6kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
|
|
139
|
+
# Data variables:
|
|
140
|
+
# avg_c (latitude, longitude) float64 8MB -26.84 -26.84 ... -27.38 -27.38
|
|
141
|
+
# Attributes:
|
|
142
|
+
# last_updated: 2026-06-20 02:33:34.265980+00:00
|
|
143
|
+
# valid_time_start: 1940-01-01
|
|
144
|
+
# valid_time_stop: 2025-12-31
|
|
145
|
+
# valid_time_stop_era5t: 2026-06-14
|
|
107
146
|
```
|
|
108
147
|
|
|
109
|
-
|
|
110
|
-
|
|
148
|
+
_(A runnable version of this example lives at
|
|
149
|
+
[`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
|
|
150
|
+
|
|
151
|
+
Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
|
|
152
|
+
SQL queries against them.
|
|
111
153
|
|
|
112
154
|
## Why build this?
|
|
113
155
|
|
|
@@ -180,14 +222,14 @@ _2025 update_: Something like this is being built across a few projects! The one
|
|
|
180
222
|
_2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
|
|
181
223
|
|
|
182
224
|
- [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
|
|
183
|
-
- [DuckDB-Zarr](https://github.com/
|
|
225
|
+
- [DuckDB-Zarr](https://github.com/alxmrs/duckdb-zarr)
|
|
184
226
|
|
|
185
227
|
## Roadmap
|
|
186
228
|
|
|
187
229
|
- [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
|
|
188
230
|
- [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
|
|
189
231
|
- [x] Support core datafusion optimizations to scan less data, like [#104](https://github.com/alxmrs/xarray-sql/issues/104), ...
|
|
190
|
-
- [
|
|
232
|
+
- [x] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
|
|
191
233
|
- [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
|
|
192
234
|
- [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
|
|
193
235
|
- [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
|
|
@@ -209,6 +251,8 @@ I want to give a special thanks to the following folks and institutions:
|
|
|
209
251
|
who are working to make this library better.
|
|
210
252
|
- Andrew Huang for the sense of taste he brings to the project and consummate code
|
|
211
253
|
changes.
|
|
254
|
+
- Aman Kumar for spending a considerable amount of his GSoC internship
|
|
255
|
+
contributing to this project.
|
|
212
256
|
|
|
213
257
|
|
|
214
258
|
## License
|
|
@@ -19,52 +19,94 @@ This is an experiment to provide a SQL interface for array datasets.
|
|
|
19
19
|
import xarray as xr
|
|
20
20
|
import xarray_sql as xql
|
|
21
21
|
|
|
22
|
-
ds = xr.tutorial.open_dataset('air_temperature')
|
|
23
22
|
|
|
24
|
-
#
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
23
|
+
# Open ARCO-ERA5 — a weather dataset with 273 variables since 1940.
|
|
24
|
+
# Turning off dask means we don't have to wait to construct a task graph.
|
|
25
|
+
ds = xr.open_zarr(
|
|
26
|
+
'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
|
|
27
|
+
chunks=None, # Turn dask off
|
|
28
|
+
storage_options={'token': 'anon'} # Anonymous read from the public GCS bucket — no auth required.
|
|
29
|
+
)
|
|
28
30
|
|
|
31
|
+
ctx = xql.XarrayContext()
|
|
32
|
+
# Make sure to pass `chunks`!
|
|
33
|
+
ctx.from_dataset('era5', ds, chunks=dict(time=6), table_names={
|
|
34
|
+
('time', 'latitude', 'longitude'): 'surface',
|
|
35
|
+
('time', 'level', 'latitude', 'longitude'): 'atmosphere',
|
|
36
|
+
})
|
|
37
|
+
# Registration takes ~10s on my machine.
|
|
38
|
+
|
|
39
|
+
# Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
|
|
40
|
+
# pushes column projection down to Zarr, so SELECT only fetches what you ask
|
|
41
|
+
# for — but `SELECT * FROM era5.surface` would try to pull every variable
|
|
42
|
+
# across the full record since 1940 (terabytes from GCS).
|
|
43
|
+
# ---> Always SELECT specific columns. <---
|
|
44
|
+
|
|
45
|
+
# Average 2m-temperature over NYC on the morning of 2020-01-01. The library
|
|
46
|
+
# pushes WHERE clauses on dimension columns down to partition pruning.
|
|
47
|
+
ctx.sql('''
|
|
48
|
+
SELECT AVG("2m_temperature") - 273.15 AS avg_c
|
|
49
|
+
FROM era5.surface
|
|
50
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
51
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
52
|
+
AND latitude BETWEEN 39 AND 40
|
|
53
|
+
AND longitude BETWEEN 286 AND 287 -- ERA5 uses 0-360 longitudes
|
|
54
|
+
''').to_pandas()
|
|
55
|
+
# avg_c
|
|
56
|
+
# 0 8.640069
|
|
57
|
+
|
|
58
|
+
# Average temperature per pressure level, globally.
|
|
29
59
|
result = ctx.sql('''
|
|
30
|
-
SELECT
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
GROUP BY
|
|
35
|
-
|
|
60
|
+
SELECT level, AVG(temperature) - 273.15 AS avg_c
|
|
61
|
+
FROM era5.atmosphere
|
|
62
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
63
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
64
|
+
GROUP BY level
|
|
65
|
+
ORDER BY level DESC
|
|
36
66
|
''')
|
|
37
67
|
# DataFrame()
|
|
38
|
-
#
|
|
39
|
-
# |
|
|
40
|
-
#
|
|
41
|
-
# |
|
|
42
|
-
# |
|
|
43
|
-
# |
|
|
44
|
-
# |
|
|
45
|
-
# |
|
|
46
|
-
# |
|
|
47
|
-
# |
|
|
48
|
-
# |
|
|
49
|
-
# |
|
|
50
|
-
# |
|
|
51
|
-
#
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
#
|
|
62
|
-
#
|
|
63
|
-
#
|
|
68
|
+
# +-------+----------------------+
|
|
69
|
+
# | level | avg_c |
|
|
70
|
+
# +-------+----------------------+
|
|
71
|
+
# | 1000 | 6.6210120796502565 |
|
|
72
|
+
# | 975 | 5.185637919348153 |
|
|
73
|
+
# | 950 | 4.028428657263021 |
|
|
74
|
+
# | 925 | 3.0828117974912743 |
|
|
75
|
+
# | 900 | 2.2109172992531967 |
|
|
76
|
+
# | 875 | 1.395017610194202 |
|
|
77
|
+
# | 850 | 0.6342670572626616 |
|
|
78
|
+
# | 825 | -0.21037158786759846 |
|
|
79
|
+
# | 800 | -1.1810754318269687 |
|
|
80
|
+
# | 775 | -2.3064649711534457 |
|
|
81
|
+
# +-------+----------------------+
|
|
82
|
+
|
|
83
|
+
ctx.sql('''
|
|
84
|
+
SELECT latitude, longitude, AVG("2m_temperature") - 273.15 AS avg_c
|
|
85
|
+
FROM era5.surface
|
|
86
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
87
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
88
|
+
GROUP BY latitude, longitude
|
|
89
|
+
ORDER BY latitude DESC, longitude
|
|
90
|
+
''').to_dataset(dims=['latitude', 'longitude'], template=ds)
|
|
91
|
+
# <xarray.Dataset> Size: 8MB
|
|
92
|
+
# Dimensions: (latitude: 721, longitude: 1440)
|
|
93
|
+
# Coordinates:
|
|
94
|
+
# * latitude (latitude) float32 3kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
|
|
95
|
+
# * longitude (longitude) float32 6kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
|
|
96
|
+
# Data variables:
|
|
97
|
+
# avg_c (latitude, longitude) float64 8MB -26.84 -26.84 ... -27.38 -27.38
|
|
98
|
+
# Attributes:
|
|
99
|
+
# last_updated: 2026-06-20 02:33:34.265980+00:00
|
|
100
|
+
# valid_time_start: 1940-01-01
|
|
101
|
+
# valid_time_stop: 2025-12-31
|
|
102
|
+
# valid_time_stop_era5t: 2026-06-14
|
|
64
103
|
```
|
|
65
104
|
|
|
66
|
-
|
|
67
|
-
|
|
105
|
+
_(A runnable version of this example lives at
|
|
106
|
+
[`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
|
|
107
|
+
|
|
108
|
+
Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
|
|
109
|
+
SQL queries against them.
|
|
68
110
|
|
|
69
111
|
## Why build this?
|
|
70
112
|
|
|
@@ -137,14 +179,14 @@ _2025 update_: Something like this is being built across a few projects! The one
|
|
|
137
179
|
_2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
|
|
138
180
|
|
|
139
181
|
- [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
|
|
140
|
-
- [DuckDB-Zarr](https://github.com/
|
|
182
|
+
- [DuckDB-Zarr](https://github.com/alxmrs/duckdb-zarr)
|
|
141
183
|
|
|
142
184
|
## Roadmap
|
|
143
185
|
|
|
144
186
|
- [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
|
|
145
187
|
- [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
|
|
146
188
|
- [x] Support core datafusion optimizations to scan less data, like [#104](https://github.com/alxmrs/xarray-sql/issues/104), ...
|
|
147
|
-
- [
|
|
189
|
+
- [x] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
|
|
148
190
|
- [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
|
|
149
191
|
- [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
|
|
150
192
|
- [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
|
|
@@ -166,6 +208,8 @@ I want to give a special thanks to the following folks and institutions:
|
|
|
166
208
|
who are working to make this library better.
|
|
167
209
|
- Andrew Huang for the sense of taste he brings to the project and consummate code
|
|
168
210
|
changes.
|
|
211
|
+
- Aman Kumar for spending a considerable amount of his GSoC internship
|
|
212
|
+
contributing to this project.
|
|
169
213
|
|
|
170
214
|
|
|
171
215
|
## License
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# Examples
|
|
2
|
+
|
|
3
|
+
```python
|
|
4
|
+
import xarray as xr
|
|
5
|
+
import xarray_sql as xql
|
|
6
|
+
|
|
7
|
+
ds = xr.tutorial.open_dataset('air_temperature')
|
|
8
|
+
|
|
9
|
+
ctx = xql.XarrayContext()
|
|
10
|
+
ctx.from_dataset('air', ds, chunks=dict(time=24))
|
|
11
|
+
|
|
12
|
+
result = ctx.sql('''
|
|
13
|
+
SELECT
|
|
14
|
+
"lat", "lon", AVG("air") as air_avg
|
|
15
|
+
FROM
|
|
16
|
+
"air"
|
|
17
|
+
GROUP BY
|
|
18
|
+
"lat", "lon"
|
|
19
|
+
''')
|
|
20
|
+
|
|
21
|
+
df = result.to_pandas()
|
|
22
|
+
df.head()
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Mixed-dimension datasets: ARCO-ERA5
|
|
26
|
+
|
|
27
|
+
When a Dataset has variables with differing dimensions (e.g. surface fields on
|
|
28
|
+
`(time, latitude, longitude)` and atmospheric fields on
|
|
29
|
+
`(time, level, latitude, longitude)`), `from_dataset` splits them into one
|
|
30
|
+
table per dimension group, registered together under a SQL schema named after
|
|
31
|
+
the first argument. [ARCO-ERA5][arco-era5] is a good example: 262 of its
|
|
32
|
+
variables are surface fields and 11 are atmospheric.
|
|
33
|
+
|
|
34
|
+
Open a year of ARCO-ERA5 and let SQL `WHERE` clauses do the filtering — the
|
|
35
|
+
library prunes time partitions and pushes dimension-column filters down. Use
|
|
36
|
+
the `table_names` kwarg to give each dimension group a friendly name:
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
import xarray as xr
|
|
40
|
+
import xarray_sql as xql
|
|
41
|
+
|
|
42
|
+
# Open ARCO-ERA5 directly from GCS (anonymous read).
|
|
43
|
+
url = 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3'
|
|
44
|
+
full = xr.open_zarr(url, chunks=None, storage_options={'token': 'anon'})
|
|
45
|
+
|
|
46
|
+
# A full year of hourly ERA5 — all 273 variables. No spatial slicing on the
|
|
47
|
+
# xarray side; SQL WHERE clauses below express the filters. `.chunk({'time': 1})`
|
|
48
|
+
# aligns Dask chunks to native Zarr chunks of shape (1, 37, 721, 1440) so
|
|
49
|
+
# chunk reads from GCS happen concurrently.
|
|
50
|
+
#
|
|
51
|
+
# Heads up: 262 of those variables are surface and 11 are atmospheric. The
|
|
52
|
+
# library pushes column projection down, so SELECT only fetches what you ask
|
|
53
|
+
# for — but `SELECT * FROM era5.surface` would try to pull every variable
|
|
54
|
+
# across the year (terabytes from GCS). Always SELECT specific columns.
|
|
55
|
+
ds = full.sel(time='2020').chunk({'time': 1})
|
|
56
|
+
|
|
57
|
+
ctx = xql.XarrayContext()
|
|
58
|
+
ctx.from_dataset('era5', ds, table_names={
|
|
59
|
+
('time', 'latitude', 'longitude'): 'surface',
|
|
60
|
+
('time', 'level', 'latitude', 'longitude'): 'atmosphere',
|
|
61
|
+
})
|
|
62
|
+
# Registers two tables under a SQL schema named 'era5': 'surface' and 'atmosphere'.
|
|
63
|
+
|
|
64
|
+
# Average 2m-temperature over the NYC area on the morning of 2020-01-01.
|
|
65
|
+
ctx.sql('''
|
|
66
|
+
SELECT AVG("2m_temperature") - 273.15 AS avg_c
|
|
67
|
+
FROM era5.surface
|
|
68
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
69
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
70
|
+
AND latitude BETWEEN 39 AND 40
|
|
71
|
+
AND longitude BETWEEN 286 AND 287
|
|
72
|
+
''').to_pandas()
|
|
73
|
+
|
|
74
|
+
# Average temperature per pressure level, globally — the standard
|
|
75
|
+
# atmospheric temperature profile. Scans ~230M rows.
|
|
76
|
+
ctx.sql('''
|
|
77
|
+
SELECT level, AVG(temperature) - 273.15 AS avg_c
|
|
78
|
+
FROM era5.atmosphere
|
|
79
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
80
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
81
|
+
GROUP BY level
|
|
82
|
+
ORDER BY level DESC -- surface (1000 hPa) first
|
|
83
|
+
''').to_pandas()
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
If you omit `table_names`, each table is named by joining its dimension names
|
|
87
|
+
with underscores, e.g. `era5.time_latitude_longitude` and
|
|
88
|
+
`era5.time_level_latitude_longitude`.
|
|
89
|
+
|
|
90
|
+
## GOES satellite imagery (scalar variables)
|
|
91
|
+
|
|
92
|
+
Real-world stores often mix gridded data with scalar (0-dimensional) metadata.
|
|
93
|
+
GOES satellite imagery, for example, pairs `(y, x)` image bands with dozens of
|
|
94
|
+
scalar variables such as `goes_imager_projection`. `from_dataset` groups all the
|
|
95
|
+
scalars into a single one-row table named `scalar`:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
import fsspec
|
|
99
|
+
import xarray as xr
|
|
100
|
+
from xarray_sql import XarrayContext
|
|
101
|
+
|
|
102
|
+
# A real GOES-16 ABI cloud-and-moisture file from NOAA's public bucket:
|
|
103
|
+
# (y, x) image bands alongside dozens of scalar metadata variables.
|
|
104
|
+
url = (
|
|
105
|
+
'https://noaa-goes16.s3.amazonaws.com/ABI-L2-MCMIPM/2024/001/00/'
|
|
106
|
+
'OR_ABI-L2-MCMIPM1-M6_G16_s20240010000281_e20240010000350_c20240010000426.nc'
|
|
107
|
+
)
|
|
108
|
+
ds = xr.open_dataset(fsspec.open_local(f'simplecache::{url}')).chunk(
|
|
109
|
+
{'y': 250, 'x': 250}
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
ctx = XarrayContext()
|
|
113
|
+
ctx.from_dataset('goes', ds)
|
|
114
|
+
|
|
115
|
+
# The gridded bands and the scalar metadata are separate tables.
|
|
116
|
+
ctx.sql('SELECT COUNT(*) AS n FROM goes.y_x').to_pandas()['n'][0] # -> 250000
|
|
117
|
+
ctx.sql('SELECT * FROM goes.scalar').to_pandas().shape # -> (1, 89)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Override the default name like any other group with `table_names={(): 'metadata'}`.
|
|
121
|
+
|
|
122
|
+
A runnable version of the ERA5 example lives at
|
|
123
|
+
[`perf_tests/era5_temp_profile.py`](../perf_tests/era5_temp_profile.py).
|
|
124
|
+
|
|
125
|
+
[arco-era5]: https://github.com/google-research/arco-era5
|
|
@@ -64,11 +64,13 @@ module-name = "xarray_sql._native"
|
|
|
64
64
|
[tool.setuptools.packages.find]
|
|
65
65
|
exclude = ["demo", "perf_tests", "tests", "tests.*"]
|
|
66
66
|
|
|
67
|
-
[tool.
|
|
67
|
+
[tool.ruff]
|
|
68
68
|
line-length = 80
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
69
|
+
indent-width = 4
|
|
70
|
+
|
|
71
|
+
[tool.ruff.format]
|
|
72
|
+
indent-style = "space"
|
|
73
|
+
quote-style = "double"
|
|
72
74
|
|
|
73
75
|
[tool.mypy]
|
|
74
76
|
python_version = "3.11"
|
|
@@ -88,6 +90,7 @@ module = [
|
|
|
88
90
|
"pyarrow.*",
|
|
89
91
|
"datafusion.*",
|
|
90
92
|
"xarray.*",
|
|
93
|
+
"pandas.*",
|
|
91
94
|
]
|
|
92
95
|
ignore_missing_imports = true
|
|
93
96
|
|
|
@@ -98,7 +101,7 @@ dev = [
|
|
|
98
101
|
"xarray_sql[test]",
|
|
99
102
|
"xarray_sql[docs]",
|
|
100
103
|
"py-spy>=0.4.0",
|
|
101
|
-
"
|
|
104
|
+
"ruff>=0.15.10",
|
|
102
105
|
"maturin>=1.9.1",
|
|
103
106
|
]
|
|
104
107
|
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import xarray as xr
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def rand_wx(start: str, end: str) -> xr.Dataset:
|
|
9
|
+
np.random.seed(42)
|
|
10
|
+
lat = np.linspace(-90, 90, num=720)
|
|
11
|
+
lon = np.linspace(-180, 180, num=1440)
|
|
12
|
+
time = pd.date_range(start, end, freq="h")
|
|
13
|
+
level = np.array([1000, 500], dtype=np.int32)
|
|
14
|
+
reference_time = pd.Timestamp(start)
|
|
15
|
+
temperature = 15 + 8 * np.random.randn(720, 1440, len(time), len(level))
|
|
16
|
+
precipitation = 10 * np.random.rand(720, 1440, len(time), len(level))
|
|
17
|
+
return xr.Dataset(
|
|
18
|
+
data_vars=dict(
|
|
19
|
+
temperature=(["lat", "lon", "time", "level"], temperature),
|
|
20
|
+
precipitation=(["lat", "lon", "time", "level"], precipitation),
|
|
21
|
+
),
|
|
22
|
+
coords=dict(
|
|
23
|
+
lat=lat,
|
|
24
|
+
lon=lon,
|
|
25
|
+
time=time,
|
|
26
|
+
level=level,
|
|
27
|
+
reference_time=reference_time,
|
|
28
|
+
),
|
|
29
|
+
attrs=dict(description="Random weather."),
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def create_large_dataset(time_steps=1000, lat_points=100, lon_points=100):
|
|
34
|
+
"""Create a large xarray dataset for memory testing."""
|
|
35
|
+
np.random.seed(42)
|
|
36
|
+
|
|
37
|
+
time = pd.date_range("2020-01-01", periods=time_steps, freq="h")
|
|
38
|
+
lat = np.linspace(-90, 90, lat_points)
|
|
39
|
+
lon = np.linspace(-180, 180, lon_points)
|
|
40
|
+
|
|
41
|
+
temp_data = np.random.rand(time_steps, lat_points, lon_points) * 40 - 10
|
|
42
|
+
precip_data = np.random.rand(time_steps, lat_points, lon_points) * 100
|
|
43
|
+
|
|
44
|
+
return xr.Dataset(
|
|
45
|
+
{
|
|
46
|
+
"temperature": (["time", "lat", "lon"], temp_data),
|
|
47
|
+
"precipitation": (["time", "lat", "lon"], precip_data),
|
|
48
|
+
},
|
|
49
|
+
coords={"time": time, "lat": lat, "lon": lon},
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@pytest.fixture
|
|
54
|
+
def air():
|
|
55
|
+
ds = xr.tutorial.open_dataset("air_temperature")
|
|
56
|
+
chunks = {"time": 240}
|
|
57
|
+
return ds.chunk(chunks)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@pytest.fixture
|
|
61
|
+
def air_small(air):
|
|
62
|
+
return air.isel(
|
|
63
|
+
time=slice(0, 12), lat=slice(0, 11), lon=slice(0, 10)
|
|
64
|
+
).chunk({"time": 240})
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@pytest.fixture
|
|
68
|
+
def randwx():
|
|
69
|
+
return rand_wx("1995-01-13T00", "1995-01-13T01")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@pytest.fixture
|
|
73
|
+
def large_ds():
|
|
74
|
+
return create_large_dataset().chunk({"time": 25})
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@pytest.fixture
|
|
78
|
+
def air_dataset_small():
|
|
79
|
+
ds = xr.tutorial.open_dataset("air_temperature").chunk({"time": 240})
|
|
80
|
+
return ds.isel(time=slice(0, 12), lat=slice(0, 11), lon=slice(0, 10))
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@pytest.fixture
|
|
84
|
+
def air_dataset_large():
|
|
85
|
+
return xr.tutorial.open_dataset("air_temperature").chunk({"time": 240})
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@pytest.fixture
|
|
89
|
+
def rasm_ds():
|
|
90
|
+
"""rasm uses cftime.DatetimeNoLeap (noleap / 365_day) for time."""
|
|
91
|
+
return xr.tutorial.open_dataset("rasm")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@pytest.fixture
|
|
95
|
+
def weather_dataset():
|
|
96
|
+
ds = rand_wx("2023-01-01T00", "2023-01-01T12")
|
|
97
|
+
return ds.isel(time=slice(0, 6), lat=slice(0, 10), lon=slice(0, 10)).chunk(
|
|
98
|
+
{"time": 3}
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@pytest.fixture
|
|
103
|
+
def synthetic_dataset():
|
|
104
|
+
return create_large_dataset(
|
|
105
|
+
time_steps=50, lat_points=20, lon_points=20
|
|
106
|
+
).chunk({"time": 25})
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@pytest.fixture
|
|
110
|
+
def station_dataset():
|
|
111
|
+
return xr.Dataset(
|
|
112
|
+
{
|
|
113
|
+
"station_id": (["station"], [1, 2, 3, 4, 5]),
|
|
114
|
+
"elevation": (["station"], [100, 250, 500, 750, 1000]),
|
|
115
|
+
"name": (
|
|
116
|
+
["station"],
|
|
117
|
+
[
|
|
118
|
+
"Station_A",
|
|
119
|
+
"Station_B",
|
|
120
|
+
"Station_C",
|
|
121
|
+
"Station_D",
|
|
122
|
+
"Station_E",
|
|
123
|
+
],
|
|
124
|
+
),
|
|
125
|
+
}
|
|
126
|
+
).chunk({"station": 5})
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@pytest.fixture
|
|
130
|
+
def air_and_stations():
|
|
131
|
+
air = (
|
|
132
|
+
xr.tutorial.open_dataset("air_temperature")
|
|
133
|
+
.isel(time=slice(0, 12), lat=slice(0, 5), lon=slice(0, 8))
|
|
134
|
+
.chunk({"time": 6})
|
|
135
|
+
)
|
|
136
|
+
stations = xr.Dataset(
|
|
137
|
+
{
|
|
138
|
+
"station_id": (["station"], [101, 102, 103]),
|
|
139
|
+
"lat": (
|
|
140
|
+
["station"],
|
|
141
|
+
[air.lat.values[0], air.lat.values[2], air.lat.values[4]],
|
|
142
|
+
),
|
|
143
|
+
"lon": (
|
|
144
|
+
["station"],
|
|
145
|
+
[air.lon.values[1], air.lon.values[3], air.lon.values[5]],
|
|
146
|
+
),
|
|
147
|
+
"elevation": (["station"], [100, 250, 500]),
|
|
148
|
+
}
|
|
149
|
+
).chunk({"station": 3})
|
|
150
|
+
return air, stations
|