xarray_sql 0.2.2__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/Cargo.lock +1 -1
  2. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/Cargo.toml +1 -1
  3. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/PKG-INFO +86 -42
  4. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/README.md +85 -41
  5. xarray_sql-0.3.0/docs/examples.md +125 -0
  6. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/pyproject.toml +8 -5
  7. xarray_sql-0.3.0/tests/conftest.py +150 -0
  8. xarray_sql-0.3.0/tests/test_cft.py +170 -0
  9. xarray_sql-0.3.0/tests/test_df.py +534 -0
  10. xarray_sql-0.3.0/tests/test_ds.py +571 -0
  11. xarray_sql-0.3.0/tests/test_reader.py +1415 -0
  12. xarray_sql-0.3.0/tests/test_sql.py +490 -0
  13. xarray_sql-0.3.0/xarray_sql/cftime.py +248 -0
  14. xarray_sql-0.3.0/xarray_sql/core.py +49 -0
  15. xarray_sql-0.3.0/xarray_sql/df.py +508 -0
  16. xarray_sql-0.3.0/xarray_sql/ds.py +838 -0
  17. xarray_sql-0.3.0/xarray_sql/reader.py +332 -0
  18. xarray_sql-0.3.0/xarray_sql/sql.py +191 -0
  19. xarray_sql-0.2.2/docs/examples.md +0 -23
  20. xarray_sql-0.2.2/tests/conftest.py +0 -144
  21. xarray_sql-0.2.2/tests/test_cft.py +0 -176
  22. xarray_sql-0.2.2/tests/test_df.py +0 -428
  23. xarray_sql-0.2.2/tests/test_reader.py +0 -1372
  24. xarray_sql-0.2.2/tests/test_sql.py +0 -318
  25. xarray_sql-0.2.2/xarray_sql/cftime.py +0 -248
  26. xarray_sql-0.2.2/xarray_sql/core.py +0 -49
  27. xarray_sql-0.2.2/xarray_sql/df.py +0 -445
  28. xarray_sql-0.2.2/xarray_sql/reader.py +0 -299
  29. xarray_sql-0.2.2/xarray_sql/sql.py +0 -63
  30. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/.gitignore +0 -0
  31. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/LICENSE +0 -0
  32. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/docs/assets/logo.svg +0 -0
  33. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/docs/contributing.md +0 -0
  34. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/docs/index.md +0 -0
  35. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/docs/reference/xarray_sql.md +0 -0
  36. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/src/lib.rs +0 -0
  37. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/tests/__init__.py +0 -0
  38. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/xarray_sql/__init__.py +1 -1
  39. {xarray_sql-0.2.2 → xarray_sql-0.3.0}/zensical.toml +0 -0
@@ -3375,7 +3375,7 @@ checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb"
3375
3375
 
3376
3376
  [[package]]
3377
3377
  name = "xarray_sql"
3378
- version = "0.2.2"
3378
+ version = "0.3.0"
3379
3379
  dependencies = [
3380
3380
  "arrow",
3381
3381
  "async-stream",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "xarray_sql"
3
- version = "0.2.2"
3
+ version = "0.3.0"
4
4
  authors = ["Alex Merose"]
5
5
  edition = "2021"
6
6
  exclude = [
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xarray_sql
3
- Version: 0.2.2
3
+ Version: 0.3.0
4
4
  Classifier: Development Status :: 4 - Beta
5
5
  Classifier: Intended Audience :: Science/Research
6
6
  Classifier: Intended Audience :: Developers
@@ -62,52 +62,94 @@ This is an experiment to provide a SQL interface for array datasets.
62
62
  import xarray as xr
63
63
  import xarray_sql as xql
64
64
 
65
- ds = xr.tutorial.open_dataset('air_temperature')
66
65
 
67
- # The same as a dask-sql Context; i.e. an Apache DataFusion Context.
68
- ctx = xql.XarrayContext()
69
- ctx.from_dataset('air', ds, chunks=dict(time=24)) # the dataset needs to be chunked!
70
- # data is only materialized when we make a query.
66
+ # Open ARCO-ERA5, a weather dataset with 273 variables going back to 1940.
67
+ # Turning off dask means we don't have to wait to construct a task graph.
68
+ ds = xr.open_zarr(
69
+ 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
70
+ chunks=None, # Turn dask off
71
+ storage_options={'token': 'anon'} # Anonymous read from the public GCS bucket — no auth required.
72
+ )
71
73
 
74
+ ctx = xql.XarrayContext()
75
+ # Make sure to pass `chunks`!
76
+ ctx.from_dataset('era5', ds, chunks=dict(time=6), table_names={
77
+ ('time', 'latitude', 'longitude'): 'surface',
78
+ ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
79
+ })
80
+ # Registration takes ~10s on my machine.
81
+
82
+ # Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
83
+ # pushes column projection down to Zarr, so SELECT only fetches what you ask
84
+ # for — but `SELECT * FROM era5.surface` would try to pull every variable
85
+ # across the entire time range (terabytes from GCS).
86
+ # ---> Always SELECT specific columns. <---
87
+
88
+ # Average 2m-temperature over NYC on the morning of 2020-01-01. The library
89
+ # pushes WHERE clauses on dimension columns down to partition pruning.
90
+ ctx.sql('''
91
+ SELECT AVG("2m_temperature") - 273.15 AS avg_c
92
+ FROM era5.surface
93
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
94
+ AND TIMESTAMP '2020-01-01 05:00:00'
95
+ AND latitude BETWEEN 39 AND 40
96
+ AND longitude BETWEEN 286 AND 287 -- ERA5 uses 0-360 longitudes
97
+ ''').to_pandas()
98
+ # avg_c
99
+ # 0 8.640069
100
+
101
+ # Average temperature per pressure level, globally.
72
102
  result = ctx.sql('''
73
- SELECT
74
- "lat", "lon", AVG("air") as air_avg
75
- FROM
76
- "air"
77
- GROUP BY
78
- "lat", "lon"
103
+ SELECT level, AVG(temperature) - 273.15 AS avg_c
104
+ FROM era5.atmosphere
105
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
106
+ AND TIMESTAMP '2020-01-01 05:00:00'
107
+ GROUP BY level
108
+ ORDER BY level DESC
79
109
  ''')
80
110
  # DataFrame()
81
- # +------+-------+--------------------+
82
- # | lat | lon | air_avg |
83
- # +------+-------+--------------------+
84
- # | 75.0 | 205.0 | 259.88662671232834 |
85
- # | 75.0 | 207.5 | 259.48268150684896 |
86
- # | 75.0 | 230.0 | 258.9192123287667 |
87
- # | 75.0 | 275.0 | 257.07574315068456 |
88
- # | 75.0 | 322.5 | 250.11792123287654 |
89
- # | 75.0 | 325.0 | 250.81590068493134 |
90
- # | 72.5 | 205.0 | 262.74933904109537 |
91
- # | 72.5 | 207.5 | 262.5384315068488 |
92
- # | 72.5 | 230.0 | 260.82879452054743 |
93
- # | 72.5 | 275.0 | 257.3063321917804 |
94
- # +------+-------+--------------------+
95
- # Data truncated.
96
-
97
- # The full query is only made when we call `collect()`, or, in this case,
98
- # `to_pandas()`.
99
- df = result.to_pandas()
100
- df.head()
101
- # lat lon air_avg
102
- # 0 75.0 232.5 258.836188
103
- # 1 75.0 247.5 257.716171
104
- # 2 75.0 262.5 257.347959
105
- # 3 75.0 277.5 257.671308
106
- # 4 72.5 232.5 260.654401
111
+ # +-------+----------------------+
112
+ # | level | avg_c |
113
+ # +-------+----------------------+
114
+ # | 1000 | 6.6210120796502565 |
115
+ # | 975 | 5.185637919348153 |
116
+ # | 950 | 4.028428657263021 |
117
+ # | 925 | 3.0828117974912743 |
118
+ # | 900 | 2.2109172992531967 |
119
+ # | 875 | 1.395017610194202 |
120
+ # | 850 | 0.6342670572626616 |
121
+ # | 825 | -0.21037158786759846 |
122
+ # | 800 | -1.1810754318269687 |
123
+ # | 775 | -2.3064649711534457 |
124
+ # +-------+----------------------+
125
+
126
+ ctx.sql('''
127
+ SELECT latitude, longitude, AVG("2m_temperature") - 273.15 AS avg_c
128
+ FROM era5.surface
129
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
130
+ AND TIMESTAMP '2020-01-01 05:00:00'
131
+ GROUP BY latitude, longitude
132
+ ORDER BY latitude DESC, longitude
133
+ ''').to_dataset(dims=['latitude', 'longitude'], template=ds)
134
+ # <xarray.Dataset> Size: 8MB
135
+ # Dimensions: (latitude: 721, longitude: 1440)
136
+ # Coordinates:
137
+ # * latitude (latitude) float32 3kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
138
+ # * longitude (longitude) float32 6kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
139
+ # Data variables:
140
+ # avg_c (latitude, longitude) float64 8MB -26.84 -26.84 ... -27.38 -27.38
141
+ # Attributes:
142
+ # last_updated: 2026-06-20 02:33:34.265980+00:00
143
+ # valid_time_start: 1940-01-01
144
+ # valid_time_stop: 2025-12-31
145
+ # valid_time_stop_era5t: 2026-06-14
107
146
  ```
108
147
 
109
- Succinctly, we "pivot" Xarray Datasets (with consistent dimensions) to treat them like tables so we can run
110
- SQL queries against them.
148
+ _(A runnable version of this example lives at
149
+ [`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
150
+
151
+ Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
152
+ SQL queries against them.
111
153
 
112
154
  ## Why build this?
113
155
 
@@ -180,14 +222,14 @@ _2025 update_: Something like this is being built across a few projects! The one
180
222
  _2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
181
223
 
182
224
  - [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
183
- - [DuckDB-Zarr](https://github.com/hobbes-bot/duckdb-zarr)
225
+ - [DuckDB-Zarr](https://github.com/alxmrs/duckdb-zarr)
184
226
 
185
227
  ## Roadmap
186
228
 
187
229
  - [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
188
230
  - [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
189
231
  - [x] Support core datafusion optimizations to scan less data, like [#104](https://github.com/alxmrs/xarray-sql/issues/104), ...
190
- - [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
232
+ - [x] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
191
233
  - [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
192
234
  - [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
193
235
  - [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
@@ -209,6 +251,8 @@ I want to give a special thanks to the following folks and institutions:
209
251
  who are working to make this library better.
210
252
  - Andrew Huang for the sense of taste he brings to the project and consummate code
211
253
  changes.
254
+ - Aman Kumar for spending a considerable amount of his GSoC internship
255
+ contributing to this project.
212
256
 
213
257
 
214
258
  ## License
@@ -19,52 +19,94 @@ This is an experiment to provide a SQL interface for array datasets.
19
19
  import xarray as xr
20
20
  import xarray_sql as xql
21
21
 
22
- ds = xr.tutorial.open_dataset('air_temperature')
23
22
 
24
- # The same as a dask-sql Context; i.e. an Apache DataFusion Context.
25
- ctx = xql.XarrayContext()
26
- ctx.from_dataset('air', ds, chunks=dict(time=24)) # the dataset needs to be chunked!
27
- # data is only materialized when we make a query.
23
+ # Open ARCO-ERA5, a weather dataset with 273 variables going back to 1940.
24
+ # Turning off dask means we don't have to wait to construct a task graph.
25
+ ds = xr.open_zarr(
26
+ 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
27
+ chunks=None, # Turn dask off
28
+ storage_options={'token': 'anon'} # Anonymous read from the public GCS bucket — no auth required.
29
+ )
28
30
 
31
+ ctx = xql.XarrayContext()
32
+ # Make sure to pass `chunks`!
33
+ ctx.from_dataset('era5', ds, chunks=dict(time=6), table_names={
34
+ ('time', 'latitude', 'longitude'): 'surface',
35
+ ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
36
+ })
37
+ # Registration takes ~10s on my machine.
38
+
39
+ # Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
40
+ # pushes column projection down to Zarr, so SELECT only fetches what you ask
41
+ # for — but `SELECT * FROM era5.surface` would try to pull every variable
42
+ # across the entire time range (terabytes from GCS).
43
+ # ---> Always SELECT specific columns. <---
44
+
45
+ # Average 2m-temperature over NYC on the morning of 2020-01-01. The library
46
+ # pushes WHERE clauses on dimension columns down to partition pruning.
47
+ ctx.sql('''
48
+ SELECT AVG("2m_temperature") - 273.15 AS avg_c
49
+ FROM era5.surface
50
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
51
+ AND TIMESTAMP '2020-01-01 05:00:00'
52
+ AND latitude BETWEEN 39 AND 40
53
+ AND longitude BETWEEN 286 AND 287 -- ERA5 uses 0-360 longitudes
54
+ ''').to_pandas()
55
+ # avg_c
56
+ # 0 8.640069
57
+
58
+ # Average temperature per pressure level, globally.
29
59
  result = ctx.sql('''
30
- SELECT
31
- "lat", "lon", AVG("air") as air_avg
32
- FROM
33
- "air"
34
- GROUP BY
35
- "lat", "lon"
60
+ SELECT level, AVG(temperature) - 273.15 AS avg_c
61
+ FROM era5.atmosphere
62
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
63
+ AND TIMESTAMP '2020-01-01 05:00:00'
64
+ GROUP BY level
65
+ ORDER BY level DESC
36
66
  ''')
37
67
  # DataFrame()
38
- # +------+-------+--------------------+
39
- # | lat | lon | air_avg |
40
- # +------+-------+--------------------+
41
- # | 75.0 | 205.0 | 259.88662671232834 |
42
- # | 75.0 | 207.5 | 259.48268150684896 |
43
- # | 75.0 | 230.0 | 258.9192123287667 |
44
- # | 75.0 | 275.0 | 257.07574315068456 |
45
- # | 75.0 | 322.5 | 250.11792123287654 |
46
- # | 75.0 | 325.0 | 250.81590068493134 |
47
- # | 72.5 | 205.0 | 262.74933904109537 |
48
- # | 72.5 | 207.5 | 262.5384315068488 |
49
- # | 72.5 | 230.0 | 260.82879452054743 |
50
- # | 72.5 | 275.0 | 257.3063321917804 |
51
- # +------+-------+--------------------+
52
- # Data truncated.
53
-
54
- # The full query is only made when we call `collect()`, or, in this case,
55
- # `to_pandas()`.
56
- df = result.to_pandas()
57
- df.head()
58
- # lat lon air_avg
59
- # 0 75.0 232.5 258.836188
60
- # 1 75.0 247.5 257.716171
61
- # 2 75.0 262.5 257.347959
62
- # 3 75.0 277.5 257.671308
63
- # 4 72.5 232.5 260.654401
68
+ # +-------+----------------------+
69
+ # | level | avg_c |
70
+ # +-------+----------------------+
71
+ # | 1000 | 6.6210120796502565 |
72
+ # | 975 | 5.185637919348153 |
73
+ # | 950 | 4.028428657263021 |
74
+ # | 925 | 3.0828117974912743 |
75
+ # | 900 | 2.2109172992531967 |
76
+ # | 875 | 1.395017610194202 |
77
+ # | 850 | 0.6342670572626616 |
78
+ # | 825 | -0.21037158786759846 |
79
+ # | 800 | -1.1810754318269687 |
80
+ # | 775 | -2.3064649711534457 |
81
+ # +-------+----------------------+
82
+
83
+ ctx.sql('''
84
+ SELECT latitude, longitude, AVG("2m_temperature") - 273.15 AS avg_c
85
+ FROM era5.surface
86
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
87
+ AND TIMESTAMP '2020-01-01 05:00:00'
88
+ GROUP BY latitude, longitude
89
+ ORDER BY latitude DESC, longitude
90
+ ''').to_dataset(dims=['latitude', 'longitude'], template=ds)
91
+ # <xarray.Dataset> Size: 8MB
92
+ # Dimensions: (latitude: 721, longitude: 1440)
93
+ # Coordinates:
94
+ # * latitude (latitude) float32 3kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
95
+ # * longitude (longitude) float32 6kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
96
+ # Data variables:
97
+ # avg_c (latitude, longitude) float64 8MB -26.84 -26.84 ... -27.38 -27.38
98
+ # Attributes:
99
+ # last_updated: 2026-06-20 02:33:34.265980+00:00
100
+ # valid_time_start: 1940-01-01
101
+ # valid_time_stop: 2025-12-31
102
+ # valid_time_stop_era5t: 2026-06-14
64
103
  ```
65
104
 
66
- Succinctly, we "pivot" Xarray Datasets (with consistent dimensions) to treat them like tables so we can run
67
- SQL queries against them.
105
+ _(A runnable version of this example lives at
106
+ [`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
107
+
108
+ Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
109
+ SQL queries against them.
68
110
 
69
111
  ## Why build this?
70
112
 
@@ -137,14 +179,14 @@ _2025 update_: Something like this is being built across a few projects! The one
137
179
  _2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
138
180
 
139
181
  - [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
140
- - [DuckDB-Zarr](https://github.com/hobbes-bot/duckdb-zarr)
182
+ - [DuckDB-Zarr](https://github.com/alxmrs/duckdb-zarr)
141
183
 
142
184
  ## Roadmap
143
185
 
144
186
  - [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
145
187
  - [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
146
188
  - [x] Support core datafusion optimizations to scan less data, like [#104](https://github.com/alxmrs/xarray-sql/issues/104), ...
147
- - [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
189
+ - [x] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
148
190
  - [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
149
191
  - [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
150
192
  - [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
@@ -166,6 +208,8 @@ I want to give a special thanks to the following folks and institutions:
166
208
  who are working to make this library better.
167
209
  - Andrew Huang for the sense of taste he brings to the project and consummate code
168
210
  changes.
211
+ - Aman Kumar for spending a considerable amount of his GSoC internship
212
+ contributing to this project.
169
213
 
170
214
 
171
215
  ## License
@@ -0,0 +1,125 @@
1
+ # Examples
2
+
3
+ ```python
4
+ import xarray as xr
5
+ import xarray_sql as xql
6
+
7
+ ds = xr.tutorial.open_dataset('air_temperature')
8
+
9
+ ctx = xql.XarrayContext()
10
+ ctx.from_dataset('air', ds, chunks=dict(time=24))
11
+
12
+ result = ctx.sql('''
13
+ SELECT
14
+ "lat", "lon", AVG("air") as air_avg
15
+ FROM
16
+ "air"
17
+ GROUP BY
18
+ "lat", "lon"
19
+ ''')
20
+
21
+ df = result.to_pandas()
22
+ df.head()
23
+ ```
24
+
25
+ ## Mixed-dimension datasets: ARCO-ERA5
26
+
27
+ When a Dataset has variables with differing dimensions (e.g. surface fields on
28
+ `(time, latitude, longitude)` and atmospheric fields on
29
+ `(time, level, latitude, longitude)`), `from_dataset` splits them into one
30
+ table per dimension group, registered together under a SQL schema named after
31
+ the first argument. [ARCO-ERA5][arco-era5] is a good example: 262 of its
32
+ variables are surface fields and 11 are atmospheric.
33
+
34
+ Open a year of ARCO-ERA5 and let SQL `WHERE` clauses do the filtering — the
35
+ library prunes time partitions and pushes dimension-column filters down. Use
36
+ the `table_names` kwarg to give each dimension group a friendly name:
37
+
38
+ ```python
39
+ import xarray as xr
40
+ import xarray_sql as xql
41
+
42
+ # Open ARCO-ERA5 directly from GCS (anonymous read).
43
+ url = 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3'
44
+ full = xr.open_zarr(url, chunks=None, storage_options={'token': 'anon'})
45
+
46
+ # A full year of hourly ERA5 — all 273 variables. No spatial slicing on the
47
+ # xarray side; SQL WHERE clauses below express the filters. `.chunk({'time': 1})`
48
+ # aligns Dask chunks to native Zarr chunks of shape (1, 37, 721, 1440) so
49
+ # chunk reads from GCS happen concurrently.
50
+ #
51
+ # Heads up: 262 of those variables are surface and 11 are atmospheric. The
52
+ # library pushes column projection down, so SELECT only fetches what you ask
53
+ # for — but `SELECT * FROM era5.surface` would try to pull every variable
54
+ # across the year (terabytes from GCS). Always SELECT specific columns.
55
+ ds = full.sel(time='2020').chunk({'time': 1})
56
+
57
+ ctx = xql.XarrayContext()
58
+ ctx.from_dataset('era5', ds, table_names={
59
+ ('time', 'latitude', 'longitude'): 'surface',
60
+ ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
61
+ })
62
+ # Registers two tables under a SQL schema named 'era5': 'surface' and 'atmosphere'.
63
+
64
+ # Average 2m-temperature over the NYC area on the morning of 2020-01-01.
65
+ ctx.sql('''
66
+ SELECT AVG("2m_temperature") - 273.15 AS avg_c
67
+ FROM era5.surface
68
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
69
+ AND TIMESTAMP '2020-01-01 05:00:00'
70
+ AND latitude BETWEEN 39 AND 40
71
+ AND longitude BETWEEN 286 AND 287
72
+ ''').to_pandas()
73
+
74
+ # Average temperature per pressure level, globally — the standard
75
+ # atmospheric temperature profile. Scans ~230M rows.
76
+ ctx.sql('''
77
+ SELECT level, AVG(temperature) - 273.15 AS avg_c
78
+ FROM era5.atmosphere
79
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
80
+ AND TIMESTAMP '2020-01-01 05:00:00'
81
+ GROUP BY level
82
+ ORDER BY level DESC -- surface (1000 hPa) first
83
+ ''').to_pandas()
84
+ ```
85
+
86
+ If you omit `table_names`, each table is named by joining its dimension names
87
+ with underscores, e.g. `era5.time_latitude_longitude` and
88
+ `era5.time_level_latitude_longitude`.
89
+
90
+ ## GOES satellite imagery (scalar variables)
91
+
92
+ Real-world stores often mix gridded data with scalar (0-dimensional) metadata.
93
+ GOES satellite imagery, for example, pairs `(y, x)` image bands with dozens of
94
+ scalar variables such as `goes_imager_projection`. `from_dataset` groups all the
95
+ scalars into a single one-row table named `scalar`:
96
+
97
+ ```python
98
+ import fsspec
99
+ import xarray as xr
100
+ from xarray_sql import XarrayContext
101
+
102
+ # A real GOES-16 ABI cloud-and-moisture file from NOAA's public bucket:
103
+ # (y, x) image bands alongside dozens of scalar metadata variables.
104
+ url = (
105
+ 'https://noaa-goes16.s3.amazonaws.com/ABI-L2-MCMIPM/2024/001/00/'
106
+ 'OR_ABI-L2-MCMIPM1-M6_G16_s20240010000281_e20240010000350_c20240010000426.nc'
107
+ )
108
+ ds = xr.open_dataset(fsspec.open_local(f'simplecache::{url}')).chunk(
109
+ {'y': 250, 'x': 250}
110
+ )
111
+
112
+ ctx = XarrayContext()
113
+ ctx.from_dataset('goes', ds)
114
+
115
+ # The gridded bands and the scalar metadata are separate tables.
116
+ ctx.sql('SELECT COUNT(*) AS n FROM goes.y_x').to_pandas()['n'][0] # -> 250000
117
+ ctx.sql('SELECT * FROM goes.scalar').to_pandas().shape # -> (1, 89)
118
+ ```
119
+
120
+ As with any other dimension group, you can override the default `scalar` name via `table_names={(): 'metadata'}`.
121
+
122
+ A runnable version of the ERA5 example lives at
123
+ [`perf_tests/era5_temp_profile.py`](../perf_tests/era5_temp_profile.py).
124
+
125
+ [arco-era5]: https://github.com/google-research/arco-era5
@@ -64,11 +64,13 @@ module-name = "xarray_sql._native"
64
64
  [tool.setuptools.packages.find]
65
65
  exclude = ["demo", "perf_tests", "tests", "tests.*"]
66
66
 
67
- [tool.pyink]
67
+ [tool.ruff]
68
68
  line-length = 80
69
- preview = true
70
- pyink-indentation = 2
71
- pyink-use-majority-quotes = true
69
+ indent-width = 4
70
+
71
+ [tool.ruff.format]
72
+ indent-style = "space"
73
+ quote-style = "double"
72
74
 
73
75
  [tool.mypy]
74
76
  python_version = "3.11"
@@ -88,6 +90,7 @@ module = [
88
90
  "pyarrow.*",
89
91
  "datafusion.*",
90
92
  "xarray.*",
93
+ "pandas.*",
91
94
  ]
92
95
  ignore_missing_imports = true
93
96
 
@@ -98,7 +101,7 @@ dev = [
98
101
  "xarray_sql[test]",
99
102
  "xarray_sql[docs]",
100
103
  "py-spy>=0.4.0",
101
- "pyink>=24.10.1",
104
+ "ruff>=0.15.10",
102
105
  "maturin>=1.9.1",
103
106
  ]
104
107
 
@@ -0,0 +1,150 @@
1
+ import pytest
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import xarray as xr
6
+
7
+
8
+ def rand_wx(start: str, end: str) -> xr.Dataset:
9
+ np.random.seed(42)
10
+ lat = np.linspace(-90, 90, num=720)
11
+ lon = np.linspace(-180, 180, num=1440)
12
+ time = pd.date_range(start, end, freq="h")
13
+ level = np.array([1000, 500], dtype=np.int32)
14
+ reference_time = pd.Timestamp(start)
15
+ temperature = 15 + 8 * np.random.randn(720, 1440, len(time), len(level))
16
+ precipitation = 10 * np.random.rand(720, 1440, len(time), len(level))
17
+ return xr.Dataset(
18
+ data_vars=dict(
19
+ temperature=(["lat", "lon", "time", "level"], temperature),
20
+ precipitation=(["lat", "lon", "time", "level"], precipitation),
21
+ ),
22
+ coords=dict(
23
+ lat=lat,
24
+ lon=lon,
25
+ time=time,
26
+ level=level,
27
+ reference_time=reference_time,
28
+ ),
29
+ attrs=dict(description="Random weather."),
30
+ )
31
+
32
+
33
+ def create_large_dataset(time_steps=1000, lat_points=100, lon_points=100):
34
+ """Create a large xarray dataset for memory testing."""
35
+ np.random.seed(42)
36
+
37
+ time = pd.date_range("2020-01-01", periods=time_steps, freq="h")
38
+ lat = np.linspace(-90, 90, lat_points)
39
+ lon = np.linspace(-180, 180, lon_points)
40
+
41
+ temp_data = np.random.rand(time_steps, lat_points, lon_points) * 40 - 10
42
+ precip_data = np.random.rand(time_steps, lat_points, lon_points) * 100
43
+
44
+ return xr.Dataset(
45
+ {
46
+ "temperature": (["time", "lat", "lon"], temp_data),
47
+ "precipitation": (["time", "lat", "lon"], precip_data),
48
+ },
49
+ coords={"time": time, "lat": lat, "lon": lon},
50
+ )
51
+
52
+
53
+ @pytest.fixture
54
+ def air():
55
+ ds = xr.tutorial.open_dataset("air_temperature")
56
+ chunks = {"time": 240}
57
+ return ds.chunk(chunks)
58
+
59
+
60
+ @pytest.fixture
61
+ def air_small(air):
62
+ return air.isel(
63
+ time=slice(0, 12), lat=slice(0, 11), lon=slice(0, 10)
64
+ ).chunk({"time": 240})
65
+
66
+
67
+ @pytest.fixture
68
+ def randwx():
69
+ return rand_wx("1995-01-13T00", "1995-01-13T01")
70
+
71
+
72
+ @pytest.fixture
73
+ def large_ds():
74
+ return create_large_dataset().chunk({"time": 25})
75
+
76
+
77
+ @pytest.fixture
78
+ def air_dataset_small():
79
+ ds = xr.tutorial.open_dataset("air_temperature").chunk({"time": 240})
80
+ return ds.isel(time=slice(0, 12), lat=slice(0, 11), lon=slice(0, 10))
81
+
82
+
83
+ @pytest.fixture
84
+ def air_dataset_large():
85
+ return xr.tutorial.open_dataset("air_temperature").chunk({"time": 240})
86
+
87
+
88
+ @pytest.fixture
89
+ def rasm_ds():
90
+ """rasm uses cftime.DatetimeNoLeap (noleap / 365_day) for time."""
91
+ return xr.tutorial.open_dataset("rasm")
92
+
93
+
94
+ @pytest.fixture
95
+ def weather_dataset():
96
+ ds = rand_wx("2023-01-01T00", "2023-01-01T12")
97
+ return ds.isel(time=slice(0, 6), lat=slice(0, 10), lon=slice(0, 10)).chunk(
98
+ {"time": 3}
99
+ )
100
+
101
+
102
+ @pytest.fixture
103
+ def synthetic_dataset():
104
+ return create_large_dataset(
105
+ time_steps=50, lat_points=20, lon_points=20
106
+ ).chunk({"time": 25})
107
+
108
+
109
+ @pytest.fixture
110
+ def station_dataset():
111
+ return xr.Dataset(
112
+ {
113
+ "station_id": (["station"], [1, 2, 3, 4, 5]),
114
+ "elevation": (["station"], [100, 250, 500, 750, 1000]),
115
+ "name": (
116
+ ["station"],
117
+ [
118
+ "Station_A",
119
+ "Station_B",
120
+ "Station_C",
121
+ "Station_D",
122
+ "Station_E",
123
+ ],
124
+ ),
125
+ }
126
+ ).chunk({"station": 5})
127
+
128
+
129
+ @pytest.fixture
130
+ def air_and_stations():
131
+ air = (
132
+ xr.tutorial.open_dataset("air_temperature")
133
+ .isel(time=slice(0, 12), lat=slice(0, 5), lon=slice(0, 8))
134
+ .chunk({"time": 6})
135
+ )
136
+ stations = xr.Dataset(
137
+ {
138
+ "station_id": (["station"], [101, 102, 103]),
139
+ "lat": (
140
+ ["station"],
141
+ [air.lat.values[0], air.lat.values[2], air.lat.values[4]],
142
+ ),
143
+ "lon": (
144
+ ["station"],
145
+ [air.lon.values[1], air.lon.values[3], air.lon.values[5]],
146
+ ),
147
+ "elevation": (["station"], [100, 250, 500]),
148
+ }
149
+ ).chunk({"station": 3})
150
+ return air, stations