xarray_sql 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/Cargo.lock +1 -1
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/Cargo.toml +1 -1
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/PKG-INFO +88 -43
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/README.md +87 -42
- xarray_sql-0.2.3/docs/examples.md +93 -0
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/pyproject.toml +7 -5
- xarray_sql-0.2.3/tests/conftest.py +150 -0
- xarray_sql-0.2.3/tests/test_cft.py +170 -0
- xarray_sql-0.2.3/tests/test_df.py +443 -0
- xarray_sql-0.2.3/tests/test_reader.py +1377 -0
- xarray_sql-0.2.3/tests/test_sql.py +439 -0
- xarray_sql-0.2.3/xarray_sql/cftime.py +248 -0
- xarray_sql-0.2.3/xarray_sql/core.py +49 -0
- xarray_sql-0.2.3/xarray_sql/df.py +447 -0
- xarray_sql-0.2.3/xarray_sql/reader.py +305 -0
- xarray_sql-0.2.3/xarray_sql/sql.py +129 -0
- xarray_sql-0.2.2/docs/examples.md +0 -23
- xarray_sql-0.2.2/tests/conftest.py +0 -144
- xarray_sql-0.2.2/tests/test_cft.py +0 -176
- xarray_sql-0.2.2/tests/test_df.py +0 -428
- xarray_sql-0.2.2/tests/test_reader.py +0 -1372
- xarray_sql-0.2.2/tests/test_sql.py +0 -318
- xarray_sql-0.2.2/xarray_sql/cftime.py +0 -248
- xarray_sql-0.2.2/xarray_sql/core.py +0 -49
- xarray_sql-0.2.2/xarray_sql/df.py +0 -445
- xarray_sql-0.2.2/xarray_sql/reader.py +0 -299
- xarray_sql-0.2.2/xarray_sql/sql.py +0 -63
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/.gitignore +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/LICENSE +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/docs/assets/logo.svg +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/docs/contributing.md +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/docs/index.md +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/docs/reference/xarray_sql.md +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/src/lib.rs +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/tests/__init__.py +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/xarray_sql/__init__.py +0 -0
- {xarray_sql-0.2.2 → xarray_sql-0.2.3}/zensical.toml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xarray_sql
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Classifier: Development Status :: 4 - Beta
|
|
5
5
|
Classifier: Intended Audience :: Science/Research
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -62,52 +62,97 @@ This is an experiment to provide a SQL interface for array datasets.
|
|
|
62
62
|
import xarray as xr
|
|
63
63
|
import xarray_sql as xql
|
|
64
64
|
|
|
65
|
-
ds = xr.tutorial.open_dataset('air_temperature')
|
|
66
65
|
|
|
67
|
-
#
|
|
66
|
+
# Open a year of ARCO-ERA5 — all 273 variables. Selecting a year up front
|
|
67
|
+
# keeps Dask's partition setup cheap before any chunks are read from GCS.
|
|
68
|
+
ds = (
|
|
69
|
+
xr.open_zarr('gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
|
|
70
|
+
chunks=dict(time=1),
|
|
71
|
+
storage_options={'token': 'anon'}) # Anonymous read from the public GCS bucket — no auth required.
|
|
72
|
+
.sel(time='2020')
|
|
73
|
+
)
|
|
74
|
+
|
|
68
75
|
ctx = xql.XarrayContext()
|
|
69
|
-
ctx.from_dataset('
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
#
|
|
81
|
-
|
|
82
|
-
#
|
|
83
|
-
#
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
#
|
|
93
|
-
#
|
|
94
|
-
|
|
95
|
-
#
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
#
|
|
105
|
-
#
|
|
106
|
-
#
|
|
76
|
+
ctx.from_dataset('era5', ds, table_names={
|
|
77
|
+
('time', 'latitude', 'longitude'): 'surface',
|
|
78
|
+
('time', 'level', 'latitude', 'longitude'): 'atmosphere',
|
|
79
|
+
})
|
|
80
|
+
# Registration: ~0.5s for a full year of hourly ERA5, all variables.
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
|
|
84
|
+
# pushes column projection down to Zarr, so SELECT only fetches what you ask
|
|
85
|
+
# for — but `SELECT * FROM era5.surface` would try to pull every variable
|
|
86
|
+
# across the year (terabytes from GCS).
|
|
87
|
+
# ---> Always SELECT specific columns. <---
|
|
88
|
+
|
|
89
|
+
# Average 2m-temperature over NYC on the morning of 2020-01-01. The library
|
|
90
|
+
# pushes WHERE clauses on dimension columns down to partition pruning.
|
|
91
|
+
ctx.sql('''
|
|
92
|
+
SELECT AVG("2m_temperature") - 273.15 AS avg_c
|
|
93
|
+
FROM era5.surface
|
|
94
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
95
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
96
|
+
AND latitude BETWEEN 39 AND 40
|
|
97
|
+
AND longitude BETWEEN 286 AND 287 -- ERA5 uses 0-360 longitudes
|
|
98
|
+
''').to_pandas()
|
|
99
|
+
# avg_c
|
|
100
|
+
# 0 8.640069
|
|
101
|
+
|
|
102
|
+
# Average temperature per pressure level, globally.
|
|
103
|
+
ctx.sql('''
|
|
104
|
+
SELECT level, AVG(temperature) - 273.15 AS avg_c
|
|
105
|
+
FROM era5.atmosphere
|
|
106
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
107
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
108
|
+
GROUP BY level
|
|
109
|
+
ORDER BY level DESC
|
|
110
|
+
''').to_pandas()
|
|
111
|
+
# level avg_c
|
|
112
|
+
# 0 1000 6.621012 ← surface
|
|
113
|
+
# 1 975 5.185638
|
|
114
|
+
# 2 950 4.028429
|
|
115
|
+
# 3 925 3.082812
|
|
116
|
+
# 4 900 2.210917
|
|
117
|
+
# 5 875 1.395018
|
|
118
|
+
# 6 850 0.634267
|
|
119
|
+
# 7 825 -0.210372
|
|
120
|
+
# 8 800 -1.181075
|
|
121
|
+
# 9 775 -2.306465
|
|
122
|
+
# 10 750 -3.535534
|
|
123
|
+
# 11 700 -6.241685
|
|
124
|
+
# 12 650 -9.236364
|
|
125
|
+
# 13 600 -12.580938
|
|
126
|
+
# 14 550 -16.335386
|
|
127
|
+
# 15 500 -20.643604
|
|
128
|
+
# 16 450 -25.573401
|
|
129
|
+
# 17 400 -31.156920
|
|
130
|
+
# 18 350 -37.400552
|
|
131
|
+
# 19 300 -43.852607
|
|
132
|
+
# 20 250 -49.322132
|
|
133
|
+
# 21 225 -51.569113
|
|
134
|
+
# 22 200 -53.693248
|
|
135
|
+
# 23 175 -55.890484
|
|
136
|
+
# 24 150 -58.382290
|
|
137
|
+
# 25 125 -61.091916
|
|
138
|
+
# 26 100 -63.624885 ← tropopause
|
|
139
|
+
# 27 70 -63.182300
|
|
140
|
+
# 28 50 -60.124845
|
|
141
|
+
# 29 30 -55.986327
|
|
142
|
+
# 30 20 -52.433089
|
|
143
|
+
# 31 10 -44.140750
|
|
144
|
+
# 32 7 -38.707350
|
|
145
|
+
# 33 5 -32.621999
|
|
146
|
+
# 34 3 -21.509175
|
|
147
|
+
# 35 2 -13.355764
|
|
148
|
+
# 36 1 -9.020513 ← top of atmosphere
|
|
107
149
|
```
|
|
108
150
|
|
|
109
|
-
|
|
110
|
-
|
|
151
|
+
_(A runnable version of this example lives at
|
|
152
|
+
[`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
|
|
153
|
+
|
|
154
|
+
Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
|
|
155
|
+
SQL queries against them.
|
|
111
156
|
|
|
112
157
|
## Why build this?
|
|
113
158
|
|
|
@@ -19,52 +19,97 @@ This is an experiment to provide a SQL interface for array datasets.
|
|
|
19
19
|
import xarray as xr
|
|
20
20
|
import xarray_sql as xql
|
|
21
21
|
|
|
22
|
-
ds = xr.tutorial.open_dataset('air_temperature')
|
|
23
22
|
|
|
24
|
-
#
|
|
23
|
+
# Open a year of ARCO-ERA5 — all 273 variables. Selecting a year up front
|
|
24
|
+
# keeps Dask's partition setup cheap before any chunks are read from GCS.
|
|
25
|
+
ds = (
|
|
26
|
+
xr.open_zarr('gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
|
|
27
|
+
chunks=dict(time=1),
|
|
28
|
+
storage_options={'token': 'anon'}) # Anonymous read from the public GCS bucket — no auth required.
|
|
29
|
+
.sel(time='2020')
|
|
30
|
+
)
|
|
31
|
+
|
|
25
32
|
ctx = xql.XarrayContext()
|
|
26
|
-
ctx.from_dataset('
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
#
|
|
38
|
-
|
|
39
|
-
#
|
|
40
|
-
#
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
#
|
|
50
|
-
#
|
|
51
|
-
|
|
52
|
-
#
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
#
|
|
62
|
-
#
|
|
63
|
-
#
|
|
33
|
+
ctx.from_dataset('era5', ds, table_names={
|
|
34
|
+
('time', 'latitude', 'longitude'): 'surface',
|
|
35
|
+
('time', 'level', 'latitude', 'longitude'): 'atmosphere',
|
|
36
|
+
})
|
|
37
|
+
# Registration: ~0.5s for a full year of hourly ERA5, all variables.
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
|
|
41
|
+
# pushes column projection down to Zarr, so SELECT only fetches what you ask
|
|
42
|
+
# for — but `SELECT * FROM era5.surface` would try to pull every variable
|
|
43
|
+
# across the year (terabytes from GCS).
|
|
44
|
+
# ---> Always SELECT specific columns. <---
|
|
45
|
+
|
|
46
|
+
# Average 2m-temperature over NYC on the morning of 2020-01-01. The library
|
|
47
|
+
# pushes WHERE clauses on dimension columns down to partition pruning.
|
|
48
|
+
ctx.sql('''
|
|
49
|
+
SELECT AVG("2m_temperature") - 273.15 AS avg_c
|
|
50
|
+
FROM era5.surface
|
|
51
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
52
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
53
|
+
AND latitude BETWEEN 39 AND 40
|
|
54
|
+
AND longitude BETWEEN 286 AND 287 -- ERA5 uses 0-360 longitudes
|
|
55
|
+
''').to_pandas()
|
|
56
|
+
# avg_c
|
|
57
|
+
# 0 8.640069
|
|
58
|
+
|
|
59
|
+
# Average temperature per pressure level, globally.
|
|
60
|
+
ctx.sql('''
|
|
61
|
+
SELECT level, AVG(temperature) - 273.15 AS avg_c
|
|
62
|
+
FROM era5.atmosphere
|
|
63
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
64
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
65
|
+
GROUP BY level
|
|
66
|
+
ORDER BY level DESC
|
|
67
|
+
''').to_pandas()
|
|
68
|
+
# level avg_c
|
|
69
|
+
# 0 1000 6.621012 ← surface
|
|
70
|
+
# 1 975 5.185638
|
|
71
|
+
# 2 950 4.028429
|
|
72
|
+
# 3 925 3.082812
|
|
73
|
+
# 4 900 2.210917
|
|
74
|
+
# 5 875 1.395018
|
|
75
|
+
# 6 850 0.634267
|
|
76
|
+
# 7 825 -0.210372
|
|
77
|
+
# 8 800 -1.181075
|
|
78
|
+
# 9 775 -2.306465
|
|
79
|
+
# 10 750 -3.535534
|
|
80
|
+
# 11 700 -6.241685
|
|
81
|
+
# 12 650 -9.236364
|
|
82
|
+
# 13 600 -12.580938
|
|
83
|
+
# 14 550 -16.335386
|
|
84
|
+
# 15 500 -20.643604
|
|
85
|
+
# 16 450 -25.573401
|
|
86
|
+
# 17 400 -31.156920
|
|
87
|
+
# 18 350 -37.400552
|
|
88
|
+
# 19 300 -43.852607
|
|
89
|
+
# 20 250 -49.322132
|
|
90
|
+
# 21 225 -51.569113
|
|
91
|
+
# 22 200 -53.693248
|
|
92
|
+
# 23 175 -55.890484
|
|
93
|
+
# 24 150 -58.382290
|
|
94
|
+
# 25 125 -61.091916
|
|
95
|
+
# 26 100 -63.624885 ← tropopause
|
|
96
|
+
# 27 70 -63.182300
|
|
97
|
+
# 28 50 -60.124845
|
|
98
|
+
# 29 30 -55.986327
|
|
99
|
+
# 30 20 -52.433089
|
|
100
|
+
# 31 10 -44.140750
|
|
101
|
+
# 32 7 -38.707350
|
|
102
|
+
# 33 5 -32.621999
|
|
103
|
+
# 34 3 -21.509175
|
|
104
|
+
# 35 2 -13.355764
|
|
105
|
+
# 36 1 -9.020513 ← top of atmosphere
|
|
64
106
|
```
|
|
65
107
|
|
|
66
|
-
|
|
67
|
-
|
|
108
|
+
_(A runnable version of this example lives at
|
|
109
|
+
[`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
|
|
110
|
+
|
|
111
|
+
Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
|
|
112
|
+
SQL queries against them.
|
|
68
113
|
|
|
69
114
|
## Why build this?
|
|
70
115
|
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# Examples
|
|
2
|
+
|
|
3
|
+
```python
|
|
4
|
+
import xarray as xr
|
|
5
|
+
import xarray_sql as xql
|
|
6
|
+
|
|
7
|
+
ds = xr.tutorial.open_dataset('air_temperature')
|
|
8
|
+
|
|
9
|
+
ctx = xql.XarrayContext()
|
|
10
|
+
ctx.from_dataset('air', ds, chunks=dict(time=24))
|
|
11
|
+
|
|
12
|
+
result = ctx.sql('''
|
|
13
|
+
SELECT
|
|
14
|
+
"lat", "lon", AVG("air") as air_avg
|
|
15
|
+
FROM
|
|
16
|
+
"air"
|
|
17
|
+
GROUP BY
|
|
18
|
+
"lat", "lon"
|
|
19
|
+
''')
|
|
20
|
+
|
|
21
|
+
df = result.to_pandas()
|
|
22
|
+
df.head()
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Mixed-dimension datasets: ARCO-ERA5
|
|
26
|
+
|
|
27
|
+
When a Dataset has variables with differing dimensions (e.g. surface fields on
|
|
28
|
+
`(time, latitude, longitude)` and atmospheric fields on
|
|
29
|
+
`(time, level, latitude, longitude)`), `from_dataset` splits them into one
|
|
30
|
+
table per dimension group, registered together under a SQL schema named after
|
|
31
|
+
the first argument. [ARCO-ERA5][arco-era5] is a good example: 262 of its
|
|
32
|
+
variables are surface fields and 11 are atmospheric.
|
|
33
|
+
|
|
34
|
+
Open a year of ARCO-ERA5 and let SQL `WHERE` clauses do the filtering — the
|
|
35
|
+
library prunes time partitions and pushes dimension-column filters down. Use
|
|
36
|
+
the `table_names` kwarg to give each dimension group a friendly name:
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
import xarray as xr
|
|
40
|
+
import xarray_sql as xql
|
|
41
|
+
|
|
42
|
+
# Open ARCO-ERA5 directly from GCS (anonymous read).
|
|
43
|
+
url = 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3'
|
|
44
|
+
full = xr.open_zarr(url, chunks=None, storage_options={'token': 'anon'})
|
|
45
|
+
|
|
46
|
+
# A full year of hourly ERA5 — all 273 variables. No spatial slicing on the
|
|
47
|
+
# xarray side; SQL WHERE clauses below express the filters. `chunks={'time': 1}`
|
|
48
|
+
# aligns Dask chunks to native Zarr chunks of shape (1, 37, 721, 1440) so
|
|
49
|
+
# chunk reads from GCS happen concurrently.
|
|
50
|
+
#
|
|
51
|
+
# Heads up: 262 of those variables are surface and 11 are atmospheric. The
|
|
52
|
+
# library pushes column projection down, so SELECT only fetches what you ask
|
|
53
|
+
# for — but `SELECT * FROM era5.surface` would try to pull every variable
|
|
54
|
+
# across the year (terabytes from GCS). Always SELECT specific columns.
|
|
55
|
+
ds = full.sel(time='2020').chunk({'time': 1})
|
|
56
|
+
|
|
57
|
+
ctx = xql.XarrayContext()
|
|
58
|
+
ctx.from_dataset('era5', ds, table_names={
|
|
59
|
+
('time', 'latitude', 'longitude'): 'surface',
|
|
60
|
+
('time', 'level', 'latitude', 'longitude'): 'atmosphere',
|
|
61
|
+
})
|
|
62
|
+
# Registers two tables under a SQL schema named 'era5': 'surface' and 'atmosphere'.
|
|
63
|
+
|
|
64
|
+
# Average 2m-temperature over the NYC area on the morning of 2020-01-01.
|
|
65
|
+
ctx.sql('''
|
|
66
|
+
SELECT AVG("2m_temperature") - 273.15 AS avg_c
|
|
67
|
+
FROM era5.surface
|
|
68
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
69
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
70
|
+
AND latitude BETWEEN 39 AND 40
|
|
71
|
+
AND longitude BETWEEN 286 AND 287
|
|
72
|
+
''').to_pandas()
|
|
73
|
+
|
|
74
|
+
# Average temperature per pressure level, globally — the standard
|
|
75
|
+
# atmospheric temperature profile. Scans ~230M rows.
|
|
76
|
+
ctx.sql('''
|
|
77
|
+
SELECT level, AVG(temperature) - 273.15 AS avg_c
|
|
78
|
+
FROM era5.atmosphere
|
|
79
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
80
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
81
|
+
GROUP BY level
|
|
82
|
+
ORDER BY level DESC -- surface (1000 hPa) first
|
|
83
|
+
''').to_pandas()
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
If you omit `table_names`, each table is named by joining its dimension names
|
|
87
|
+
with underscores, e.g. `era5.time_latitude_longitude` and
|
|
88
|
+
`era5.time_level_latitude_longitude`.
|
|
89
|
+
|
|
90
|
+
A runnable version of this example lives at
|
|
91
|
+
[`perf_tests/era5_temp_profile.py`](../perf_tests/era5_temp_profile.py).
|
|
92
|
+
|
|
93
|
+
[arco-era5]: https://github.com/google-research/arco-era5
|
|
@@ -64,11 +64,13 @@ module-name = "xarray_sql._native"
|
|
|
64
64
|
[tool.setuptools.packages.find]
|
|
65
65
|
exclude = ["demo", "perf_tests", "tests", "tests.*"]
|
|
66
66
|
|
|
67
|
-
[tool.
|
|
67
|
+
[tool.ruff]
|
|
68
68
|
line-length = 80
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
69
|
+
indent-width = 4
|
|
70
|
+
|
|
71
|
+
[tool.ruff.format]
|
|
72
|
+
indent-style = "space"
|
|
73
|
+
quote-style = "double"
|
|
72
74
|
|
|
73
75
|
[tool.mypy]
|
|
74
76
|
python_version = "3.11"
|
|
@@ -98,7 +100,7 @@ dev = [
|
|
|
98
100
|
"xarray_sql[test]",
|
|
99
101
|
"xarray_sql[docs]",
|
|
100
102
|
"py-spy>=0.4.0",
|
|
101
|
-
"
|
|
103
|
+
"ruff>=0.15.10",
|
|
102
104
|
"maturin>=1.9.1",
|
|
103
105
|
]
|
|
104
106
|
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import xarray as xr
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def rand_wx(start: str, end: str) -> xr.Dataset:
    """Build a seeded, random weather Dataset on a global lat/lon grid.

    The Dataset has two variables, ``temperature`` and ``precipitation``,
    both laid out as ``(lat, lon, time, level)`` on a 720 x 1440 grid with
    hourly timestamps from ``start`` to ``end`` and two levels (1000, 500).

    Args:
        start: First timestamp, passed to ``pd.date_range``.
        end: Last timestamp, passed to ``pd.date_range``.

    Returns:
        The generated ``xr.Dataset``; ``reference_time`` is a scalar
        coordinate equal to ``pd.Timestamp(start)``.
    """
    # Re-seed NumPy's global generator so every call yields the same data.
    np.random.seed(42)

    n_lat, n_lon = 720, 1440
    dims = ["lat", "lon", "time", "level"]

    hours = pd.date_range(start, end, freq="h")
    levels = np.array([1000, 500], dtype=np.int32)
    shape = (n_lat, n_lon, len(hours), len(levels))

    # Draw order matters for reproducibility: the normal draw for
    # temperature happens before the uniform draw for precipitation.
    temperature = 15 + 8 * np.random.randn(*shape)
    precipitation = 10 * np.random.rand(*shape)

    variables = {
        "temperature": (list(dims), temperature),
        "precipitation": (list(dims), precipitation),
    }
    coordinates = {
        "lat": np.linspace(-90, 90, num=n_lat),
        "lon": np.linspace(-180, 180, num=n_lon),
        "time": hours,
        "level": levels,
        "reference_time": pd.Timestamp(start),
    }
    return xr.Dataset(
        data_vars=variables,
        coords=coordinates,
        attrs={"description": "Random weather."},
    )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def create_large_dataset(time_steps=1000, lat_points=100, lon_points=100):
    """Create a seeded xarray Dataset sized for memory testing.

    Args:
        time_steps: Number of hourly timestamps, starting at 2020-01-01.
        lat_points: Number of latitude samples spread over [-90, 90].
        lon_points: Number of longitude samples spread over [-180, 180].

    Returns:
        An ``xr.Dataset`` with ``temperature`` (uniform in [-10, 30)) and
        ``precipitation`` (uniform in [0, 100)), both on
        ``(time, lat, lon)``.
    """
    # Fixed seed keeps the generated values identical across calls.
    np.random.seed(42)

    dims = ["time", "lat", "lon"]
    shape = (time_steps, lat_points, lon_points)

    # Temperature is drawn before precipitation; keep this order so the
    # seeded values stay stable.
    temperature = np.random.rand(*shape) * 40 - 10
    precipitation = np.random.rand(*shape) * 100

    coordinates = dict(
        time=pd.date_range("2020-01-01", periods=time_steps, freq="h"),
        lat=np.linspace(-90, 90, lat_points),
        lon=np.linspace(-180, 180, lon_points),
    )
    variables = dict(
        temperature=(list(dims), temperature),
        precipitation=(list(dims), precipitation),
    )
    return xr.Dataset(variables, coords=coordinates)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@pytest.fixture
def air():
    """Provide the ``air_temperature`` tutorial dataset, chunked on time.

    The dataset is opened via ``xr.tutorial.open_dataset`` and re-chunked
    to 240 time steps per chunk.
    """
    return xr.tutorial.open_dataset("air_temperature").chunk({"time": 240})
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@pytest.fixture
def air_small(air):
    """Provide a small corner of the ``air`` fixture.

    Takes the first 12 time steps, 11 latitudes and 10 longitudes by
    position, then re-chunks the result to 240 time steps per chunk.
    """
    window = {
        "time": slice(0, 12),
        "lat": slice(0, 11),
        "lon": slice(0, 10),
    }
    subset = air.isel(**window)
    return subset.chunk({"time": 240})
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@pytest.fixture
def randwx():
    """Provide a ``rand_wx`` Dataset from 1995-01-13T00 to 1995-01-13T01."""
    start, end = "1995-01-13T00", "1995-01-13T01"
    return rand_wx(start, end)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@pytest.fixture
def large_ds():
    """Provide the default ``create_large_dataset`` output, chunked on time.

    The dataset is built with the helper's default sizes and chunked to
    25 time steps per chunk.
    """
    dataset = create_large_dataset()
    return dataset.chunk({"time": 25})
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@pytest.fixture
def air_dataset_small():
    """Provide a small slice of the chunked ``air_temperature`` dataset.

    The tutorial dataset is chunked to 240 time steps first, and only then
    sliced to the first 12 time steps, 11 latitudes and 10 longitudes.
    """
    chunked = xr.tutorial.open_dataset("air_temperature").chunk({"time": 240})
    window = dict(time=slice(0, 12), lat=slice(0, 11), lon=slice(0, 10))
    return chunked.isel(**window)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@pytest.fixture
def air_dataset_large():
    """Provide the full ``air_temperature`` tutorial dataset.

    The dataset is chunked to 240 time steps per chunk and not sliced.
    """
    dataset = xr.tutorial.open_dataset("air_temperature")
    return dataset.chunk({"time": 240})
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@pytest.fixture
def rasm_ds():
    """Provide the ``rasm`` tutorial dataset.

    rasm uses cftime.DatetimeNoLeap (noleap / 365_day) for time.
    """
    dataset_name = "rasm"
    return xr.tutorial.open_dataset(dataset_name)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@pytest.fixture
def weather_dataset():
    """Provide a small, chunked corner of a ``rand_wx`` Dataset.

    Generates 2023-01-01T00 through 2023-01-01T12, keeps the first 6 time
    steps and a 10 x 10 lat/lon corner, and chunks to 3 time steps.
    """
    full = rand_wx("2023-01-01T00", "2023-01-01T12")
    corner = full.isel(
        time=slice(0, 6),
        lat=slice(0, 10),
        lon=slice(0, 10),
    )
    return corner.chunk({"time": 3})
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@pytest.fixture
def synthetic_dataset():
    """Provide a reduced ``create_large_dataset`` output.

    Uses 50 time steps on a 20 x 20 grid, chunked to 25 time steps.
    """
    sizes = {"time_steps": 50, "lat_points": 20, "lon_points": 20}
    dataset = create_large_dataset(**sizes)
    return dataset.chunk({"time": 25})
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@pytest.fixture
def station_dataset():
    """Provide a five-station Dataset with one ``station`` dimension.

    Variables are ``station_id``, ``elevation`` and ``name``; the whole
    dataset sits in a single chunk of 5 stations.
    """
    station_ids = [1, 2, 3, 4, 5]
    elevations = [100, 250, 500, 750, 1000]
    names = [
        "Station_A",
        "Station_B",
        "Station_C",
        "Station_D",
        "Station_E",
    ]
    stations = xr.Dataset(
        {
            "station_id": (["station"], station_ids),
            "elevation": (["station"], elevations),
            "name": (["station"], names),
        }
    )
    return stations.chunk({"station": 5})
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@pytest.fixture
def air_and_stations():
    """Provide a small air-temperature grid and stations located on it.

    Returns:
        A ``(air, stations)`` tuple. ``air`` is the first 12 time steps,
        5 latitudes and 8 longitudes of the ``air_temperature`` tutorial
        dataset, chunked to 6 time steps. ``stations`` holds three
        stations whose ``lat``/``lon`` values are copied from the grid's
        own coordinates.
    """
    grid = xr.tutorial.open_dataset("air_temperature")
    grid = grid.isel(time=slice(0, 12), lat=slice(0, 5), lon=slice(0, 8))
    grid = grid.chunk({"time": 6})

    # Station positions reuse exact grid coordinate values.
    grid_lat = grid.lat.values
    grid_lon = grid.lon.values
    station_lat = [grid_lat[0], grid_lat[2], grid_lat[4]]
    station_lon = [grid_lon[1], grid_lon[3], grid_lon[5]]

    station_vars = {
        "station_id": (["station"], [101, 102, 103]),
        "lat": (["station"], station_lat),
        "lon": (["station"], station_lon),
        "elevation": (["station"], [100, 250, 500]),
    }
    points = xr.Dataset(station_vars).chunk({"station": 3})
    return grid, points
|