xarray_sql 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/Cargo.lock +1 -1
  2. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/Cargo.toml +1 -1
  3. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/PKG-INFO +88 -43
  4. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/README.md +87 -42
  5. xarray_sql-0.2.3/docs/examples.md +93 -0
  6. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/pyproject.toml +7 -5
  7. xarray_sql-0.2.3/tests/conftest.py +150 -0
  8. xarray_sql-0.2.3/tests/test_cft.py +170 -0
  9. xarray_sql-0.2.3/tests/test_df.py +443 -0
  10. xarray_sql-0.2.3/tests/test_reader.py +1377 -0
  11. xarray_sql-0.2.3/tests/test_sql.py +439 -0
  12. xarray_sql-0.2.3/xarray_sql/cftime.py +248 -0
  13. xarray_sql-0.2.3/xarray_sql/core.py +49 -0
  14. xarray_sql-0.2.3/xarray_sql/df.py +447 -0
  15. xarray_sql-0.2.3/xarray_sql/reader.py +305 -0
  16. xarray_sql-0.2.3/xarray_sql/sql.py +129 -0
  17. xarray_sql-0.2.2/docs/examples.md +0 -23
  18. xarray_sql-0.2.2/tests/conftest.py +0 -144
  19. xarray_sql-0.2.2/tests/test_cft.py +0 -176
  20. xarray_sql-0.2.2/tests/test_df.py +0 -428
  21. xarray_sql-0.2.2/tests/test_reader.py +0 -1372
  22. xarray_sql-0.2.2/tests/test_sql.py +0 -318
  23. xarray_sql-0.2.2/xarray_sql/cftime.py +0 -248
  24. xarray_sql-0.2.2/xarray_sql/core.py +0 -49
  25. xarray_sql-0.2.2/xarray_sql/df.py +0 -445
  26. xarray_sql-0.2.2/xarray_sql/reader.py +0 -299
  27. xarray_sql-0.2.2/xarray_sql/sql.py +0 -63
  28. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/.gitignore +0 -0
  29. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/LICENSE +0 -0
  30. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/docs/assets/logo.svg +0 -0
  31. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/docs/contributing.md +0 -0
  32. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/docs/index.md +0 -0
  33. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/docs/reference/xarray_sql.md +0 -0
  34. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/src/lib.rs +0 -0
  35. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/tests/__init__.py +0 -0
  36. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/xarray_sql/__init__.py +0 -0
  37. {xarray_sql-0.2.2 → xarray_sql-0.2.3}/zensical.toml +0 -0
@@ -3375,7 +3375,7 @@ checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb"
3375
3375
 
3376
3376
  [[package]]
3377
3377
  name = "xarray_sql"
3378
- version = "0.2.2"
3378
+ version = "0.2.3"
3379
3379
  dependencies = [
3380
3380
  "arrow",
3381
3381
  "async-stream",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "xarray_sql"
3
- version = "0.2.2"
3
+ version = "0.2.3"
4
4
  authors = ["Alex Merose"]
5
5
  edition = "2021"
6
6
  exclude = [
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xarray_sql
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Classifier: Development Status :: 4 - Beta
5
5
  Classifier: Intended Audience :: Science/Research
6
6
  Classifier: Intended Audience :: Developers
@@ -62,52 +62,97 @@ This is an experiment to provide a SQL interface for array datasets.
62
62
  import xarray as xr
63
63
  import xarray_sql as xql
64
64
 
65
- ds = xr.tutorial.open_dataset('air_temperature')
66
65
 
67
- # The same as a dask-sql Context; i.e. an Apache DataFusion Context.
66
+ # Open a year of ARCO-ERA5 — all 273 variables. Selecting a year up front
67
+ # keeps Dask's partition setup cheap before any chunks are read from GCS.
68
+ ds = (
69
+ xr.open_zarr('gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
70
+ chunks=dict(time=1),
71
+ storage_options={'token': 'anon'}) # Anonymous read from the public GCS bucket — no auth required.
72
+ .sel(time='2020')
73
+ )
74
+
68
75
  ctx = xql.XarrayContext()
69
- ctx.from_dataset('air', ds, chunks=dict(time=24)) # the dataset needs to be chunked!
70
- # data is only materialized when we make a query.
71
-
72
- result = ctx.sql('''
73
- SELECT
74
- "lat", "lon", AVG("air") as air_avg
75
- FROM
76
- "air"
77
- GROUP BY
78
- "lat", "lon"
79
- ''')
80
- # DataFrame()
81
- # +------+-------+--------------------+
82
- # | lat | lon | air_avg |
83
- # +------+-------+--------------------+
84
- # | 75.0 | 205.0 | 259.88662671232834 |
85
- # | 75.0 | 207.5 | 259.48268150684896 |
86
- # | 75.0 | 230.0 | 258.9192123287667 |
87
- # | 75.0 | 275.0 | 257.07574315068456 |
88
- # | 75.0 | 322.5 | 250.11792123287654 |
89
- # | 75.0 | 325.0 | 250.81590068493134 |
90
- # | 72.5 | 205.0 | 262.74933904109537 |
91
- # | 72.5 | 207.5 | 262.5384315068488 |
92
- # | 72.5 | 230.0 | 260.82879452054743 |
93
- # | 72.5 | 275.0 | 257.3063321917804 |
94
- # +------+-------+--------------------+
95
- # Data truncated.
96
-
97
- # The full query is only made when we call `collect()`, or, in this case,
98
- # `to_pandas()`.
99
- df = result.to_pandas()
100
- df.head()
101
- # lat lon air_avg
102
- # 0 75.0 232.5 258.836188
103
- # 1 75.0 247.5 257.716171
104
- # 2 75.0 262.5 257.347959
105
- # 3 75.0 277.5 257.671308
106
- # 4 72.5 232.5 260.654401
76
+ ctx.from_dataset('era5', ds, table_names={
77
+ ('time', 'latitude', 'longitude'): 'surface',
78
+ ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
79
+ })
80
+ # Registration: ~0.5s for a full year of hourly ERA5, all variables.
81
+
82
+
83
+ # Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
84
+ # pushes column projection down to Zarr, so SELECT only fetches what you ask
85
+ # for — but `SELECT * FROM era5.surface` would try to pull every variable
86
+ # across the year (terabytes from GCS).
87
+ # ---> Always SELECT specific columns. <---
88
+
89
+ # Average 2m-temperature over NYC on the morning of 2020-01-01. The library
90
+ # pushes WHERE clauses on dimension columns down to partition pruning.
91
+ ctx.sql('''
92
+ SELECT AVG("2m_temperature") - 273.15 AS avg_c
93
+ FROM era5.surface
94
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
95
+ AND TIMESTAMP '2020-01-01 05:00:00'
96
+ AND latitude BETWEEN 39 AND 40
97
+ AND longitude BETWEEN 286 AND 287 -- ERA5 uses 0-360 longitudes
98
+ ''').to_pandas()
99
+ # avg_c
100
+ # 0 8.640069
101
+
102
+ # Average temperature per pressure level, globally.
103
+ ctx.sql('''
104
+ SELECT level, AVG(temperature) - 273.15 AS avg_c
105
+ FROM era5.atmosphere
106
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
107
+ AND TIMESTAMP '2020-01-01 05:00:00'
108
+ GROUP BY level
109
+ ORDER BY level DESC
110
+ ''').to_pandas()
111
+ # level avg_c
112
+ # 0 1000 6.621012 ← surface
113
+ # 1 975 5.185638
114
+ # 2 950 4.028429
115
+ # 3 925 3.082812
116
+ # 4 900 2.210917
117
+ # 5 875 1.395018
118
+ # 6 850 0.634267
119
+ # 7 825 -0.210372
120
+ # 8 800 -1.181075
121
+ # 9 775 -2.306465
122
+ # 10 750 -3.535534
123
+ # 11 700 -6.241685
124
+ # 12 650 -9.236364
125
+ # 13 600 -12.580938
126
+ # 14 550 -16.335386
127
+ # 15 500 -20.643604
128
+ # 16 450 -25.573401
129
+ # 17 400 -31.156920
130
+ # 18 350 -37.400552
131
+ # 19 300 -43.852607
132
+ # 20 250 -49.322132
133
+ # 21 225 -51.569113
134
+ # 22 200 -53.693248
135
+ # 23 175 -55.890484
136
+ # 24 150 -58.382290
137
+ # 25 125 -61.091916
138
+ # 26 100 -63.624885 ← tropopause
139
+ # 27 70 -63.182300
140
+ # 28 50 -60.124845
141
+ # 29 30 -55.986327
142
+ # 30 20 -52.433089
143
+ # 31 10 -44.140750
144
+ # 32 7 -38.707350
145
+ # 33 5 -32.621999
146
+ # 34 3 -21.509175
147
+ # 35 2 -13.355764
148
+ # 36 1 -9.020513 ← top of atmosphere
107
149
  ```
108
150
 
109
- Succinctly, we "pivot" Xarray Datasets (with consistent dimensions) to treat them like tables so we can run
110
- SQL queries against them.
151
+ _(A runnable version of this example lives at
152
+ [`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
153
+
154
+ Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
155
+ SQL queries against them.
111
156
 
112
157
  ## Why build this?
113
158
 
@@ -19,52 +19,97 @@ This is an experiment to provide a SQL interface for array datasets.
19
19
  import xarray as xr
20
20
  import xarray_sql as xql
21
21
 
22
- ds = xr.tutorial.open_dataset('air_temperature')
23
22
 
24
- # The same as a dask-sql Context; i.e. an Apache DataFusion Context.
23
+ # Open a year of ARCO-ERA5 — all 273 variables. Selecting a year up front
24
+ # keeps Dask's partition setup cheap before any chunks are read from GCS.
25
+ ds = (
26
+ xr.open_zarr('gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
27
+ chunks=dict(time=1),
28
+ storage_options={'token': 'anon'}) # Anonymous read from the public GCS bucket — no auth required.
29
+ .sel(time='2020')
30
+ )
31
+
25
32
  ctx = xql.XarrayContext()
26
- ctx.from_dataset('air', ds, chunks=dict(time=24)) # the dataset needs to be chunked!
27
- # data is only materialized when we make a query.
28
-
29
- result = ctx.sql('''
30
- SELECT
31
- "lat", "lon", AVG("air") as air_avg
32
- FROM
33
- "air"
34
- GROUP BY
35
- "lat", "lon"
36
- ''')
37
- # DataFrame()
38
- # +------+-------+--------------------+
39
- # | lat | lon | air_avg |
40
- # +------+-------+--------------------+
41
- # | 75.0 | 205.0 | 259.88662671232834 |
42
- # | 75.0 | 207.5 | 259.48268150684896 |
43
- # | 75.0 | 230.0 | 258.9192123287667 |
44
- # | 75.0 | 275.0 | 257.07574315068456 |
45
- # | 75.0 | 322.5 | 250.11792123287654 |
46
- # | 75.0 | 325.0 | 250.81590068493134 |
47
- # | 72.5 | 205.0 | 262.74933904109537 |
48
- # | 72.5 | 207.5 | 262.5384315068488 |
49
- # | 72.5 | 230.0 | 260.82879452054743 |
50
- # | 72.5 | 275.0 | 257.3063321917804 |
51
- # +------+-------+--------------------+
52
- # Data truncated.
53
-
54
- # The full query is only made when we call `collect()`, or, in this case,
55
- # `to_pandas()`.
56
- df = result.to_pandas()
57
- df.head()
58
- # lat lon air_avg
59
- # 0 75.0 232.5 258.836188
60
- # 1 75.0 247.5 257.716171
61
- # 2 75.0 262.5 257.347959
62
- # 3 75.0 277.5 257.671308
63
- # 4 72.5 232.5 260.654401
33
+ ctx.from_dataset('era5', ds, table_names={
34
+ ('time', 'latitude', 'longitude'): 'surface',
35
+ ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
36
+ })
37
+ # Registration: ~0.5s for a full year of hourly ERA5, all variables.
38
+
39
+
40
+ # Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
41
+ # pushes column projection down to Zarr, so SELECT only fetches what you ask
42
+ # for — but `SELECT * FROM era5.surface` would try to pull every variable
43
+ # across the year (terabytes from GCS).
44
+ # ---> Always SELECT specific columns. <---
45
+
46
+ # Average 2m-temperature over NYC on the morning of 2020-01-01. The library
47
+ # pushes WHERE clauses on dimension columns down to partition pruning.
48
+ ctx.sql('''
49
+ SELECT AVG("2m_temperature") - 273.15 AS avg_c
50
+ FROM era5.surface
51
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
52
+ AND TIMESTAMP '2020-01-01 05:00:00'
53
+ AND latitude BETWEEN 39 AND 40
54
+ AND longitude BETWEEN 286 AND 287 -- ERA5 uses 0-360 longitudes
55
+ ''').to_pandas()
56
+ # avg_c
57
+ # 0 8.640069
58
+
59
+ # Average temperature per pressure level, globally.
60
+ ctx.sql('''
61
+ SELECT level, AVG(temperature) - 273.15 AS avg_c
62
+ FROM era5.atmosphere
63
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
64
+ AND TIMESTAMP '2020-01-01 05:00:00'
65
+ GROUP BY level
66
+ ORDER BY level DESC
67
+ ''').to_pandas()
68
+ # level avg_c
69
+ # 0 1000 6.621012 ← surface
70
+ # 1 975 5.185638
71
+ # 2 950 4.028429
72
+ # 3 925 3.082812
73
+ # 4 900 2.210917
74
+ # 5 875 1.395018
75
+ # 6 850 0.634267
76
+ # 7 825 -0.210372
77
+ # 8 800 -1.181075
78
+ # 9 775 -2.306465
79
+ # 10 750 -3.535534
80
+ # 11 700 -6.241685
81
+ # 12 650 -9.236364
82
+ # 13 600 -12.580938
83
+ # 14 550 -16.335386
84
+ # 15 500 -20.643604
85
+ # 16 450 -25.573401
86
+ # 17 400 -31.156920
87
+ # 18 350 -37.400552
88
+ # 19 300 -43.852607
89
+ # 20 250 -49.322132
90
+ # 21 225 -51.569113
91
+ # 22 200 -53.693248
92
+ # 23 175 -55.890484
93
+ # 24 150 -58.382290
94
+ # 25 125 -61.091916
95
+ # 26 100 -63.624885 ← tropopause
96
+ # 27 70 -63.182300
97
+ # 28 50 -60.124845
98
+ # 29 30 -55.986327
99
+ # 30 20 -52.433089
100
+ # 31 10 -44.140750
101
+ # 32 7 -38.707350
102
+ # 33 5 -32.621999
103
+ # 34 3 -21.509175
104
+ # 35 2 -13.355764
105
+ # 36 1 -9.020513 ← top of atmosphere
64
106
  ```
65
107
 
66
- Succinctly, we "pivot" Xarray Datasets (with consistent dimensions) to treat them like tables so we can run
67
- SQL queries against them.
108
+ _(A runnable version of this example lives at
109
+ [`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
110
+
111
+ Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
112
+ SQL queries against them.
68
113
 
69
114
  ## Why build this?
70
115
 
@@ -0,0 +1,93 @@
1
+ # Examples
2
+
3
+ ```python
4
+ import xarray as xr
5
+ import xarray_sql as xql
6
+
7
+ ds = xr.tutorial.open_dataset('air_temperature')
8
+
9
+ ctx = xql.XarrayContext()
10
+ ctx.from_dataset('air', ds, chunks=dict(time=24))
11
+
12
+ result = ctx.sql('''
13
+ SELECT
14
+ "lat", "lon", AVG("air") as air_avg
15
+ FROM
16
+ "air"
17
+ GROUP BY
18
+ "lat", "lon"
19
+ ''')
20
+
21
+ df = result.to_pandas()
22
+ df.head()
23
+ ```
24
+
25
+ ## Mixed-dimension datasets: ARCO-ERA5
26
+
27
+ When a Dataset has variables with differing dimensions (e.g. surface fields on
28
+ `(time, latitude, longitude)` and atmospheric fields on
29
+ `(time, level, latitude, longitude)`), `from_dataset` splits them into one
30
+ table per dimension group, registered together under a SQL schema named after
31
+ the first argument. [ARCO-ERA5][arco-era5] is a good example: 262 of its
32
+ variables are surface fields and 11 are atmospheric.
33
+
34
+ Open a year of ARCO-ERA5 and let SQL `WHERE` clauses do the filtering — the
35
+ library prunes time partitions and pushes dimension-column filters down. Use
36
+ the `table_names` kwarg to give each dimension group a friendly name:
37
+
38
+ ```python
39
+ import xarray as xr
40
+ import xarray_sql as xql
41
+
42
+ # Open ARCO-ERA5 directly from GCS (anonymous read).
43
+ url = 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3'
44
+ full = xr.open_zarr(url, chunks=None, storage_options={'token': 'anon'})
45
+
46
+ # A full year of hourly ERA5 — all 273 variables. No spatial slicing on the
47
+ # xarray side; SQL WHERE clauses below express the filters. `chunks={'time': 1}`
48
+ # aligns Dask chunks to native Zarr chunks of shape (1, 37, 721, 1440) so
49
+ # chunk reads from GCS happen concurrently.
50
+ #
51
+ # Heads up: 262 of those variables are surface and 11 are atmospheric. The
52
+ # library pushes column projection down, so SELECT only fetches what you ask
53
+ # for — but `SELECT * FROM era5.surface` would try to pull every variable
54
+ # across the year (terabytes from GCS). Always SELECT specific columns.
55
+ ds = full.sel(time='2020').chunk({'time': 1})
56
+
57
+ ctx = xql.XarrayContext()
58
+ ctx.from_dataset('era5', ds, table_names={
59
+ ('time', 'latitude', 'longitude'): 'surface',
60
+ ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
61
+ })
62
+ # Registers two tables under a SQL schema named 'era5': 'surface' and 'atmosphere'.
63
+
64
+ # Average 2m-temperature over the NYC area on the morning of 2020-01-01.
65
+ ctx.sql('''
66
+ SELECT AVG("2m_temperature") - 273.15 AS avg_c
67
+ FROM era5.surface
68
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
69
+ AND TIMESTAMP '2020-01-01 05:00:00'
70
+ AND latitude BETWEEN 39 AND 40
71
+ AND longitude BETWEEN 286 AND 287
72
+ ''').to_pandas()
73
+
74
+ # Average temperature per pressure level, globally — the standard
75
+ # atmospheric temperature profile. Scans ~230M rows.
76
+ ctx.sql('''
77
+ SELECT level, AVG(temperature) - 273.15 AS avg_c
78
+ FROM era5.atmosphere
79
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
80
+ AND TIMESTAMP '2020-01-01 05:00:00'
81
+ GROUP BY level
82
+ ORDER BY level DESC -- surface (1000 hPa) first
83
+ ''').to_pandas()
84
+ ```
85
+
86
+ If you omit `table_names`, each table is named by joining its dimension names
87
+ with underscores, e.g. `era5.time_latitude_longitude` and
88
+ `era5.time_level_latitude_longitude`.
89
+
90
+ A runnable version of this example lives at
91
+ [`perf_tests/era5_temp_profile.py`](../perf_tests/era5_temp_profile.py).
92
+
93
+ [arco-era5]: https://github.com/google-research/arco-era5
@@ -64,11 +64,13 @@ module-name = "xarray_sql._native"
64
64
  [tool.setuptools.packages.find]
65
65
  exclude = ["demo", "perf_tests", "tests", "tests.*"]
66
66
 
67
- [tool.pyink]
67
+ [tool.ruff]
68
68
  line-length = 80
69
- preview = true
70
- pyink-indentation = 2
71
- pyink-use-majority-quotes = true
69
+ indent-width = 4
70
+
71
+ [tool.ruff.format]
72
+ indent-style = "space"
73
+ quote-style = "double"
72
74
 
73
75
  [tool.mypy]
74
76
  python_version = "3.11"
@@ -98,7 +100,7 @@ dev = [
98
100
  "xarray_sql[test]",
99
101
  "xarray_sql[docs]",
100
102
  "py-spy>=0.4.0",
101
- "pyink>=24.10.1",
103
+ "ruff>=0.15.10",
102
104
  "maturin>=1.9.1",
103
105
  ]
104
106
 
@@ -0,0 +1,150 @@
1
+ import pytest
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import xarray as xr
6
+
7
+
8
+ def rand_wx(start: str, end: str) -> xr.Dataset:
9
+ np.random.seed(42)
10
+ lat = np.linspace(-90, 90, num=720)
11
+ lon = np.linspace(-180, 180, num=1440)
12
+ time = pd.date_range(start, end, freq="h")
13
+ level = np.array([1000, 500], dtype=np.int32)
14
+ reference_time = pd.Timestamp(start)
15
+ temperature = 15 + 8 * np.random.randn(720, 1440, len(time), len(level))
16
+ precipitation = 10 * np.random.rand(720, 1440, len(time), len(level))
17
+ return xr.Dataset(
18
+ data_vars=dict(
19
+ temperature=(["lat", "lon", "time", "level"], temperature),
20
+ precipitation=(["lat", "lon", "time", "level"], precipitation),
21
+ ),
22
+ coords=dict(
23
+ lat=lat,
24
+ lon=lon,
25
+ time=time,
26
+ level=level,
27
+ reference_time=reference_time,
28
+ ),
29
+ attrs=dict(description="Random weather."),
30
+ )
31
+
32
+
33
+ def create_large_dataset(time_steps=1000, lat_points=100, lon_points=100):
34
+ """Create a large xarray dataset for memory testing."""
35
+ np.random.seed(42)
36
+
37
+ time = pd.date_range("2020-01-01", periods=time_steps, freq="h")
38
+ lat = np.linspace(-90, 90, lat_points)
39
+ lon = np.linspace(-180, 180, lon_points)
40
+
41
+ temp_data = np.random.rand(time_steps, lat_points, lon_points) * 40 - 10
42
+ precip_data = np.random.rand(time_steps, lat_points, lon_points) * 100
43
+
44
+ return xr.Dataset(
45
+ {
46
+ "temperature": (["time", "lat", "lon"], temp_data),
47
+ "precipitation": (["time", "lat", "lon"], precip_data),
48
+ },
49
+ coords={"time": time, "lat": lat, "lon": lon},
50
+ )
51
+
52
+
53
+ @pytest.fixture
54
+ def air():
55
+ ds = xr.tutorial.open_dataset("air_temperature")
56
+ chunks = {"time": 240}
57
+ return ds.chunk(chunks)
58
+
59
+
60
+ @pytest.fixture
61
+ def air_small(air):
62
+ return air.isel(
63
+ time=slice(0, 12), lat=slice(0, 11), lon=slice(0, 10)
64
+ ).chunk({"time": 240})
65
+
66
+
67
+ @pytest.fixture
68
+ def randwx():
69
+ return rand_wx("1995-01-13T00", "1995-01-13T01")
70
+
71
+
72
+ @pytest.fixture
73
+ def large_ds():
74
+ return create_large_dataset().chunk({"time": 25})
75
+
76
+
77
+ @pytest.fixture
78
+ def air_dataset_small():
79
+ ds = xr.tutorial.open_dataset("air_temperature").chunk({"time": 240})
80
+ return ds.isel(time=slice(0, 12), lat=slice(0, 11), lon=slice(0, 10))
81
+
82
+
83
+ @pytest.fixture
84
+ def air_dataset_large():
85
+ return xr.tutorial.open_dataset("air_temperature").chunk({"time": 240})
86
+
87
+
88
+ @pytest.fixture
89
+ def rasm_ds():
90
+ """rasm uses cftime.DatetimeNoLeap (noleap / 365_day) for time."""
91
+ return xr.tutorial.open_dataset("rasm")
92
+
93
+
94
+ @pytest.fixture
95
+ def weather_dataset():
96
+ ds = rand_wx("2023-01-01T00", "2023-01-01T12")
97
+ return ds.isel(time=slice(0, 6), lat=slice(0, 10), lon=slice(0, 10)).chunk(
98
+ {"time": 3}
99
+ )
100
+
101
+
102
+ @pytest.fixture
103
+ def synthetic_dataset():
104
+ return create_large_dataset(
105
+ time_steps=50, lat_points=20, lon_points=20
106
+ ).chunk({"time": 25})
107
+
108
+
109
+ @pytest.fixture
110
+ def station_dataset():
111
+ return xr.Dataset(
112
+ {
113
+ "station_id": (["station"], [1, 2, 3, 4, 5]),
114
+ "elevation": (["station"], [100, 250, 500, 750, 1000]),
115
+ "name": (
116
+ ["station"],
117
+ [
118
+ "Station_A",
119
+ "Station_B",
120
+ "Station_C",
121
+ "Station_D",
122
+ "Station_E",
123
+ ],
124
+ ),
125
+ }
126
+ ).chunk({"station": 5})
127
+
128
+
129
+ @pytest.fixture
130
+ def air_and_stations():
131
+ air = (
132
+ xr.tutorial.open_dataset("air_temperature")
133
+ .isel(time=slice(0, 12), lat=slice(0, 5), lon=slice(0, 8))
134
+ .chunk({"time": 6})
135
+ )
136
+ stations = xr.Dataset(
137
+ {
138
+ "station_id": (["station"], [101, 102, 103]),
139
+ "lat": (
140
+ ["station"],
141
+ [air.lat.values[0], air.lat.values[2], air.lat.values[4]],
142
+ ),
143
+ "lon": (
144
+ ["station"],
145
+ [air.lon.values[1], air.lon.values[3], air.lon.values[5]],
146
+ ),
147
+ "elevation": (["station"], [100, 250, 500]),
148
+ }
149
+ ).chunk({"station": 3})
150
+ return air, stations