xarray_sql 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xarray_sql-0.2.1 → xarray_sql-0.2.3}/.gitignore +2 -0
- {xarray_sql-0.2.1 → xarray_sql-0.2.3}/Cargo.lock +1 -1
- {xarray_sql-0.2.1 → xarray_sql-0.2.3}/Cargo.toml +1 -1
- {xarray_sql-0.2.1 → xarray_sql-0.2.3}/PKG-INFO +112 -58
- {xarray_sql-0.2.1 → xarray_sql-0.2.3}/README.md +102 -57
- xarray_sql-0.2.3/docs/assets/logo.svg +104 -0
- xarray_sql-0.2.3/docs/contributing.md +1 -0
- xarray_sql-0.2.3/docs/examples.md +93 -0
- xarray_sql-0.2.3/docs/index.md +1 -0
- xarray_sql-0.2.3/docs/reference/xarray_sql.md +8 -0
- {xarray_sql-0.2.1 → xarray_sql-0.2.3}/pyproject.toml +23 -6
- xarray_sql-0.2.3/tests/__init__.py +0 -0
- xarray_sql-0.2.3/tests/conftest.py +150 -0
- xarray_sql-0.2.3/tests/test_cft.py +170 -0
- xarray_sql-0.2.3/tests/test_df.py +443 -0
- xarray_sql-0.2.3/tests/test_reader.py +1377 -0
- xarray_sql-0.2.3/tests/test_sql.py +439 -0
- {xarray_sql-0.2.1 → xarray_sql-0.2.3}/xarray_sql/__init__.py +2 -0
- xarray_sql-0.2.3/xarray_sql/cftime.py +248 -0
- xarray_sql-0.2.3/xarray_sql/core.py +49 -0
- xarray_sql-0.2.3/xarray_sql/df.py +447 -0
- xarray_sql-0.2.3/xarray_sql/reader.py +305 -0
- xarray_sql-0.2.3/xarray_sql/sql.py +129 -0
- xarray_sql-0.2.3/zensical.toml +122 -0
- xarray_sql-0.2.1/xarray_sql/core.py +0 -49
- xarray_sql-0.2.1/xarray_sql/df.py +0 -378
- xarray_sql-0.2.1/xarray_sql/df_test.py +0 -489
- xarray_sql-0.2.1/xarray_sql/reader.py +0 -298
- xarray_sql-0.2.1/xarray_sql/reader_test.py +0 -1372
- xarray_sql-0.2.1/xarray_sql/sql.py +0 -18
- xarray_sql-0.2.1/xarray_sql/sql_test.py +0 -194
- {xarray_sql-0.2.1 → xarray_sql-0.2.3}/LICENSE +0 -0
- {xarray_sql-0.2.1 → xarray_sql-0.2.3}/src/lib.rs +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xarray_sql
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Classifier: Development Status :: 4 - Beta
|
|
5
5
|
Classifier: Intended Audience :: Science/Research
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -19,9 +19,18 @@ Classifier: Topic :: Database :: Front-Ends
|
|
|
19
19
|
Requires-Dist: dask>=2024.8.0
|
|
20
20
|
Requires-Dist: datafusion==52.0.0
|
|
21
21
|
Requires-Dist: xarray>=2024.7.0
|
|
22
|
+
Requires-Dist: xarray-sql[docs] ; extra == 'dev'
|
|
23
|
+
Requires-Dist: pre-commit ; extra == 'dev'
|
|
24
|
+
Requires-Dist: pytest ; extra == 'dev'
|
|
25
|
+
Requires-Dist: watchfiles ; extra == 'dev'
|
|
26
|
+
Requires-Dist: zensical ; extra == 'docs'
|
|
27
|
+
Requires-Dist: mkdocstrings[python] ; extra == 'docs'
|
|
28
|
+
Requires-Dist: cftime ; extra == 'test'
|
|
22
29
|
Requires-Dist: pytest ; extra == 'test'
|
|
23
30
|
Requires-Dist: xarray[io] ; extra == 'test'
|
|
24
31
|
Requires-Dist: gcsfs ; extra == 'test'
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Provides-Extra: docs
|
|
25
34
|
Provides-Extra: test
|
|
26
35
|
License-File: LICENSE
|
|
27
36
|
Summary: Query Xarray with SQL.
|
|
@@ -53,52 +62,97 @@ This is an experiment to provide a SQL interface for array datasets.
|
|
|
53
62
|
import xarray as xr
|
|
54
63
|
import xarray_sql as xql
|
|
55
64
|
|
|
56
|
-
ds = xr.tutorial.open_dataset('air_temperature')
|
|
57
65
|
|
|
58
|
-
#
|
|
66
|
+
# Open a year of ARCO-ERA5 — all 273 variables. Selecting a year up front
|
|
67
|
+
# keeps Dask's partition setup cheap before any chunks are read from GCS.
|
|
68
|
+
ds = (
|
|
69
|
+
xr.open_zarr('gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
|
|
70
|
+
chunks=dict(time=1),
|
|
71
|
+
storage_options={'token': 'anon'}) # Anonymous read from the public GCS bucket — no auth required.
|
|
72
|
+
.sel(time='2020')
|
|
73
|
+
)
|
|
74
|
+
|
|
59
75
|
ctx = xql.XarrayContext()
|
|
60
|
-
ctx.from_dataset('
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
#
|
|
72
|
-
|
|
73
|
-
#
|
|
74
|
-
#
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
#
|
|
84
|
-
#
|
|
85
|
-
|
|
86
|
-
#
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
#
|
|
96
|
-
#
|
|
97
|
-
#
|
|
76
|
+
ctx.from_dataset('era5', ds, table_names={
|
|
77
|
+
('time', 'latitude', 'longitude'): 'surface',
|
|
78
|
+
('time', 'level', 'latitude', 'longitude'): 'atmosphere',
|
|
79
|
+
})
|
|
80
|
+
# Registration: ~0.5s for a full year of hourly ERA5, all variables.
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
|
|
84
|
+
# pushes column projection down to Zarr, so SELECT only fetches what you ask
|
|
85
|
+
# for — but `SELECT * FROM era5.surface` would try to pull every variable
|
|
86
|
+
# across the year (terabytes from GCS).
|
|
87
|
+
# ---> Always SELECT specific columns. <---
|
|
88
|
+
|
|
89
|
+
# Average 2m-temperature over NYC on the morning of 2020-01-01. The library
|
|
90
|
+
# pushes WHERE clauses on dimension columns down to partition pruning.
|
|
91
|
+
ctx.sql('''
|
|
92
|
+
SELECT AVG("2m_temperature") - 273.15 AS avg_c
|
|
93
|
+
FROM era5.surface
|
|
94
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
95
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
96
|
+
AND latitude BETWEEN 39 AND 40
|
|
97
|
+
AND longitude BETWEEN 286 AND 287 -- ERA5 uses 0-360 longitudes
|
|
98
|
+
''').to_pandas()
|
|
99
|
+
# avg_c
|
|
100
|
+
# 0 8.640069
|
|
101
|
+
|
|
102
|
+
# Average temperature per pressure level, globally.
|
|
103
|
+
ctx.sql('''
|
|
104
|
+
SELECT level, AVG(temperature) - 273.15 AS avg_c
|
|
105
|
+
FROM era5.atmosphere
|
|
106
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
107
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
108
|
+
GROUP BY level
|
|
109
|
+
ORDER BY level DESC
|
|
110
|
+
''').to_pandas()
|
|
111
|
+
# level avg_c
|
|
112
|
+
# 0 1000 6.621012 ← surface
|
|
113
|
+
# 1 975 5.185638
|
|
114
|
+
# 2 950 4.028429
|
|
115
|
+
# 3 925 3.082812
|
|
116
|
+
# 4 900 2.210917
|
|
117
|
+
# 5 875 1.395018
|
|
118
|
+
# 6 850 0.634267
|
|
119
|
+
# 7 825 -0.210372
|
|
120
|
+
# 8 800 -1.181075
|
|
121
|
+
# 9 775 -2.306465
|
|
122
|
+
# 10 750 -3.535534
|
|
123
|
+
# 11 700 -6.241685
|
|
124
|
+
# 12 650 -9.236364
|
|
125
|
+
# 13 600 -12.580938
|
|
126
|
+
# 14 550 -16.335386
|
|
127
|
+
# 15 500 -20.643604
|
|
128
|
+
# 16 450 -25.573401
|
|
129
|
+
# 17 400 -31.156920
|
|
130
|
+
# 18 350 -37.400552
|
|
131
|
+
# 19 300 -43.852607
|
|
132
|
+
# 20 250 -49.322132
|
|
133
|
+
# 21 225 -51.569113
|
|
134
|
+
# 22 200 -53.693248
|
|
135
|
+
# 23 175 -55.890484
|
|
136
|
+
# 24 150 -58.382290
|
|
137
|
+
# 25 125 -61.091916
|
|
138
|
+
# 26 100 -63.624885 ← tropopause
|
|
139
|
+
# 27 70 -63.182300
|
|
140
|
+
# 28 50 -60.124845
|
|
141
|
+
# 29 30 -55.986327
|
|
142
|
+
# 30 20 -52.433089
|
|
143
|
+
# 31 10 -44.140750
|
|
144
|
+
# 32 7 -38.707350
|
|
145
|
+
# 33 5 -32.621999
|
|
146
|
+
# 34 3 -21.509175
|
|
147
|
+
# 35 2 -13.355764
|
|
148
|
+
# 36 1 -9.020513 ← top of atmosphere
|
|
98
149
|
```
|
|
99
150
|
|
|
100
|
-
|
|
101
|
-
|
|
151
|
+
_(A runnable version of this example lives at
|
|
152
|
+
[`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
|
|
153
|
+
|
|
154
|
+
Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
|
|
155
|
+
SQL queries against them.
|
|
102
156
|
|
|
103
157
|
## Why build this?
|
|
104
158
|
|
|
@@ -128,11 +182,11 @@ That's it!
|
|
|
128
182
|
_2025 update_: This library now implements a Dask-like `from_map` interface in
|
|
129
183
|
pure DataFusion and PyArrow, but works with the same principle!
|
|
130
184
|
|
|
131
|
-
_2026 update_: Instead of `from_map()`, we
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
185
|
+
_2026 update_: Instead of `from_map()`, we create a way to translate Xarray chunks
|
|
186
|
+
into Arrow RecordBatches. We pass a Python callback into a DataFusion `TableProvider`
|
|
187
|
+
that lets the DB engine translate the underlying Dataset arrays into DataFusion partitions.
|
|
188
|
+
Ultimately, the initial insight of the `pivot()` function -- that any ndarray can be
|
|
189
|
+
translated into a 2D table -- underlies this performant query mechanism.
|
|
136
190
|
|
|
137
191
|
## Why does this work?
|
|
138
192
|
|
|
@@ -150,11 +204,6 @@ early users – "tire kickers", if you will. We'd love your input to shape the d
|
|
|
150
204
|
project! Please, give this a try and [file issues](https://github.com/alxmrs/xarray-sql/issues) as
|
|
151
205
|
you see fit. Check out our [contributing guide](CONTRIBUTING.md), too 😉.
|
|
152
206
|
|
|
153
|
-
I can say that for now, the library is oriented towards making whole scans of
|
|
154
|
-
Xarray Datasets. Common filter optimizations (even basic ones like an `.sel()` on
|
|
155
|
-
core dimensions, let alone predicate push downs) are not fully implemented yet.
|
|
156
|
-
However, these operations and more are on our roadmap.
|
|
157
|
-
|
|
158
207
|
## What would a deeper integration look like?
|
|
159
208
|
|
|
160
209
|
I have a few ideas so far. One approach involves applying operations directly on
|
|
@@ -169,18 +218,21 @@ and BigQuery. More thoughts on this
|
|
|
169
218
|
in [#4](https://github.com/alxmrs/xarray-sql/issues/4).
|
|
170
219
|
|
|
171
220
|
_2025 update_: Something like this is being built across a few projects! The ones I know about are:
|
|
221
|
+
|
|
172
222
|
- [CartoDB's Raquet](https://github.com/CartoDB/raquet)
|
|
173
223
|
- The DataFusion community's [arrow-zarr](https://github.com/datafusion-contrib/arrow-zarr)
|
|
174
224
|
|
|
175
|
-
|
|
176
|
-
|
|
225
|
+
_2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
|
|
226
|
+
|
|
227
|
+
- [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
|
|
228
|
+
- [DuckDB-Zarr](https://github.com/hobbes-bot/duckdb-zarr)
|
|
177
229
|
|
|
178
230
|
## Roadmap
|
|
179
231
|
|
|
180
232
|
- [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
|
|
181
|
-
- [
|
|
182
|
-
- [
|
|
183
|
-
- [ ] Translate a single Zarr to a collection of tables
|
|
233
|
+
- [x] Support proper parallelism via partition handling on the Rust/DataFusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
|
|
234
|
+
- [x] Support core DataFusion optimizations to scan less data, like [#104](https://github.com/alxmrs/xarray-sql/issues/104), ...
|
|
235
|
+
- [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
|
|
184
236
|
- [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
|
|
185
237
|
- [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
|
|
186
238
|
- [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
|
|
@@ -200,6 +252,8 @@ I want to give a special thanks to the following folks and institutions:
|
|
|
200
252
|
and DataFusion-specific collaboration.
|
|
201
253
|
- The gracious volunteer data science students at [UCSD's DS3](https://www.ds3atucsd.com/) org,
|
|
202
254
|
who are working to make this library better.
|
|
255
|
+
- Andrew Huang for the sense of taste he brings to the project and consummate code
|
|
256
|
+
changes.
|
|
203
257
|
|
|
204
258
|
|
|
205
259
|
## License
|
|
@@ -19,52 +19,97 @@ This is an experiment to provide a SQL interface for array datasets.
|
|
|
19
19
|
import xarray as xr
|
|
20
20
|
import xarray_sql as xql
|
|
21
21
|
|
|
22
|
-
ds = xr.tutorial.open_dataset('air_temperature')
|
|
23
22
|
|
|
24
|
-
#
|
|
23
|
+
# Open a year of ARCO-ERA5 — all 273 variables. Selecting a year up front
|
|
24
|
+
# keeps Dask's partition setup cheap before any chunks are read from GCS.
|
|
25
|
+
ds = (
|
|
26
|
+
xr.open_zarr('gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
|
|
27
|
+
chunks=dict(time=1),
|
|
28
|
+
storage_options={'token': 'anon'}) # Anonymous read from the public GCS bucket — no auth required.
|
|
29
|
+
.sel(time='2020')
|
|
30
|
+
)
|
|
31
|
+
|
|
25
32
|
ctx = xql.XarrayContext()
|
|
26
|
-
ctx.from_dataset('
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
#
|
|
38
|
-
|
|
39
|
-
#
|
|
40
|
-
#
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
#
|
|
50
|
-
#
|
|
51
|
-
|
|
52
|
-
#
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
#
|
|
62
|
-
#
|
|
63
|
-
#
|
|
33
|
+
ctx.from_dataset('era5', ds, table_names={
|
|
34
|
+
('time', 'latitude', 'longitude'): 'surface',
|
|
35
|
+
('time', 'level', 'latitude', 'longitude'): 'atmosphere',
|
|
36
|
+
})
|
|
37
|
+
# Registration: ~0.5s for a full year of hourly ERA5, all variables.
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
|
|
41
|
+
# pushes column projection down to Zarr, so SELECT only fetches what you ask
|
|
42
|
+
# for — but `SELECT * FROM era5.surface` would try to pull every variable
|
|
43
|
+
# across the year (terabytes from GCS).
|
|
44
|
+
# ---> Always SELECT specific columns. <---
|
|
45
|
+
|
|
46
|
+
# Average 2m-temperature over NYC on the morning of 2020-01-01. The library
|
|
47
|
+
# pushes WHERE clauses on dimension columns down to partition pruning.
|
|
48
|
+
ctx.sql('''
|
|
49
|
+
SELECT AVG("2m_temperature") - 273.15 AS avg_c
|
|
50
|
+
FROM era5.surface
|
|
51
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
52
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
53
|
+
AND latitude BETWEEN 39 AND 40
|
|
54
|
+
AND longitude BETWEEN 286 AND 287 -- ERA5 uses 0-360 longitudes
|
|
55
|
+
''').to_pandas()
|
|
56
|
+
# avg_c
|
|
57
|
+
# 0 8.640069
|
|
58
|
+
|
|
59
|
+
# Average temperature per pressure level, globally.
|
|
60
|
+
ctx.sql('''
|
|
61
|
+
SELECT level, AVG(temperature) - 273.15 AS avg_c
|
|
62
|
+
FROM era5.atmosphere
|
|
63
|
+
WHERE time BETWEEN TIMESTAMP '2020-01-01'
|
|
64
|
+
AND TIMESTAMP '2020-01-01 05:00:00'
|
|
65
|
+
GROUP BY level
|
|
66
|
+
ORDER BY level DESC
|
|
67
|
+
''').to_pandas()
|
|
68
|
+
# level avg_c
|
|
69
|
+
# 0 1000 6.621012 ← surface
|
|
70
|
+
# 1 975 5.185638
|
|
71
|
+
# 2 950 4.028429
|
|
72
|
+
# 3 925 3.082812
|
|
73
|
+
# 4 900 2.210917
|
|
74
|
+
# 5 875 1.395018
|
|
75
|
+
# 6 850 0.634267
|
|
76
|
+
# 7 825 -0.210372
|
|
77
|
+
# 8 800 -1.181075
|
|
78
|
+
# 9 775 -2.306465
|
|
79
|
+
# 10 750 -3.535534
|
|
80
|
+
# 11 700 -6.241685
|
|
81
|
+
# 12 650 -9.236364
|
|
82
|
+
# 13 600 -12.580938
|
|
83
|
+
# 14 550 -16.335386
|
|
84
|
+
# 15 500 -20.643604
|
|
85
|
+
# 16 450 -25.573401
|
|
86
|
+
# 17 400 -31.156920
|
|
87
|
+
# 18 350 -37.400552
|
|
88
|
+
# 19 300 -43.852607
|
|
89
|
+
# 20 250 -49.322132
|
|
90
|
+
# 21 225 -51.569113
|
|
91
|
+
# 22 200 -53.693248
|
|
92
|
+
# 23 175 -55.890484
|
|
93
|
+
# 24 150 -58.382290
|
|
94
|
+
# 25 125 -61.091916
|
|
95
|
+
# 26 100 -63.624885 ← tropopause
|
|
96
|
+
# 27 70 -63.182300
|
|
97
|
+
# 28 50 -60.124845
|
|
98
|
+
# 29 30 -55.986327
|
|
99
|
+
# 30 20 -52.433089
|
|
100
|
+
# 31 10 -44.140750
|
|
101
|
+
# 32 7 -38.707350
|
|
102
|
+
# 33 5 -32.621999
|
|
103
|
+
# 34 3 -21.509175
|
|
104
|
+
# 35 2 -13.355764
|
|
105
|
+
# 36 1 -9.020513 ← top of atmosphere
|
|
64
106
|
```
|
|
65
107
|
|
|
66
|
-
|
|
67
|
-
|
|
108
|
+
_(A runnable version of this example lives at
|
|
109
|
+
[`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
|
|
110
|
+
|
|
111
|
+
Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
|
|
112
|
+
SQL queries against them.
|
|
68
113
|
|
|
69
114
|
## Why build this?
|
|
70
115
|
|
|
@@ -94,11 +139,11 @@ That's it!
|
|
|
94
139
|
_2025 update_: This library now implements a Dask-like `from_map` interface in
|
|
95
140
|
pure DataFusion and PyArrow, but works with the same principle!
|
|
96
141
|
|
|
97
|
-
_2026 update_: Instead of `from_map()`, we
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
142
|
+
_2026 update_: Instead of `from_map()`, we create a way to translate Xarray chunks
|
|
143
|
+
into Arrow RecordBatches. We pass a Python callback into a DataFusion `TableProvider`
|
|
144
|
+
that lets the DB engine translate the underlying Dataset arrays into DataFusion partitions.
|
|
145
|
+
Ultimately, the initial insight of the `pivot()` function -- that any ndarray can be
|
|
146
|
+
translated into a 2D table -- underlies this performant query mechanism.
|
|
102
147
|
|
|
103
148
|
## Why does this work?
|
|
104
149
|
|
|
@@ -116,11 +161,6 @@ early users – "tire kickers", if you will. We'd love your input to shape the d
|
|
|
116
161
|
project! Please, give this a try and [file issues](https://github.com/alxmrs/xarray-sql/issues) as
|
|
117
162
|
you see fit. Check out our [contributing guide](CONTRIBUTING.md), too 😉.
|
|
118
163
|
|
|
119
|
-
I can say that for now, the library is oriented towards making whole scans of
|
|
120
|
-
Xarray Datasets. Common filter optimizations (even basic ones like an `.sel()` on
|
|
121
|
-
core dimensions, let alone predicate push downs) are not fully implemented yet.
|
|
122
|
-
However, these operations and more are on our roadmap.
|
|
123
|
-
|
|
124
164
|
## What would a deeper integration look like?
|
|
125
165
|
|
|
126
166
|
I have a few ideas so far. One approach involves applying operations directly on
|
|
@@ -135,18 +175,21 @@ and BigQuery. More thoughts on this
|
|
|
135
175
|
in [#4](https://github.com/alxmrs/xarray-sql/issues/4).
|
|
136
176
|
|
|
137
177
|
_2025 update_: Something like this is being built across a few projects! The ones I know about are:
|
|
178
|
+
|
|
138
179
|
- [CartoDB's Raquet](https://github.com/CartoDB/raquet)
|
|
139
180
|
- The DataFusion community's [arrow-zarr](https://github.com/datafusion-contrib/arrow-zarr)
|
|
140
181
|
|
|
141
|
-
|
|
142
|
-
|
|
182
|
+
_2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
|
|
183
|
+
|
|
184
|
+
- [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
|
|
185
|
+
- [DuckDB-Zarr](https://github.com/hobbes-bot/duckdb-zarr)
|
|
143
186
|
|
|
144
187
|
## Roadmap
|
|
145
188
|
|
|
146
189
|
- [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
|
|
147
|
-
- [
|
|
148
|
-
- [
|
|
149
|
-
- [ ] Translate a single Zarr to a collection of tables
|
|
190
|
+
- [x] Support proper parallelism via partition handling on the Rust/DataFusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
|
|
191
|
+
- [x] Support core DataFusion optimizations to scan less data, like [#104](https://github.com/alxmrs/xarray-sql/issues/104), ...
|
|
192
|
+
- [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
|
|
150
193
|
- [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
|
|
151
194
|
- [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
|
|
152
195
|
- [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
|
|
@@ -166,6 +209,8 @@ I want to give a special thanks to the following folks and institutions:
|
|
|
166
209
|
and DataFusion-specific collaboration.
|
|
167
210
|
- The gracious volunteer data science students at [UCSD's DS3](https://www.ds3atucsd.com/) org,
|
|
168
211
|
who are working to make this library better.
|
|
212
|
+
- Andrew Huang for the sense of taste he brings to the project and consummate code
|
|
213
|
+
changes.
|
|
169
214
|
|
|
170
215
|
|
|
171
216
|
## License
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
|
2
|
+
<svg
|
|
3
|
+
version="1.1"
|
|
4
|
+
x="0px"
|
|
5
|
+
y="0px"
|
|
6
|
+
viewBox="50 115 420 395"
|
|
7
|
+
xml:space="preserve"
|
|
8
|
+
id="svg6"
|
|
9
|
+
sodipodi:docname="logo.svg"
|
|
10
|
+
inkscape:version="1.3.2 (091e20e, 2023-11-25)"
|
|
11
|
+
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
|
12
|
+
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
|
13
|
+
xmlns="http://www.w3.org/2000/svg"
|
|
14
|
+
xmlns:svg="http://www.w3.org/2000/svg"><defs
|
|
15
|
+
id="defs6" /><sodipodi:namedview
|
|
16
|
+
id="namedview6"
|
|
17
|
+
pagecolor="#ffffff"
|
|
18
|
+
bordercolor="#000000"
|
|
19
|
+
borderopacity="0.25"
|
|
20
|
+
inkscape:showpageshadow="2"
|
|
21
|
+
inkscape:pageopacity="0.0"
|
|
22
|
+
inkscape:pagecheckerboard="0"
|
|
23
|
+
inkscape:deskcolor="#d1d1d1"
|
|
24
|
+
showguides="false"
|
|
25
|
+
inkscape:zoom="0.59746835"
|
|
26
|
+
inkscape:cx="76.154661"
|
|
27
|
+
inkscape:cy="63.601695"
|
|
28
|
+
inkscape:window-width="1320"
|
|
29
|
+
inkscape:window-height="905"
|
|
30
|
+
inkscape:window-x="181"
|
|
31
|
+
inkscape:window-y="34"
|
|
32
|
+
inkscape:window-maximized="0"
|
|
33
|
+
inkscape:current-layer="g6" />
|
|
34
|
+
<style
|
|
35
|
+
type="text/css"
|
|
36
|
+
id="style1">
|
|
37
|
+
.st0{fill:#216C89;}
|
|
38
|
+
.st1{fill:#4993AA;}
|
|
39
|
+
.st2{fill:#0F4565;}
|
|
40
|
+
.st3{fill:#6BE8E8;}
|
|
41
|
+
.st4{fill:#9DEEF4;}
|
|
42
|
+
.st5{fill:#4ACFDD;}
|
|
43
|
+
.st6{fill:#E38017;}
|
|
44
|
+
.st7{fill:#16AFB5;}
|
|
45
|
+
</style>
|
|
46
|
+
<g
|
|
47
|
+
id="g6">
|
|
48
|
+
<!-- BOTTOM DATABASE (dark) -->
|
|
49
|
+
<path
|
|
50
|
+
class="st2"
|
|
51
|
+
d="m 65,362.92813 c 0,0 0,96.42499 0,96.42499 0,17.9075 70,27.55 112,27.55 42,0 112,-9.6425 112,-27.55 v -96.42499 c 0,17.90749 -70,27.54999 -112,27.54999 -42,0 -112,-9.6425 -112,-27.54999 z"
|
|
52
|
+
id="path1"
|
|
53
|
+
style="stroke-width:0.82991" />
|
|
54
|
+
<path
|
|
55
|
+
class="st0"
|
|
56
|
+
d="m 177,390.47812 c 42,0 112,-9.6425 112,-27.54999 v 96.42499 c 0,17.9075 -70,27.55 -112,27.55 z"
|
|
57
|
+
opacity="0.4"
|
|
58
|
+
id="path2"
|
|
59
|
+
style="stroke-width:0.82991" />
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
<!-- TOP DATABASE (teal) -->
|
|
64
|
+
<path
|
|
65
|
+
class="st5"
|
|
66
|
+
d="m 65,276.83438 c 0,0 0,75.76249 0,75.76249 0,17.90751 70,27.55001 112,27.55001 42,0 112,-9.6425 112,-27.55001 v -75.76249 c 0,17.90749 -70,27.54999 -112,27.54999 -42,0 -112,-9.6425 -112,-27.54999 z"
|
|
67
|
+
id="path3"
|
|
68
|
+
style="stroke-width:0.82991" />
|
|
69
|
+
<path
|
|
70
|
+
class="st3"
|
|
71
|
+
d="m 177,304.38437 c 42,0 112,-9.6425 112,-27.54999 v 75.76249 c 0,17.90751 -70,27.55001 -112,27.55001 z"
|
|
72
|
+
opacity="0.4"
|
|
73
|
+
id="path4"
|
|
74
|
+
style="stroke-width:0.82991" />
|
|
75
|
+
<path
|
|
76
|
+
class="st5"
|
|
77
|
+
d="m 66.576,190.40937 c 0,0 0,75.7625 0,75.7625 0,17.90751 70,27.55 112,27.55 42,0 112,-9.64249 112,-27.55 v -75.7625 c 0,17.9075 -70,27.55 -112,27.55 -42,0 -112,-9.6425 -112,-27.55 z"
|
|
78
|
+
id="path3-5"
|
|
79
|
+
style="fill:#4acfdd;stroke-width:0.82991" /><path
|
|
80
|
+
class="st3"
|
|
81
|
+
d="m 178.576,217.95937 c 42,0 112,-9.6425 112,-27.55 v 75.7625 c 0,17.90751 -70,27.55 -112,27.55 z"
|
|
82
|
+
opacity="0.4"
|
|
83
|
+
id="path4-7"
|
|
84
|
+
style="fill:#6be8e8;stroke-width:0.82991" /><ellipse
|
|
85
|
+
class="st4"
|
|
86
|
+
cx="178.576"
|
|
87
|
+
cy="183.52188"
|
|
88
|
+
rx="112"
|
|
89
|
+
ry="27.549999"
|
|
90
|
+
id="ellipse4-5"
|
|
91
|
+
style="fill:#9deef4;stroke-width:0.82991" />
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
<!-- DIAGONAL BARS -->
|
|
95
|
+
<polygon
|
|
96
|
+
class="st6"
|
|
97
|
+
points="377.48,412.74 308.66,482.2 308.66,346.56 377.48,277.09"
|
|
98
|
+
id="polygon5" />
|
|
99
|
+
<polygon
|
|
100
|
+
class="st7"
|
|
101
|
+
points="457.07,412.74 388.25,482.2 388.25,346.56 457.07,277.09"
|
|
102
|
+
id="polygon6" />
|
|
103
|
+
</g>
|
|
104
|
+
</svg>
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
--8<-- "CONTRIBUTING.md"
|