xarray_sql 0.2.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {xarray_sql-0.2.1 → xarray_sql-0.2.3}/.gitignore +2 -0
  2. {xarray_sql-0.2.1 → xarray_sql-0.2.3}/Cargo.lock +1 -1
  3. {xarray_sql-0.2.1 → xarray_sql-0.2.3}/Cargo.toml +1 -1
  4. {xarray_sql-0.2.1 → xarray_sql-0.2.3}/PKG-INFO +112 -58
  5. {xarray_sql-0.2.1 → xarray_sql-0.2.3}/README.md +102 -57
  6. xarray_sql-0.2.3/docs/assets/logo.svg +104 -0
  7. xarray_sql-0.2.3/docs/contributing.md +1 -0
  8. xarray_sql-0.2.3/docs/examples.md +93 -0
  9. xarray_sql-0.2.3/docs/index.md +1 -0
  10. xarray_sql-0.2.3/docs/reference/xarray_sql.md +8 -0
  11. {xarray_sql-0.2.1 → xarray_sql-0.2.3}/pyproject.toml +23 -6
  12. xarray_sql-0.2.3/tests/__init__.py +0 -0
  13. xarray_sql-0.2.3/tests/conftest.py +150 -0
  14. xarray_sql-0.2.3/tests/test_cft.py +170 -0
  15. xarray_sql-0.2.3/tests/test_df.py +443 -0
  16. xarray_sql-0.2.3/tests/test_reader.py +1377 -0
  17. xarray_sql-0.2.3/tests/test_sql.py +439 -0
  18. {xarray_sql-0.2.1 → xarray_sql-0.2.3}/xarray_sql/__init__.py +2 -0
  19. xarray_sql-0.2.3/xarray_sql/cftime.py +248 -0
  20. xarray_sql-0.2.3/xarray_sql/core.py +49 -0
  21. xarray_sql-0.2.3/xarray_sql/df.py +447 -0
  22. xarray_sql-0.2.3/xarray_sql/reader.py +305 -0
  23. xarray_sql-0.2.3/xarray_sql/sql.py +129 -0
  24. xarray_sql-0.2.3/zensical.toml +122 -0
  25. xarray_sql-0.2.1/xarray_sql/core.py +0 -49
  26. xarray_sql-0.2.1/xarray_sql/df.py +0 -378
  27. xarray_sql-0.2.1/xarray_sql/df_test.py +0 -489
  28. xarray_sql-0.2.1/xarray_sql/reader.py +0 -298
  29. xarray_sql-0.2.1/xarray_sql/reader_test.py +0 -1372
  30. xarray_sql-0.2.1/xarray_sql/sql.py +0 -18
  31. xarray_sql-0.2.1/xarray_sql/sql_test.py +0 -194
  32. {xarray_sql-0.2.1 → xarray_sql-0.2.3}/LICENSE +0 -0
  33. {xarray_sql-0.2.1 → xarray_sql-0.2.3}/src/lib.rs +0 -0
@@ -12,3 +12,5 @@ test_data
12
12
  .chainlink
13
13
  .claude
14
14
  CHANGELOG.md
15
+ *.ipynb
16
+ /site
@@ -3375,7 +3375,7 @@ checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb"
3375
3375
 
3376
3376
  [[package]]
3377
3377
  name = "xarray_sql"
3378
- version = "0.2.1"
3378
+ version = "0.2.3"
3379
3379
  dependencies = [
3380
3380
  "arrow",
3381
3381
  "async-stream",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "xarray_sql"
3
- version = "0.2.1"
3
+ version = "0.2.3"
4
4
  authors = ["Alex Merose"]
5
5
  edition = "2021"
6
6
  exclude = [
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xarray_sql
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Classifier: Development Status :: 4 - Beta
5
5
  Classifier: Intended Audience :: Science/Research
6
6
  Classifier: Intended Audience :: Developers
@@ -19,9 +19,18 @@ Classifier: Topic :: Database :: Front-Ends
19
19
  Requires-Dist: dask>=2024.8.0
20
20
  Requires-Dist: datafusion==52.0.0
21
21
  Requires-Dist: xarray>=2024.7.0
22
+ Requires-Dist: xarray-sql[docs] ; extra == 'dev'
23
+ Requires-Dist: pre-commit ; extra == 'dev'
24
+ Requires-Dist: pytest ; extra == 'dev'
25
+ Requires-Dist: watchfiles ; extra == 'dev'
26
+ Requires-Dist: zensical ; extra == 'docs'
27
+ Requires-Dist: mkdocstrings[python] ; extra == 'docs'
28
+ Requires-Dist: cftime ; extra == 'test'
22
29
  Requires-Dist: pytest ; extra == 'test'
23
30
  Requires-Dist: xarray[io] ; extra == 'test'
24
31
  Requires-Dist: gcsfs ; extra == 'test'
32
+ Provides-Extra: dev
33
+ Provides-Extra: docs
25
34
  Provides-Extra: test
26
35
  License-File: LICENSE
27
36
  Summary: Query Xarray with SQL.
@@ -53,52 +62,97 @@ This is an experiment to provide a SQL interface for array datasets.
53
62
  import xarray as xr
54
63
  import xarray_sql as xql
55
64
 
56
- ds = xr.tutorial.open_dataset('air_temperature')
57
65
 
58
- # The same as a dask-sql Context; i.e. an Apache DataFusion Context.
66
+ # Open a year of ARCO-ERA5 (all 273 variables). Selecting a year up front
67
+ # keeps Dask's partition setup cheap before any chunks are read from GCS.
68
+ ds = (
69
+ xr.open_zarr('gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
70
+ chunks=dict(time=1),
71
+ storage_options={'token': 'anon'}) # Anonymous read from the public GCS bucket — no auth required.
72
+ .sel(time='2020')
73
+ )
74
+
59
75
  ctx = xql.XarrayContext()
60
- ctx.from_dataset('air', ds, chunks=dict(time=24)) # the dataset needs to be chunked!
61
- # data is only materialized when we make a query.
62
-
63
- result = ctx.sql('''
64
- SELECT
65
- "lat", "lon", AVG("air") as air_avg
66
- FROM
67
- "air"
68
- GROUP BY
69
- "lat", "lon"
70
- ''')
71
- # DataFrame()
72
- # +------+-------+--------------------+
73
- # | lat | lon | air_avg |
74
- # +------+-------+--------------------+
75
- # | 75.0 | 205.0 | 259.88662671232834 |
76
- # | 75.0 | 207.5 | 259.48268150684896 |
77
- # | 75.0 | 230.0 | 258.9192123287667 |
78
- # | 75.0 | 275.0 | 257.07574315068456 |
79
- # | 75.0 | 322.5 | 250.11792123287654 |
80
- # | 75.0 | 325.0 | 250.81590068493134 |
81
- # | 72.5 | 205.0 | 262.74933904109537 |
82
- # | 72.5 | 207.5 | 262.5384315068488 |
83
- # | 72.5 | 230.0 | 260.82879452054743 |
84
- # | 72.5 | 275.0 | 257.3063321917804 |
85
- # +------+-------+--------------------+
86
- # Data truncated.
87
-
88
- # The full query is only made when we call `collect()`, or, in this case,
89
- # `to_pandas()`.
90
- df = result.to_pandas()
91
- df.head()
92
- # lat lon air_avg
93
- # 0 75.0 232.5 258.836188
94
- # 1 75.0 247.5 257.716171
95
- # 2 75.0 262.5 257.347959
96
- # 3 75.0 277.5 257.671308
97
- # 4 72.5 232.5 260.654401
76
+ ctx.from_dataset('era5', ds, table_names={
77
+ ('time', 'latitude', 'longitude'): 'surface',
78
+ ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
79
+ })
80
+ # Registration: ~0.5s for a full year of hourly ERA5, all variables.
81
+
82
+
83
+ # Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
84
+ # pushes column projection down to Zarr, so SELECT only fetches what you ask
85
+ # for — but `SELECT * FROM era5.surface` would try to pull every variable
86
+ # across the year (terabytes from GCS).
87
+ # ---> Always SELECT specific columns. <---
88
+
89
+ # Average 2m-temperature over NYC on the morning of 2020-01-01. The library
90
+ # pushes WHERE clauses on dimension columns down to partition pruning.
91
+ ctx.sql('''
92
+ SELECT AVG("2m_temperature") - 273.15 AS avg_c
93
+ FROM era5.surface
94
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
95
+ AND TIMESTAMP '2020-01-01 05:00:00'
96
+ AND latitude BETWEEN 39 AND 40
97
+ AND longitude BETWEEN 286 AND 287 -- ERA5 uses 0-360 longitudes
98
+ ''').to_pandas()
99
+ # avg_c
100
+ # 0 8.640069
101
+
102
+ # Average temperature per pressure level, globally.
103
+ ctx.sql('''
104
+ SELECT level, AVG(temperature) - 273.15 AS avg_c
105
+ FROM era5.atmosphere
106
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
107
+ AND TIMESTAMP '2020-01-01 05:00:00'
108
+ GROUP BY level
109
+ ORDER BY level DESC
110
+ ''').to_pandas()
111
+ # level avg_c
112
+ # 0 1000 6.621012 ← surface
113
+ # 1 975 5.185638
114
+ # 2 950 4.028429
115
+ # 3 925 3.082812
116
+ # 4 900 2.210917
117
+ # 5 875 1.395018
118
+ # 6 850 0.634267
119
+ # 7 825 -0.210372
120
+ # 8 800 -1.181075
121
+ # 9 775 -2.306465
122
+ # 10 750 -3.535534
123
+ # 11 700 -6.241685
124
+ # 12 650 -9.236364
125
+ # 13 600 -12.580938
126
+ # 14 550 -16.335386
127
+ # 15 500 -20.643604
128
+ # 16 450 -25.573401
129
+ # 17 400 -31.156920
130
+ # 18 350 -37.400552
131
+ # 19 300 -43.852607
132
+ # 20 250 -49.322132
133
+ # 21 225 -51.569113
134
+ # 22 200 -53.693248
135
+ # 23 175 -55.890484
136
+ # 24 150 -58.382290
137
+ # 25 125 -61.091916
138
+ # 26 100 -63.624885 ← tropopause
139
+ # 27 70 -63.182300
140
+ # 28 50 -60.124845
141
+ # 29 30 -55.986327
142
+ # 30 20 -52.433089
143
+ # 31 10 -44.140750
144
+ # 32 7 -38.707350
145
+ # 33 5 -32.621999
146
+ # 34 3 -21.509175
147
+ # 35 2 -13.355764
148
+ # 36 1 -9.020513 ← top of atmosphere
98
149
  ```
99
150
 
100
- Succinctly, we "pivot" Xarray Datasets (with consistent dimensions) to treat them like tables so we can run
101
- SQL queries against them.
151
+ _(A runnable version of this example lives at
152
+ [`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
153
+
154
+ Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
155
+ SQL queries against them.
102
156
 
103
157
  ## Why build this?
104
158
 
@@ -128,11 +182,11 @@ That's it!
128
182
  _2025 update_: This library now implements a Dask-like `from_map` interface in
129
183
  pure DataFusion and PyArrow, but works with the same principle!
130
184
 
131
- _2026 update_: Instead of `from_map()`, we make factory functions from blocks of
132
- Xarray datasets that return RecordBatchReaders. These feed into a Rust-based
133
- DataFusion `TableProvider`. Every chunk is uses the Arrow in memory format to
134
- translate between Python and Rust. Even still, the core of what makes this idea
135
- work is the core `pivot()` operation from where this project began!
185
+ _2026 update_: Instead of `from_map()`, we create a way to translate Xarray chunks
186
+ into Arrow RecordBatches. We pass a Python callback into a DataFusion `TableProvider`
187
+ that lets the DB engine translate the underlying Dataset arrays into DataFusion partitions.
188
+ Ultimately, the initial insight of the `pivot()` function -- that any ndarray can be
189
+ translated into a 2D table -- underlies this performant query mechanism.
136
190
 
137
191
  ## Why does this work?
138
192
 
@@ -150,11 +204,6 @@ early users – "tire kickers", if you will. We'd love your input to shape the d
150
204
  project! Please, give this a try and [file issues](https://github.com/alxmrs/xarray-sql/issues) as
151
205
  you see fit. Check out our [contributing guide](CONTRIBUTING.md), too 😉.
152
206
 
153
- I can say that for now, the library is oriented towards making whole scans of
154
- Xarray Datasets. Common filter optimizations (even basic ones like an `.sel()` on
155
- core dimensions, let alone predicate push downs) are not fully implemented yet.
156
- However, these operations and more are on our roadmap.
157
-
158
207
  ## What would a deeper integration look like?
159
208
 
160
209
  I have a few ideas so far. One approach involves applying operations directly on
@@ -169,18 +218,21 @@ and BigQuery. More thoughts on this
169
218
  in [#4](https://github.com/alxmrs/xarray-sql/issues/4).
170
219
 
171
220
  _2025 update_: Something like this is being built across a few projects! The ones I know about are:
221
+
172
222
  - [CartoDB's Raquet](https://github.com/CartoDB/raquet)
173
223
  - The DataFusion community's [arrow-zarr](https://github.com/datafusion-contrib/arrow-zarr)
174
224
 
175
- As of writing, this project is [amid integrating](https://github.com/alxmrs/xarray-sql/pull/69) a
176
- rust-based DataFusion backend provided by arrow-zarr.
225
+ _2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
226
+
227
+ - [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
228
+ - [DuckDB-Zarr](https://github.com/hobbes-bot/duckdb-zarr)
177
229
 
178
230
  ## Roadmap
179
231
 
180
232
  - [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
181
- - [ ] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
182
- - [ ] Support core datafusion optimizations to scan less data, like [104](https://github.com/alxmrs/xarray-sql/issues/104), ...
183
- - [ ] Translate a single Zarr to a collection of tables via DataFusion's catalog interface [#85](https://github.com/alxmrs/xarray-sql/issues/85).
233
+ - [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
234
+ - [x] Support core datafusion optimizations to scan less data, like [#104](https://github.com/alxmrs/xarray-sql/issues/104), ...
235
+ - [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
184
236
  - [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
185
237
  - [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
186
238
  - [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
@@ -200,6 +252,8 @@ I want to give a special thanks to the following folks and institutions:
200
252
  and DataFusion-specific collaboration.
201
253
  - The gracious volunteer data science students at [UCSD's DS3](https://www.ds3atucsd.com/) org,
202
254
  who are working to make this library better.
255
+ - Andrew Huang for the sense of taste he brings to the project and consummate code
256
+ changes.
203
257
 
204
258
 
205
259
  ## License
@@ -19,52 +19,97 @@ This is an experiment to provide a SQL interface for array datasets.
19
19
  import xarray as xr
20
20
  import xarray_sql as xql
21
21
 
22
- ds = xr.tutorial.open_dataset('air_temperature')
23
22
 
24
- # The same as a dask-sql Context; i.e. an Apache DataFusion Context.
23
+ # Open a year of ARCO-ERA5 (all 273 variables). Selecting a year up front
24
+ # keeps Dask's partition setup cheap before any chunks are read from GCS.
25
+ ds = (
26
+ xr.open_zarr('gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
27
+ chunks=dict(time=1),
28
+ storage_options={'token': 'anon'}) # Anonymous read from the public GCS bucket — no auth required.
29
+ .sel(time='2020')
30
+ )
31
+
25
32
  ctx = xql.XarrayContext()
26
- ctx.from_dataset('air', ds, chunks=dict(time=24)) # the dataset needs to be chunked!
27
- # data is only materialized when we make a query.
28
-
29
- result = ctx.sql('''
30
- SELECT
31
- "lat", "lon", AVG("air") as air_avg
32
- FROM
33
- "air"
34
- GROUP BY
35
- "lat", "lon"
36
- ''')
37
- # DataFrame()
38
- # +------+-------+--------------------+
39
- # | lat | lon | air_avg |
40
- # +------+-------+--------------------+
41
- # | 75.0 | 205.0 | 259.88662671232834 |
42
- # | 75.0 | 207.5 | 259.48268150684896 |
43
- # | 75.0 | 230.0 | 258.9192123287667 |
44
- # | 75.0 | 275.0 | 257.07574315068456 |
45
- # | 75.0 | 322.5 | 250.11792123287654 |
46
- # | 75.0 | 325.0 | 250.81590068493134 |
47
- # | 72.5 | 205.0 | 262.74933904109537 |
48
- # | 72.5 | 207.5 | 262.5384315068488 |
49
- # | 72.5 | 230.0 | 260.82879452054743 |
50
- # | 72.5 | 275.0 | 257.3063321917804 |
51
- # +------+-------+--------------------+
52
- # Data truncated.
53
-
54
- # The full query is only made when we call `collect()`, or, in this case,
55
- # `to_pandas()`.
56
- df = result.to_pandas()
57
- df.head()
58
- # lat lon air_avg
59
- # 0 75.0 232.5 258.836188
60
- # 1 75.0 247.5 257.716171
61
- # 2 75.0 262.5 257.347959
62
- # 3 75.0 277.5 257.671308
63
- # 4 72.5 232.5 260.654401
33
+ ctx.from_dataset('era5', ds, table_names={
34
+ ('time', 'latitude', 'longitude'): 'surface',
35
+ ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
36
+ })
37
+ # Registration: ~0.5s for a full year of hourly ERA5, all variables.
38
+
39
+
40
+ # Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
41
+ # pushes column projection down to Zarr, so SELECT only fetches what you ask
42
+ # for — but `SELECT * FROM era5.surface` would try to pull every variable
43
+ # across the year (terabytes from GCS).
44
+ # ---> Always SELECT specific columns. <---
45
+
46
+ # Average 2m-temperature over NYC on the morning of 2020-01-01. The library
47
+ # pushes WHERE clauses on dimension columns down to partition pruning.
48
+ ctx.sql('''
49
+ SELECT AVG("2m_temperature") - 273.15 AS avg_c
50
+ FROM era5.surface
51
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
52
+ AND TIMESTAMP '2020-01-01 05:00:00'
53
+ AND latitude BETWEEN 39 AND 40
54
+ AND longitude BETWEEN 286 AND 287 -- ERA5 uses 0-360 longitudes
55
+ ''').to_pandas()
56
+ # avg_c
57
+ # 0 8.640069
58
+
59
+ # Average temperature per pressure level, globally.
60
+ ctx.sql('''
61
+ SELECT level, AVG(temperature) - 273.15 AS avg_c
62
+ FROM era5.atmosphere
63
+ WHERE time BETWEEN TIMESTAMP '2020-01-01'
64
+ AND TIMESTAMP '2020-01-01 05:00:00'
65
+ GROUP BY level
66
+ ORDER BY level DESC
67
+ ''').to_pandas()
68
+ # level avg_c
69
+ # 0 1000 6.621012 ← surface
70
+ # 1 975 5.185638
71
+ # 2 950 4.028429
72
+ # 3 925 3.082812
73
+ # 4 900 2.210917
74
+ # 5 875 1.395018
75
+ # 6 850 0.634267
76
+ # 7 825 -0.210372
77
+ # 8 800 -1.181075
78
+ # 9 775 -2.306465
79
+ # 10 750 -3.535534
80
+ # 11 700 -6.241685
81
+ # 12 650 -9.236364
82
+ # 13 600 -12.580938
83
+ # 14 550 -16.335386
84
+ # 15 500 -20.643604
85
+ # 16 450 -25.573401
86
+ # 17 400 -31.156920
87
+ # 18 350 -37.400552
88
+ # 19 300 -43.852607
89
+ # 20 250 -49.322132
90
+ # 21 225 -51.569113
91
+ # 22 200 -53.693248
92
+ # 23 175 -55.890484
93
+ # 24 150 -58.382290
94
+ # 25 125 -61.091916
95
+ # 26 100 -63.624885 ← tropopause
96
+ # 27 70 -63.182300
97
+ # 28 50 -60.124845
98
+ # 29 30 -55.986327
99
+ # 30 20 -52.433089
100
+ # 31 10 -44.140750
101
+ # 32 7 -38.707350
102
+ # 33 5 -32.621999
103
+ # 34 3 -21.509175
104
+ # 35 2 -13.355764
105
+ # 36 1 -9.020513 ← top of atmosphere
64
106
  ```
65
107
 
66
- Succinctly, we "pivot" Xarray Datasets (with consistent dimensions) to treat them like tables so we can run
67
- SQL queries against them.
108
+ _(A runnable version of this example lives at
109
+ [`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
110
+
111
+ Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
112
+ SQL queries against them.
68
113
 
69
114
  ## Why build this?
70
115
 
@@ -94,11 +139,11 @@ That's it!
94
139
  _2025 update_: This library now implements a Dask-like `from_map` interface in
95
140
  pure DataFusion and PyArrow, but works with the same principle!
96
141
 
97
- _2026 update_: Instead of `from_map()`, we make factory functions from blocks of
98
- Xarray datasets that return RecordBatchReaders. These feed into a Rust-based
99
- DataFusion `TableProvider`. Every chunk is uses the Arrow in memory format to
100
- translate between Python and Rust. Even still, the core of what makes this idea
101
- work is the core `pivot()` operation from where this project began!
142
+ _2026 update_: Instead of `from_map()`, we create a way to translate Xarray chunks
143
+ into Arrow RecordBatches. We pass a Python callback into a DataFusion `TableProvider`
144
+ that lets the DB engine translate the underlying Dataset arrays into DataFusion partitions.
145
+ Ultimately, the initial insight of the `pivot()` function -- that any ndarray can be
146
+ translated into a 2D table -- underlies this performant query mechanism.
102
147
 
103
148
  ## Why does this work?
104
149
 
@@ -116,11 +161,6 @@ early users – "tire kickers", if you will. We'd love your input to shape the d
116
161
  project! Please, give this a try and [file issues](https://github.com/alxmrs/xarray-sql/issues) as
117
162
  you see fit. Check out our [contributing guide](CONTRIBUTING.md), too 😉.
118
163
 
119
- I can say that for now, the library is oriented towards making whole scans of
120
- Xarray Datasets. Common filter optimizations (even basic ones like an `.sel()` on
121
- core dimensions, let alone predicate push downs) are not fully implemented yet.
122
- However, these operations and more are on our roadmap.
123
-
124
164
  ## What would a deeper integration look like?
125
165
 
126
166
  I have a few ideas so far. One approach involves applying operations directly on
@@ -135,18 +175,21 @@ and BigQuery. More thoughts on this
135
175
  in [#4](https://github.com/alxmrs/xarray-sql/issues/4).
136
176
 
137
177
  _2025 update_: Something like this is being built across a few projects! The ones I know about are:
178
+
138
179
  - [CartoDB's Raquet](https://github.com/CartoDB/raquet)
139
180
  - The DataFusion community's [arrow-zarr](https://github.com/datafusion-contrib/arrow-zarr)
140
181
 
141
- As of writing, this project is [amid integrating](https://github.com/alxmrs/xarray-sql/pull/69) a
142
- rust-based DataFusion backend provided by arrow-zarr.
182
+ _2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
183
+
184
+ - [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
185
+ - [DuckDB-Zarr](https://github.com/hobbes-bot/duckdb-zarr)
143
186
 
144
187
  ## Roadmap
145
188
 
146
189
  - [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
147
- - [ ] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
148
- - [ ] Support core datafusion optimizations to scan less data, like [104](https://github.com/alxmrs/xarray-sql/issues/104), ...
149
- - [ ] Translate a single Zarr to a collection of tables via DataFusion's catalog interface [#85](https://github.com/alxmrs/xarray-sql/issues/85).
190
+ - [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
191
+ - [x] Support core datafusion optimizations to scan less data, like [#104](https://github.com/alxmrs/xarray-sql/issues/104), ...
192
+ - [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
150
193
  - [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
151
194
  - [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
152
195
  - [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
@@ -166,6 +209,8 @@ I want to give a special thanks to the following folks and institutions:
166
209
  and DataFusion-specific collaboration.
167
210
  - The gracious volunteer data science students at [UCSD's DS3](https://www.ds3atucsd.com/) org,
168
211
  who are working to make this library better.
212
+ - Andrew Huang for the sense of taste he brings to the project and consummate code
213
+ changes.
169
214
 
170
215
 
171
216
  ## License
@@ -0,0 +1,104 @@
1
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2
+ <svg
3
+ version="1.1"
4
+ x="0px"
5
+ y="0px"
6
+ viewBox="50 115 420 395"
7
+ xml:space="preserve"
8
+ id="svg6"
9
+ sodipodi:docname="logo.svg"
10
+ inkscape:version="1.3.2 (091e20e, 2023-11-25)"
11
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
12
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
13
+ xmlns="http://www.w3.org/2000/svg"
14
+ xmlns:svg="http://www.w3.org/2000/svg"><defs
15
+ id="defs6" /><sodipodi:namedview
16
+ id="namedview6"
17
+ pagecolor="#ffffff"
18
+ bordercolor="#000000"
19
+ borderopacity="0.25"
20
+ inkscape:showpageshadow="2"
21
+ inkscape:pageopacity="0.0"
22
+ inkscape:pagecheckerboard="0"
23
+ inkscape:deskcolor="#d1d1d1"
24
+ showguides="false"
25
+ inkscape:zoom="0.59746835"
26
+ inkscape:cx="76.154661"
27
+ inkscape:cy="63.601695"
28
+ inkscape:window-width="1320"
29
+ inkscape:window-height="905"
30
+ inkscape:window-x="181"
31
+ inkscape:window-y="34"
32
+ inkscape:window-maximized="0"
33
+ inkscape:current-layer="g6" />
34
+ <style
35
+ type="text/css"
36
+ id="style1">
37
+ .st0{fill:#216C89;}
38
+ .st1{fill:#4993AA;}
39
+ .st2{fill:#0F4565;}
40
+ .st3{fill:#6BE8E8;}
41
+ .st4{fill:#9DEEF4;}
42
+ .st5{fill:#4ACFDD;}
43
+ .st6{fill:#E38017;}
44
+ .st7{fill:#16AFB5;}
45
+ </style>
46
+ <g
47
+ id="g6">
48
+ <!-- BOTTOM DATABASE (dark) -->
49
+ <path
50
+ class="st2"
51
+ d="m 65,362.92813 c 0,0 0,96.42499 0,96.42499 0,17.9075 70,27.55 112,27.55 42,0 112,-9.6425 112,-27.55 v -96.42499 c 0,17.90749 -70,27.54999 -112,27.54999 -42,0 -112,-9.6425 -112,-27.54999 z"
52
+ id="path1"
53
+ style="stroke-width:0.82991" />
54
+ <path
55
+ class="st0"
56
+ d="m 177,390.47812 c 42,0 112,-9.6425 112,-27.54999 v 96.42499 c 0,17.9075 -70,27.55 -112,27.55 z"
57
+ opacity="0.4"
58
+ id="path2"
59
+ style="stroke-width:0.82991" />
60
+
61
+
62
+
63
+ <!-- TOP DATABASE (teal) -->
64
+ <path
65
+ class="st5"
66
+ d="m 65,276.83438 c 0,0 0,75.76249 0,75.76249 0,17.90751 70,27.55001 112,27.55001 42,0 112,-9.6425 112,-27.55001 v -75.76249 c 0,17.90749 -70,27.54999 -112,27.54999 -42,0 -112,-9.6425 -112,-27.54999 z"
67
+ id="path3"
68
+ style="stroke-width:0.82991" />
69
+ <path
70
+ class="st3"
71
+ d="m 177,304.38437 c 42,0 112,-9.6425 112,-27.54999 v 75.76249 c 0,17.90751 -70,27.55001 -112,27.55001 z"
72
+ opacity="0.4"
73
+ id="path4"
74
+ style="stroke-width:0.82991" />
75
+ <path
76
+ class="st5"
77
+ d="m 66.576,190.40937 c 0,0 0,75.7625 0,75.7625 0,17.90751 70,27.55 112,27.55 42,0 112,-9.64249 112,-27.55 v -75.7625 c 0,17.9075 -70,27.55 -112,27.55 -42,0 -112,-9.6425 -112,-27.55 z"
78
+ id="path3-5"
79
+ style="fill:#4acfdd;stroke-width:0.82991" /><path
80
+ class="st3"
81
+ d="m 178.576,217.95937 c 42,0 112,-9.6425 112,-27.55 v 75.7625 c 0,17.90751 -70,27.55 -112,27.55 z"
82
+ opacity="0.4"
83
+ id="path4-7"
84
+ style="fill:#6be8e8;stroke-width:0.82991" /><ellipse
85
+ class="st4"
86
+ cx="178.576"
87
+ cy="183.52188"
88
+ rx="112"
89
+ ry="27.549999"
90
+ id="ellipse4-5"
91
+ style="fill:#9deef4;stroke-width:0.82991" />
92
+
93
+
94
+ <!-- DIAGONAL BARS -->
95
+ <polygon
96
+ class="st6"
97
+ points="377.48,412.74 308.66,482.2 308.66,346.56 377.48,277.09"
98
+ id="polygon5" />
99
+ <polygon
100
+ class="st7"
101
+ points="457.07,412.74 388.25,482.2 388.25,346.56 457.07,277.09"
102
+ id="polygon6" />
103
+ </g>
104
+ </svg>
@@ -0,0 +1 @@
1
+ --8<-- "CONTRIBUTING.md"