PyPI - xarray_sql - Versions diffs - 0.2.3__tar.gz → 0.3.0__tar.gz - Mend

xarray_sql 0.2.3__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

{xarray_sql-0.2.3 → xarray_sql-0.3.0}/Cargo.lock RENAMED Viewed

@@ -3375,7 +3375,7 @@ checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb"
 [[package]]
 name = "xarray_sql"
-version = "0.2.3"
+version = "0.3.0"
 dependencies = [
  "arrow",
  "async-stream",

{xarray_sql-0.2.3 → xarray_sql-0.3.0}/Cargo.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "xarray_sql"
-version = "0.2.3"
+version = "0.3.0"
 authors = ["Alex Merose"]
 edition = "2021"
 exclude = [

{xarray_sql-0.2.3 → xarray_sql-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xarray_sql
-Version: 0.2.3
+Version: 0.3.0
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Science/Research
 Classifier: Intended Audience :: Developers
@@ -63,22 +63,21 @@ import xarray as xr
 import xarray_sql as xql
-# Open a year of ARCO-ERA5 — all 273 variables. Selecting a year up front
-# keeps Dask's partition setup cheap before any chunks are read from GCS.
-ds = (
-  xr.open_zarr('gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
-               chunks=dict(time=1),
-               storage_options={'token': 'anon'})  # Anonymous read from the public GCS bucket — no auth required.
-  .sel(time='2020')
+# Open ARCO-ERA5 — a weather dataset with 273 variables since 1940.
+# Turning off dask means we don't have to wait to construct a task graph.
+ds = xr.open_zarr(
+  'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
+  chunks=None,  # Turn dask off
+  storage_options={'token': 'anon'}  # Anonymous read from the public GCS bucket — no auth required.
 )
 ctx = xql.XarrayContext()
-ctx.from_dataset('era5', ds, table_names={
+# Make sure to pass `chunks`!
+ctx.from_dataset('era5', ds, chunks=dict(time=6), table_names={
     ('time', 'latitude', 'longitude'): 'surface',
     ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
 })
-# Registration: ~0.5s for a full year of hourly ERA5, all variables.
+# Registration takes ~10s on my machine.
 # Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
 # pushes column projection down to Zarr, so SELECT only fetches what you ask
@@ -100,52 +99,50 @@ ctx.sql('''
 # 0  8.640069
 # Average temperature per pressure level, globally.
-ctx.sql('''
+result = ctx.sql('''
   SELECT level, AVG(temperature) - 273.15 AS avg_c
   FROM era5.atmosphere
   WHERE time BETWEEN TIMESTAMP '2020-01-01'
                  AND TIMESTAMP '2020-01-01 05:00:00'
   GROUP BY level
   ORDER BY level DESC
-''').to_pandas()
-#     level      avg_c
-# 0    1000   6.621012   ← surface
-# 1     975   5.185638
-# 2     950   4.028429
-# 3     925   3.082812
-# 4     900   2.210917
-# 5     875   1.395018
-# 6     850   0.634267
-# 7     825  -0.210372
-# 8     800  -1.181075
-# 9     775  -2.306465
-# 10    750  -3.535534
-# 11    700  -6.241685
-# 12    650  -9.236364
-# 13    600 -12.580938
-# 14    550 -16.335386
-# 15    500 -20.643604
-# 16    450 -25.573401
-# 17    400 -31.156920
-# 18    350 -37.400552
-# 19    300 -43.852607
-# 20    250 -49.322132
-# 21    225 -51.569113
-# 22    200 -53.693248
-# 23    175 -55.890484
-# 24    150 -58.382290
-# 25    125 -61.091916
-# 26    100 -63.624885   ← tropopause
-# 27     70 -63.182300
-# 28     50 -60.124845
-# 29     30 -55.986327
-# 30     20 -52.433089
-# 31     10 -44.140750
-# 32      7 -38.707350
-# 33      5 -32.621999
-# 34      3 -21.509175
-# 35      2 -13.355764
-# 36      1  -9.020513   ← top of atmosphere
+''')
+# DataFrame()
+# +-------+----------------------+
+# | level | avg_c                |
+# +-------+----------------------+
+# | 1000  | 6.6210120796502565   |
+# | 975   | 5.185637919348153    |
+# | 950   | 4.028428657263021    |
+# | 925   | 3.0828117974912743   |
+# | 900   | 2.2109172992531967   |
+# | 875   | 1.395017610194202    |
+# | 850   | 0.6342670572626616   |
+# | 825   | -0.21037158786759846 |
+# | 800   | -1.1810754318269687  |
+# | 775   | -2.3064649711534457  |
+# +-------+----------------------+
+ctx.sql('''
+  SELECT latitude, longitude, AVG("2m_temperature") - 273.15 AS avg_c
+  FROM era5.surface
+  WHERE time BETWEEN TIMESTAMP '2020-01-01'
+                 AND TIMESTAMP '2020-01-01 05:00:00'
+  GROUP BY latitude, longitude
+  ORDER BY latitude DESC, longitude
+''').to_dataset(dims=['latitude', 'longitude'], template=ds)
+# <xarray.Dataset> Size: 8MB
+# Dimensions:    (latitude: 721, longitude: 1440)
+# Coordinates:
+#   * latitude   (latitude) float32 3kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
+#   * longitude  (longitude) float32 6kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
+# Data variables:
+#     avg_c      (latitude, longitude) float64 8MB -26.84 -26.84 ... -27.38 -27.38
+# Attributes:
+#     last_updated:           2026-06-20 02:33:34.265980+00:00
+#     valid_time_start:       1940-01-01
+#     valid_time_stop:        2025-12-31
+#     valid_time_stop_era5t:  2026-06-14
 ```
 _(A runnable version of this example lives at
@@ -225,14 +222,14 @@ _2025 update_: Something like this is being built across a few projects! The one
 _2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
 - [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
-- [DuckDB-Zarr](https://github.com/hobbes-bot/duckdb-zarr)
+- [DuckDB-Zarr](https://github.com/alxmrs/duckdb-zarr)
 ## Roadmap
 - [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
 - [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
 - [x] Support core datafusion optimizations to scan less data, like [104](https://github.com/alxmrs/xarray-sql/issues/104), ...
-- [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
+- [x] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
 - [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
 - [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
 - [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
@@ -254,6 +251,8 @@ I want to give a special thanks to the following folks and institutions:
   who are working to make this library better.
 - Andrew Huang for the sense of taste he brings to the project and consummate code
   changes.
+- Aman Kumar for spending a considerable amount of his GSoC internship
+  contributing to this project.
 ## License

{xarray_sql-0.2.3 → xarray_sql-0.3.0}/README.md RENAMED Viewed

@@ -20,22 +20,21 @@ import xarray as xr
 import xarray_sql as xql
-# Open a year of ARCO-ERA5 — all 273 variables. Selecting a year up front
-# keeps Dask's partition setup cheap before any chunks are read from GCS.
-ds = (
-  xr.open_zarr('gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
-               chunks=dict(time=1),
-               storage_options={'token': 'anon'})  # Anonymous read from the public GCS bucket — no auth required.
-  .sel(time='2020')
+# Open ARCO-ERA5 — a weather dataset with 273 variables since 1940.
+# Turning off dask means we don't have to wait to construct a task graph.
+ds = xr.open_zarr(
+  'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
+  chunks=None,  # Turn dask off
+  storage_options={'token': 'anon'}  # Anonymous read from the public GCS bucket — no auth required.
 )
 ctx = xql.XarrayContext()
-ctx.from_dataset('era5', ds, table_names={
+# Make sure to pass `chunks`!
+ctx.from_dataset('era5', ds, chunks=dict(time=6), table_names={
     ('time', 'latitude', 'longitude'): 'surface',
     ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
 })
-# Registration: ~0.5s for a full year of hourly ERA5, all variables.
+# Registration takes ~10s on my machine.
 # Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
 # pushes column projection down to Zarr, so SELECT only fetches what you ask
@@ -57,52 +56,50 @@ ctx.sql('''
 # 0  8.640069
 # Average temperature per pressure level, globally.
-ctx.sql('''
+result = ctx.sql('''
   SELECT level, AVG(temperature) - 273.15 AS avg_c
   FROM era5.atmosphere
   WHERE time BETWEEN TIMESTAMP '2020-01-01'
                  AND TIMESTAMP '2020-01-01 05:00:00'
   GROUP BY level
   ORDER BY level DESC
-''').to_pandas()
-#     level      avg_c
-# 0    1000   6.621012   ← surface
-# 1     975   5.185638
-# 2     950   4.028429
-# 3     925   3.082812
-# 4     900   2.210917
-# 5     875   1.395018
-# 6     850   0.634267
-# 7     825  -0.210372
-# 8     800  -1.181075
-# 9     775  -2.306465
-# 10    750  -3.535534
-# 11    700  -6.241685
-# 12    650  -9.236364
-# 13    600 -12.580938
-# 14    550 -16.335386
-# 15    500 -20.643604
-# 16    450 -25.573401
-# 17    400 -31.156920
-# 18    350 -37.400552
-# 19    300 -43.852607
-# 20    250 -49.322132
-# 21    225 -51.569113
-# 22    200 -53.693248
-# 23    175 -55.890484
-# 24    150 -58.382290
-# 25    125 -61.091916
-# 26    100 -63.624885   ← tropopause
-# 27     70 -63.182300
-# 28     50 -60.124845
-# 29     30 -55.986327
-# 30     20 -52.433089
-# 31     10 -44.140750
-# 32      7 -38.707350
-# 33      5 -32.621999
-# 34      3 -21.509175
-# 35      2 -13.355764
-# 36      1  -9.020513   ← top of atmosphere
+''')
+# DataFrame()
+# +-------+----------------------+
+# | level | avg_c                |
+# +-------+----------------------+
+# | 1000  | 6.6210120796502565   |
+# | 975   | 5.185637919348153    |
+# | 950   | 4.028428657263021    |
+# | 925   | 3.0828117974912743   |
+# | 900   | 2.2109172992531967   |
+# | 875   | 1.395017610194202    |
+# | 850   | 0.6342670572626616   |
+# | 825   | -0.21037158786759846 |
+# | 800   | -1.1810754318269687  |
+# | 775   | -2.3064649711534457  |
+# +-------+----------------------+
+ctx.sql('''
+  SELECT latitude, longitude, AVG("2m_temperature") - 273.15 AS avg_c
+  FROM era5.surface
+  WHERE time BETWEEN TIMESTAMP '2020-01-01'
+                 AND TIMESTAMP '2020-01-01 05:00:00'
+  GROUP BY latitude, longitude
+  ORDER BY latitude DESC, longitude
+''').to_dataset(dims=['latitude', 'longitude'], template=ds)
+# <xarray.Dataset> Size: 8MB
+# Dimensions:    (latitude: 721, longitude: 1440)
+# Coordinates:
+#   * latitude   (latitude) float32 3kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
+#   * longitude  (longitude) float32 6kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
+# Data variables:
+#     avg_c      (latitude, longitude) float64 8MB -26.84 -26.84 ... -27.38 -27.38
+# Attributes:
+#     last_updated:           2026-06-20 02:33:34.265980+00:00
+#     valid_time_start:       1940-01-01
+#     valid_time_stop:        2025-12-31
+#     valid_time_stop_era5t:  2026-06-14
 ```
 _(A runnable version of this example lives at
@@ -182,14 +179,14 @@ _2025 update_: Something like this is being built across a few projects! The one
 _2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
 - [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
-- [DuckDB-Zarr](https://github.com/hobbes-bot/duckdb-zarr)
+- [DuckDB-Zarr](https://github.com/alxmrs/duckdb-zarr)
 ## Roadmap
 - [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
 - [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
 - [x] Support core datafusion optimizations to scan less data, like [104](https://github.com/alxmrs/xarray-sql/issues/104), ...
-- [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
+- [x] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
 - [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
 - [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
 - [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
@@ -211,6 +208,8 @@ I want to give a special thanks to the following folks and institutions:
   who are working to make this library better.
 - Andrew Huang for the sense of taste he brings to the project and consummate code
   changes.
+- Aman Kumar for spending a considerable amount of his GSoC internship
+  contributing to this project.
 ## License

{xarray_sql-0.2.3 → xarray_sql-0.3.0}/docs/examples.md RENAMED Viewed

@@ -87,7 +87,39 @@ If you omit `table_names`, each table is named by joining its dimension names
 with underscores, e.g. `era5.time_latitude_longitude` and
 `era5.time_level_latitude_longitude`.
-A runnable version of this example lives at
+## GOES satellite imagery (scalar variables)
+Real-world stores often mix gridded data with scalar (0-dimensional) metadata.
+GOES satellite imagery, for example, pairs `(y, x)` image bands with dozens of
+scalar variables such as `goes_imager_projection`. `from_dataset` groups all the
+scalars into a single one-row table named `scalar`:
+```python
+import fsspec
+import xarray as xr
+from xarray_sql import XarrayContext
+# A real GOES-16 ABI cloud-and-moisture file from NOAA's public bucket:
+# (y, x) image bands alongside dozens of scalar metadata variables.
+url = (
+    'https://noaa-goes16.s3.amazonaws.com/ABI-L2-MCMIPM/2024/001/00/'
+    'OR_ABI-L2-MCMIPM1-M6_G16_s20240010000281_e20240010000350_c20240010000426.nc'
+)
+ds = xr.open_dataset(fsspec.open_local(f'simplecache::{url}')).chunk(
+    {'y': 250, 'x': 250}
+)
+ctx = XarrayContext()
+ctx.from_dataset('goes', ds)
+# The gridded bands and the scalar metadata are separate tables.
+ctx.sql('SELECT COUNT(*) AS n FROM goes.y_x').to_pandas()['n'][0]  # -> 250000
+ctx.sql('SELECT * FROM goes.scalar').to_pandas().shape            # -> (1, 89)
+```
+Override the default name like any other group with `table_names={(): 'metadata'}`.
+A runnable version of the ERA5 example lives at
 [`perf_tests/era5_temp_profile.py`](../perf_tests/era5_temp_profile.py).
 [arco-era5]: https://github.com/google-research/arco-era5

{xarray_sql-0.2.3 → xarray_sql-0.3.0}/pyproject.toml RENAMED Viewed

@@ -90,6 +90,7 @@ module = [
     "pyarrow.*",
     "datafusion.*",
     "xarray.*",
+    "pandas.*",
 ]
 ignore_missing_imports = true

{xarray_sql-0.2.3 → xarray_sql-0.3.0}/tests/test_df.py RENAMED Viewed

@@ -3,12 +3,14 @@ import tracemalloc
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import pytest
 import xarray as xr
 from xarray_sql.df import (
     DEFAULT_BATCH_SIZE,
     _parse_schema,
     block_slices,
+    compute_chunks,
     dataset_to_record_batch,
     explode,
     from_map,
@@ -58,6 +60,37 @@ def test_explode_data_equal_one_last(air):
     assert air.isel(iselection).equals(ds)
+def test_block_slices_scalar_dataset_yields_single_block():
+    # A dimensionless dataset (e.g. scalar metadata variables) has exactly
+    # one block: the whole, empty selection.
+    ds = xr.Dataset({"projection": ((), 0)})
+    assert list(block_slices(ds)) == [{}]
+def test_block_slices_scalar_ignores_irrelevant_chunks():
+    ds = xr.Dataset({"projection": ((), 0)})
+    assert list(block_slices(ds, chunks={"time": 4})) == [{}]
+def test_block_slices_filters_chunk_keys_to_dataset_dims(air_small):
+    # A chunk key for a dimension the dataset doesn't have is ignored,
+    # rather than raising.
+    base = list(block_slices(air_small, chunks={"time": 4, "lat": 3, "lon": 4}))
+    extra = list(
+        block_slices(
+            air_small, chunks={"time": 4, "lat": 3, "lon": 4, "absent": 2}
+        )
+    )
+    assert len(extra) == len(base)
+def test_block_slices_dimensional_unchunked_raises():
+    # A dataset with dimensions but no chunking is still a user error.
+    ds = xr.Dataset({"v": (["x"], np.arange(3))}, coords={"x": np.arange(3)})
+    with pytest.raises(AssertionError):
+        list(block_slices(ds))
 def test_from_map_basic():
     def make_df(x):
         return pd.DataFrame({"value": [x, x * 2], "index": [0, 1]})
@@ -441,3 +474,61 @@ def test_read_xarray_table_memory_bounds(large_ds):
         )
     finally:
         tracemalloc.stop()
+# ---------------------------------------------------------------------------
+# compute_chunks: arithmetic replacement for ds.chunk(...).chunks.
+# Dask serves as the source of truth.
+# ---------------------------------------------------------------------------
+def _dask_chunks(ds: xr.Dataset, chunks: dict) -> dict:
+    rechunked = ds.copy(data=None, deep=False).chunk(chunks)
+    return {str(k): tuple(v) for k, v in rechunked.chunks.items()}
+def _normalise(result: dict) -> dict:
+    return {str(k): tuple(v) for k, v in result.items()}
+def _simple_ds(shape: tuple[int, ...], dims: tuple[str, ...]) -> xr.Dataset:
+    return xr.Dataset(
+        {"v": (dims, np.zeros(shape))},
+        coords={d: np.arange(s) for d, s in zip(dims, shape)},
+    )
+@pytest.mark.parametrize(
+    "ds,chunks",
+    [
+        # Even divide on a single dim.
+        (_simple_ds((10,), ("x",)), {"x": 5}),
+        # Uneven divide: trailing remainder chunk.
+        (_simple_ds((10,), ("x",)), {"x": 3}),
+        # Requested chunk size larger than the dim → single chunk.
+        (_simple_ds((5,), ("x",)), {"x": 100}),
+        # Multi-dim spec with a dim left unspecified (kept as one chunk).
+        (_simple_ds((4, 6), ("x", "y")), {"x": 2}),
+        # Multi-dim spec rechunking every dim.
+        (_simple_ds((7, 11, 13), ("a", "b", "c")), {"a": 3, "b": 4, "c": 5}),
+    ],
+)
+def test_compute_chunks_matches_dask(ds, chunks):
+    assert _normalise(compute_chunks(ds, chunks)) == _dask_chunks(ds, chunks)
+def test_compute_chunks_preserves_existing_dask_chunking():
+    # When the dataset is already dask-backed, rechunking one dim must
+    # leave other dims' existing chunk tuples alone.
+    ds = _simple_ds((4, 5), ("x", "y")).chunk({"x": 1, "y": 2})
+    chunks = {"x": 2}
+    assert _normalise(compute_chunks(ds, chunks)) == _dask_chunks(ds, chunks)
+def test_compute_chunks_tuples_sum_to_dim_size():
+    # Dask-independent invariant: every per-dim chunk tuple must fully
+    # cover its dimension.
+    ds = _simple_ds((7, 11, 13), ("a", "b", "c"))
+    result = compute_chunks(ds, {"a": 3, "b": 4, "c": 5})
+    for dim, tup in result.items():
+        assert sum(tup) == ds.sizes[dim]

xarray_sql 0.2.3__tar.gz → 0.3.0__tar.gz

xarray_sql 0.2.3__tar.gz → 0.3.0__tar.gz