PyPI - xarray_sql - Versions diffs - 0.2.2__tar.gz → 0.3.0__tar.gz - Mend

xarray_sql 0.2.2.tar.gz → 0.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)

{xarray_sql-0.2.2 → xarray_sql-0.3.0}/Cargo.lock +1 -1
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/Cargo.toml +1 -1
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/PKG-INFO +86 -42
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/README.md +85 -41
xarray_sql-0.3.0/docs/examples.md +125 -0
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/pyproject.toml +8 -5
xarray_sql-0.3.0/tests/conftest.py +150 -0
xarray_sql-0.3.0/tests/test_cft.py +170 -0
xarray_sql-0.3.0/tests/test_df.py +534 -0
xarray_sql-0.3.0/tests/test_ds.py +571 -0
xarray_sql-0.3.0/tests/test_reader.py +1415 -0
xarray_sql-0.3.0/tests/test_sql.py +490 -0
xarray_sql-0.3.0/xarray_sql/cftime.py +248 -0
xarray_sql-0.3.0/xarray_sql/core.py +49 -0
xarray_sql-0.3.0/xarray_sql/df.py +508 -0
xarray_sql-0.3.0/xarray_sql/ds.py +838 -0
xarray_sql-0.3.0/xarray_sql/reader.py +332 -0
xarray_sql-0.3.0/xarray_sql/sql.py +191 -0
xarray_sql-0.2.2/docs/examples.md +0 -23
xarray_sql-0.2.2/tests/conftest.py +0 -144
xarray_sql-0.2.2/tests/test_cft.py +0 -176
xarray_sql-0.2.2/tests/test_df.py +0 -428
xarray_sql-0.2.2/tests/test_reader.py +0 -1372
xarray_sql-0.2.2/tests/test_sql.py +0 -318
xarray_sql-0.2.2/xarray_sql/cftime.py +0 -248
xarray_sql-0.2.2/xarray_sql/core.py +0 -49
xarray_sql-0.2.2/xarray_sql/df.py +0 -445
xarray_sql-0.2.2/xarray_sql/reader.py +0 -299
xarray_sql-0.2.2/xarray_sql/sql.py +0 -63
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/.gitignore +0 -0
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/LICENSE +0 -0
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/docs/assets/logo.svg +0 -0
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/docs/contributing.md +0 -0
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/docs/index.md +0 -0
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/docs/reference/xarray_sql.md +0 -0
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/src/lib.rs +0 -0
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/tests/__init__.py +0 -0
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/xarray_sql/__init__.py +1 -1
{xarray_sql-0.2.2 → xarray_sql-0.3.0}/zensical.toml +0 -0

{xarray_sql-0.2.2 → xarray_sql-0.3.0}/Cargo.lock RENAMED Viewed

@@ -3375,7 +3375,7 @@ checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb"
 [[package]]
 name = "xarray_sql"
-version = "0.2.2"
+version = "0.3.0"
 dependencies = [
  "arrow",
  "async-stream",

{xarray_sql-0.2.2 → xarray_sql-0.3.0}/Cargo.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "xarray_sql"
-version = "0.2.2"
+version = "0.3.0"
 authors = ["Alex Merose"]
 edition = "2021"
 exclude = [

{xarray_sql-0.2.2 → xarray_sql-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xarray_sql
-Version: 0.2.2
+Version: 0.3.0
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Science/Research
 Classifier: Intended Audience :: Developers
@@ -62,52 +62,94 @@ This is an experiment to provide a SQL interface for array datasets.
 import xarray as xr
 import xarray_sql as xql
-ds = xr.tutorial.open_dataset('air_temperature')
-# The same as a dask-sql Context; i.e. an Apache DataFusion Context.
-ctx = xql.XarrayContext()
-ctx.from_dataset('air', ds, chunks=dict(time=24))  # the dataset needs to be chunked!
-# data is only materialized when we make a query.
+# Open ARCO-ERA5 — a weather dataset with 273 variables since 1940.
+# Turning off dask means we don't have to wait to construct a task graph.
+ds = xr.open_zarr(
+  'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
+  chunks=None,  # Turn dask off
+  storage_options={'token': 'anon'}  # Anonymous read from the public GCS bucket — no auth required.
+)
+ctx = xql.XarrayContext()
+# Make sure to pass `chunks`!
+ctx.from_dataset('era5', ds, chunks=dict(time=6), table_names={
+    ('time', 'latitude', 'longitude'): 'surface',
+    ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
+})
+# Registration takes ~10s on my machine.
+# Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
+# pushes column projection down to Zarr, so SELECT only fetches what you ask
+# for — but `SELECT * FROM era5.surface` would try to pull every variable
+# across the year (terabytes from GCS).
+#  ---> Always SELECT specific columns. <---
+# Average 2m-temperature over NYC on the morning of 2020-01-01. The library
+# pushes WHERE clauses on dimension columns down to partition pruning.
+ctx.sql('''
+  SELECT AVG("2m_temperature") - 273.15 AS avg_c
+  FROM era5.surface
+  WHERE time BETWEEN TIMESTAMP '2020-01-01'
+                 AND TIMESTAMP '2020-01-01 05:00:00'
+    AND latitude  BETWEEN 39 AND 40
+    AND longitude BETWEEN 286 AND 287  -- ERA5 uses 0-360 longitudes
+''').to_pandas()
+#       avg_c
+# 0  8.640069
+# Average temperature per pressure level, globally.
 result = ctx.sql('''
-  SELECT
-    "lat", "lon", AVG("air") as air_avg
-  FROM
-    "air"
-  GROUP BY
-   "lat", "lon"
+  SELECT level, AVG(temperature) - 273.15 AS avg_c
+  FROM era5.atmosphere
+  WHERE time BETWEEN TIMESTAMP '2020-01-01'
+                 AND TIMESTAMP '2020-01-01 05:00:00'
+  GROUP BY level
+  ORDER BY level DESC
 ''')
 # DataFrame()
-# +------+-------+--------------------+
-# | lat  | lon   | air_avg            |
-# +------+-------+--------------------+
-# | 75.0 | 205.0 | 259.88662671232834 |
-# | 75.0 | 207.5 | 259.48268150684896 |
-# | 75.0 | 230.0 | 258.9192123287667  |
-# | 75.0 | 275.0 | 257.07574315068456 |
-# | 75.0 | 322.5 | 250.11792123287654 |
-# | 75.0 | 325.0 | 250.81590068493134 |
-# | 72.5 | 205.0 | 262.74933904109537 |
-# | 72.5 | 207.5 | 262.5384315068488  |
-# | 72.5 | 230.0 | 260.82879452054743 |
-# | 72.5 | 275.0 | 257.3063321917804  |
-# +------+-------+--------------------+
-# Data truncated.
-# The full query is only made when we call `collect()`, or, in this case,
-# `to_pandas()`.
-df = result.to_pandas()
-df.head()
-#     lat    lon     air_avg
-# 0  75.0  232.5  258.836188
-# 1  75.0  247.5  257.716171
-# 2  75.0  262.5  257.347959
-# 3  75.0  277.5  257.671308
-# 4  72.5  232.5  260.654401
+# +-------+----------------------+
+# | level | avg_c                |
+# +-------+----------------------+
+# | 1000  | 6.6210120796502565   |
+# | 975   | 5.185637919348153    |
+# | 950   | 4.028428657263021    |
+# | 925   | 3.0828117974912743   |
+# | 900   | 2.2109172992531967   |
+# | 875   | 1.395017610194202    |
+# | 850   | 0.6342670572626616   |
+# | 825   | -0.21037158786759846 |
+# | 800   | -1.1810754318269687  |
+# | 775   | -2.3064649711534457  |
+# +-------+----------------------+
+ctx.sql('''
+  SELECT latitude, longitude, AVG("2m_temperature") - 273.15 AS avg_c
+  FROM era5.surface
+  WHERE time BETWEEN TIMESTAMP '2020-01-01'
+                 AND TIMESTAMP '2020-01-01 05:00:00'
+  GROUP BY latitude, longitude
+  ORDER BY latitude DESC, longitude
+''').to_dataset(dims=['latitude', 'longitude'], template=ds)
+# <xarray.Dataset> Size: 8MB
+# Dimensions:    (latitude: 721, longitude: 1440)
+# Coordinates:
+#   * latitude   (latitude) float32 3kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
+#   * longitude  (longitude) float32 6kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
+# Data variables:
+#     avg_c      (latitude, longitude) float64 8MB -26.84 -26.84 ... -27.38 -27.38
+# Attributes:
+#     last_updated:           2026-06-20 02:33:34.265980+00:00
+#     valid_time_start:       1940-01-01
+#     valid_time_stop:        2025-12-31
+#     valid_time_stop_era5t:  2026-06-14
 ```
-Succinctly, we "pivot" Xarray Datasets (with consistent dimensions) to treat them like tables so we can run
-SQL queries against them.
+_(A runnable version of this example lives at
+[`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
+Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
+SQL queries against them.
 ## Why build this?
@@ -180,14 +222,14 @@ _2025 update_: Something like this is being built across a few projects! The one
 _2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
 - [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
-- [DuckDB-Zarr](https://github.com/hobbes-bot/duckdb-zarr)
+- [DuckDB-Zarr](https://github.com/alxmrs/duckdb-zarr)
 ## Roadmap
 - [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
 - [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
 - [x] Support core datafusion optimizations to scan less data, like [104](https://github.com/alxmrs/xarray-sql/issues/104), ...
-- [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
+- [x] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
 - [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
 - [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
 - [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
@@ -209,6 +251,8 @@ I want to give a special thanks to the following folks and institutions:
   who are working to make this library better.
 - Andrew Huang for the sense of taste he brings to the project and consummate code
   changes.
+- Aman Kumar for spending a considerable amount of his GSoC internship
+  contributing to this project.
 ## License

{xarray_sql-0.2.2 → xarray_sql-0.3.0}/README.md RENAMED Viewed

@@ -19,52 +19,94 @@ This is an experiment to provide a SQL interface for array datasets.
 import xarray as xr
 import xarray_sql as xql
-ds = xr.tutorial.open_dataset('air_temperature')
-# The same as a dask-sql Context; i.e. an Apache DataFusion Context.
-ctx = xql.XarrayContext()
-ctx.from_dataset('air', ds, chunks=dict(time=24))  # the dataset needs to be chunked!
-# data is only materialized when we make a query.
+# Open ARCO-ERA5 — a weather dataset with 273 variables since 1940.
+# Turning off dask means we don't have to wait to construct a task graph.
+ds = xr.open_zarr(
+  'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3',
+  chunks=None,  # Turn dask off
+  storage_options={'token': 'anon'}  # Anonymous read from the public GCS bucket — no auth required.
+)
+ctx = xql.XarrayContext()
+# Make sure to pass `chunks`!
+ctx.from_dataset('era5', ds, chunks=dict(time=6), table_names={
+    ('time', 'latitude', 'longitude'): 'surface',
+    ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
+})
+# Registration takes ~10s on my machine.
+# Heads up: ARCO-ERA5 has 262 surface + 11 atmospheric variables. The library
+# pushes column projection down to Zarr, so SELECT only fetches what you ask
+# for — but `SELECT * FROM era5.surface` would try to pull every variable
+# across the year (terabytes from GCS).
+#  ---> Always SELECT specific columns. <---
+# Average 2m-temperature over NYC on the morning of 2020-01-01. The library
+# pushes WHERE clauses on dimension columns down to partition pruning.
+ctx.sql('''
+  SELECT AVG("2m_temperature") - 273.15 AS avg_c
+  FROM era5.surface
+  WHERE time BETWEEN TIMESTAMP '2020-01-01'
+                 AND TIMESTAMP '2020-01-01 05:00:00'
+    AND latitude  BETWEEN 39 AND 40
+    AND longitude BETWEEN 286 AND 287  -- ERA5 uses 0-360 longitudes
+''').to_pandas()
+#       avg_c
+# 0  8.640069
+# Average temperature per pressure level, globally.
 result = ctx.sql('''
-  SELECT
-    "lat", "lon", AVG("air") as air_avg
-  FROM
-    "air"
-  GROUP BY
-   "lat", "lon"
+  SELECT level, AVG(temperature) - 273.15 AS avg_c
+  FROM era5.atmosphere
+  WHERE time BETWEEN TIMESTAMP '2020-01-01'
+                 AND TIMESTAMP '2020-01-01 05:00:00'
+  GROUP BY level
+  ORDER BY level DESC
 ''')
 # DataFrame()
-# +------+-------+--------------------+
-# | lat  | lon   | air_avg            |
-# +------+-------+--------------------+
-# | 75.0 | 205.0 | 259.88662671232834 |
-# | 75.0 | 207.5 | 259.48268150684896 |
-# | 75.0 | 230.0 | 258.9192123287667  |
-# | 75.0 | 275.0 | 257.07574315068456 |
-# | 75.0 | 322.5 | 250.11792123287654 |
-# | 75.0 | 325.0 | 250.81590068493134 |
-# | 72.5 | 205.0 | 262.74933904109537 |
-# | 72.5 | 207.5 | 262.5384315068488  |
-# | 72.5 | 230.0 | 260.82879452054743 |
-# | 72.5 | 275.0 | 257.3063321917804  |
-# +------+-------+--------------------+
-# Data truncated.
-# The full query is only made when we call `collect()`, or, in this case,
-# `to_pandas()`.
-df = result.to_pandas()
-df.head()
-#     lat    lon     air_avg
-# 0  75.0  232.5  258.836188
-# 1  75.0  247.5  257.716171
-# 2  75.0  262.5  257.347959
-# 3  75.0  277.5  257.671308
-# 4  72.5  232.5  260.654401
+# +-------+----------------------+
+# | level | avg_c                |
+# +-------+----------------------+
+# | 1000  | 6.6210120796502565   |
+# | 975   | 5.185637919348153    |
+# | 950   | 4.028428657263021    |
+# | 925   | 3.0828117974912743   |
+# | 900   | 2.2109172992531967   |
+# | 875   | 1.395017610194202    |
+# | 850   | 0.6342670572626616   |
+# | 825   | -0.21037158786759846 |
+# | 800   | -1.1810754318269687  |
+# | 775   | -2.3064649711534457  |
+# +-------+----------------------+
+ctx.sql('''
+  SELECT latitude, longitude, AVG("2m_temperature") - 273.15 AS avg_c
+  FROM era5.surface
+  WHERE time BETWEEN TIMESTAMP '2020-01-01'
+                 AND TIMESTAMP '2020-01-01 05:00:00'
+  GROUP BY latitude, longitude
+  ORDER BY latitude DESC, longitude
+''').to_dataset(dims=['latitude', 'longitude'], template=ds)
+# <xarray.Dataset> Size: 8MB
+# Dimensions:    (latitude: 721, longitude: 1440)
+# Coordinates:
+#   * latitude   (latitude) float32 3kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
+#   * longitude  (longitude) float32 6kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
+# Data variables:
+#     avg_c      (latitude, longitude) float64 8MB -26.84 -26.84 ... -27.38 -27.38
+# Attributes:
+#     last_updated:           2026-06-20 02:33:34.265980+00:00
+#     valid_time_start:       1940-01-01
+#     valid_time_stop:        2025-12-31
+#     valid_time_stop_era5t:  2026-06-14
 ```
-Succinctly, we "pivot" Xarray Datasets (with consistent dimensions) to treat them like tables so we can run
-SQL queries against them.
+_(A runnable version of this example lives at
+[`perf_tests/era5_temp_profile.py`](perf_tests/era5_temp_profile.py).)_
+Succinctly, we "pivot" Xarray Datasets to treat them like tables so we can run
+SQL queries against them.
 ## Why build this?
@@ -137,14 +179,14 @@ _2025 update_: Something like this is being built across a few projects! The one
 _2026 update_: A colleague and I are experimenting with native Zarr RDBMS engines. Check out:
 - [Zarr-Datafusion](https://lib.rs/crates/zarr-datafusion)
-- [DuckDB-Zarr](https://github.com/hobbes-bot/duckdb-zarr)
+- [DuckDB-Zarr](https://github.com/alxmrs/duckdb-zarr)
 ## Roadmap
 - [x] ~Lazy evaluation via the pyarrow Dataset interface [#93](https://github.com/alxmrs/xarray-sql/issues/93).~ _Implemented in [#100](https://github.com/alxmrs/xarray-sql/pull/100)_
 - [x] Support proper parallelism via proper partition handling on the rust/datafusion side. [#106](https://github.com/alxmrs/xarray-sql/issues/106)
 - [x] Support core datafusion optimizations to scan less data, like [104](https://github.com/alxmrs/xarray-sql/issues/104), ...
-- [ ] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
+- [x] Translate a single Zarr to a collection of tables [#85](https://github.com/alxmrs/xarray-sql/issues/85).
 - [ ] Distributed beyond a single node through the DataFusion integration with Ray Datasets [#68](https://github.com/alxmrs/xarray-sql/issues/68) or Apache Ballista [#98](https://github.com/alxmrs/xarray-sql/issues/98).
 - [ ] Demo: calculate Sea Surface Temperature from 1940 - Present in SQL [#36](https://github.com/alxmrs/xarray-sql/issues/36).
 - [ ] Provide an option to integrate DataFusion directly to Zarr via Rust [#4](https://github.com/alxmrs/xarray-sql/issues/4).
@@ -166,6 +208,8 @@ I want to give a special thanks to the following folks and institutions:
   who are working to make this library better.
 - Andrew Huang for the sense of taste he brings to the project and consummate code
   changes.
+- Aman Kumar for spending a considerable amount of his GSoC internship
+  contributing to this project.
 ## License

xarray_sql-0.3.0/docs/examples.md ADDED Viewed

@@ -0,0 +1,125 @@
+# Examples
+```python
+import xarray as xr
+import xarray_sql as xql
+ds = xr.tutorial.open_dataset('air_temperature')
+ctx = xql.XarrayContext()
+ctx.from_dataset('air', ds, chunks=dict(time=24))
+result = ctx.sql('''
+  SELECT
+    "lat", "lon", AVG("air") as air_avg
+  FROM
+    "air"
+  GROUP BY
+   "lat", "lon"
+''')
+df = result.to_pandas()
+df.head()
+```
+## Mixed-dimension datasets: ARCO-ERA5
+When a Dataset has variables with differing dimensions (e.g. surface fields on
+`(time, latitude, longitude)` and atmospheric fields on
+`(time, level, latitude, longitude)`), `from_dataset` splits them into one
+table per dimension group, registered together under a SQL schema named after
+the first argument. [ARCO-ERA5][arco-era5] is a good example: 262 of its
+variables are surface fields and 11 are atmospheric.
+Open a year of ARCO-ERA5 and let SQL `WHERE` clauses do the filtering — the
+library prunes time partitions and pushes dimension-column filters down. Use
+the `table_names` kwarg to give each dimension group a friendly name:
+```python
+import xarray as xr
+import xarray_sql as xql
+# Open ARCO-ERA5 directly from GCS (anonymous read).
+url = 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3'
+full = xr.open_zarr(url, chunks=None, storage_options={'token': 'anon'})
+# A full year of hourly ERA5 — all 273 variables. No spatial slicing on the
+# xarray side; SQL WHERE clauses below express the filters. `chunks={'time': 1}`
+# aligns Dask chunks to native Zarr chunks of shape (1, 37, 721, 1440) so
+# chunk reads from GCS happen concurrently.
+#
+# Heads up: 262 of those variables are surface and 11 are atmospheric. The
+# library pushes column projection down, so SELECT only fetches what you ask
+# for — but `SELECT * FROM era5.surface` would try to pull every variable
+# across the year (terabytes from GCS). Always SELECT specific columns.
+ds = full.sel(time='2020').chunk({'time': 1})
+ctx = xql.XarrayContext()
+ctx.from_dataset('era5', ds, table_names={
+    ('time', 'latitude', 'longitude'): 'surface',
+    ('time', 'level', 'latitude', 'longitude'): 'atmosphere',
+})
+# Registers two tables under a SQL schema named 'era5': 'surface' and 'atmosphere'.
+# Average 2m-temperature over the NYC area on the morning of 2020-01-01.
+ctx.sql('''
+  SELECT AVG("2m_temperature") - 273.15 AS avg_c
+  FROM era5.surface
+  WHERE time BETWEEN TIMESTAMP '2020-01-01'
+                 AND TIMESTAMP '2020-01-01 05:00:00'
+    AND latitude  BETWEEN 39 AND 40
+    AND longitude BETWEEN 286 AND 287
+''').to_pandas()
+# Average temperature per pressure level, globally — the standard
+# atmospheric temperature profile. Scans ~230M rows.
+ctx.sql('''
+  SELECT level, AVG(temperature) - 273.15 AS avg_c
+  FROM era5.atmosphere
+  WHERE time BETWEEN TIMESTAMP '2020-01-01'
+                 AND TIMESTAMP '2020-01-01 05:00:00'
+  GROUP BY level
+  ORDER BY level DESC  -- surface (1000 hPa) first
+''').to_pandas()
+```
+If you omit `table_names`, each table is named by joining its dimension names
+with underscores, e.g. `era5.time_latitude_longitude` and
+`era5.time_level_latitude_longitude`.
+## GOES satellite imagery (scalar variables)
+Real-world stores often mix gridded data with scalar (0-dimensional) metadata.
+GOES satellite imagery, for example, pairs `(y, x)` image bands with dozens of
+scalar variables such as `goes_imager_projection`. `from_dataset` groups all the
+scalars into a single one-row table named `scalar`:
+```python
+import fsspec
+import xarray as xr
+from xarray_sql import XarrayContext
+# A real GOES-16 ABI cloud-and-moisture file from NOAA's public bucket:
+# (y, x) image bands alongside dozens of scalar metadata variables.
+url = (
+    'https://noaa-goes16.s3.amazonaws.com/ABI-L2-MCMIPM/2024/001/00/'
+    'OR_ABI-L2-MCMIPM1-M6_G16_s20240010000281_e20240010000350_c20240010000426.nc'
+)
+ds = xr.open_dataset(fsspec.open_local(f'simplecache::{url}')).chunk(
+    {'y': 250, 'x': 250}
+)
+ctx = XarrayContext()
+ctx.from_dataset('goes', ds)
+# The gridded bands and the scalar metadata are separate tables.
+ctx.sql('SELECT COUNT(*) AS n FROM goes.y_x').to_pandas()['n'][0]  # -> 250000
+ctx.sql('SELECT * FROM goes.scalar').to_pandas().shape            # -> (1, 89)
+```
+Override the default name like any other group with `table_names={(): 'metadata'}`.
+A runnable version of the ERA5 example lives at
+[`perf_tests/era5_temp_profile.py`](../perf_tests/era5_temp_profile.py).
+[arco-era5]: https://github.com/google-research/arco-era5

{xarray_sql-0.2.2 → xarray_sql-0.3.0}/pyproject.toml RENAMED Viewed

@@ -64,11 +64,13 @@ module-name = "xarray_sql._native"
 [tool.setuptools.packages.find]
 exclude = ["demo", "perf_tests", "tests", "tests.*"]
-[tool.pyink]
+[tool.ruff]
 line-length = 80
-preview = true
-pyink-indentation = 2
-pyink-use-majority-quotes = true
+indent-width = 4
+[tool.ruff.format]
+indent-style = "space"
+quote-style = "double"
 [tool.mypy]
 python_version = "3.11"
@@ -88,6 +90,7 @@ module = [
     "pyarrow.*",
     "datafusion.*",
     "xarray.*",
+    "pandas.*",
 ]
 ignore_missing_imports = true
@@ -98,7 +101,7 @@ dev = [
     "xarray_sql[test]",
     "xarray_sql[docs]",
     "py-spy>=0.4.0",
-    "pyink>=24.10.1",
+    "ruff>=0.15.10",
     "maturin>=1.9.1",
 ]

xarray_sql-0.3.0/tests/conftest.py ADDED Viewed

@@ -0,0 +1,150 @@
+import pytest
+import numpy as np
+import pandas as pd
+import xarray as xr
+def rand_wx(start: str, end: str) -> xr.Dataset:
+    np.random.seed(42)
+    lat = np.linspace(-90, 90, num=720)
+    lon = np.linspace(-180, 180, num=1440)
+    time = pd.date_range(start, end, freq="h")
+    level = np.array([1000, 500], dtype=np.int32)
+    reference_time = pd.Timestamp(start)
+    temperature = 15 + 8 * np.random.randn(720, 1440, len(time), len(level))
+    precipitation = 10 * np.random.rand(720, 1440, len(time), len(level))
+    return xr.Dataset(
+        data_vars=dict(
+            temperature=(["lat", "lon", "time", "level"], temperature),
+            precipitation=(["lat", "lon", "time", "level"], precipitation),
+        ),
+        coords=dict(
+            lat=lat,
+            lon=lon,
+            time=time,
+            level=level,
+            reference_time=reference_time,
+        ),
+        attrs=dict(description="Random weather."),
+    )
+def create_large_dataset(time_steps=1000, lat_points=100, lon_points=100):
+    """Create a large xarray dataset for memory testing."""
+    np.random.seed(42)
+    time = pd.date_range("2020-01-01", periods=time_steps, freq="h")
+    lat = np.linspace(-90, 90, lat_points)
+    lon = np.linspace(-180, 180, lon_points)
+    temp_data = np.random.rand(time_steps, lat_points, lon_points) * 40 - 10
+    precip_data = np.random.rand(time_steps, lat_points, lon_points) * 100
+    return xr.Dataset(
+        {
+            "temperature": (["time", "lat", "lon"], temp_data),
+            "precipitation": (["time", "lat", "lon"], precip_data),
+        },
+        coords={"time": time, "lat": lat, "lon": lon},
+    )
+@pytest.fixture
+def air():
+    ds = xr.tutorial.open_dataset("air_temperature")
+    chunks = {"time": 240}
+    return ds.chunk(chunks)
+@pytest.fixture
+def air_small(air):
+    return air.isel(
+        time=slice(0, 12), lat=slice(0, 11), lon=slice(0, 10)
+    ).chunk({"time": 240})
+@pytest.fixture
+def randwx():
+    return rand_wx("1995-01-13T00", "1995-01-13T01")
+@pytest.fixture
+def large_ds():
+    return create_large_dataset().chunk({"time": 25})
+@pytest.fixture
+def air_dataset_small():
+    ds = xr.tutorial.open_dataset("air_temperature").chunk({"time": 240})
+    return ds.isel(time=slice(0, 12), lat=slice(0, 11), lon=slice(0, 10))
+@pytest.fixture
+def air_dataset_large():
+    return xr.tutorial.open_dataset("air_temperature").chunk({"time": 240})
+@pytest.fixture
+def rasm_ds():
+    """rasm uses cftime.DatetimeNoLeap (noleap / 365_day) for time."""
+    return xr.tutorial.open_dataset("rasm")
+@pytest.fixture
+def weather_dataset():
+    ds = rand_wx("2023-01-01T00", "2023-01-01T12")
+    return ds.isel(time=slice(0, 6), lat=slice(0, 10), lon=slice(0, 10)).chunk(
+        {"time": 3}
+    )
+@pytest.fixture
+def synthetic_dataset():
+    return create_large_dataset(
+        time_steps=50, lat_points=20, lon_points=20
+    ).chunk({"time": 25})
+@pytest.fixture
+def station_dataset():
+    return xr.Dataset(
+        {
+            "station_id": (["station"], [1, 2, 3, 4, 5]),
+            "elevation": (["station"], [100, 250, 500, 750, 1000]),
+            "name": (
+                ["station"],
+                [
+                    "Station_A",
+                    "Station_B",
+                    "Station_C",
+                    "Station_D",
+                    "Station_E",
+                ],
+            ),
+        }
+    ).chunk({"station": 5})
+@pytest.fixture
+def air_and_stations():
+    air = (
+        xr.tutorial.open_dataset("air_temperature")
+        .isel(time=slice(0, 12), lat=slice(0, 5), lon=slice(0, 8))
+        .chunk({"time": 6})
+    )
+    stations = xr.Dataset(
+        {
+            "station_id": (["station"], [101, 102, 103]),
+            "lat": (
+                ["station"],
+                [air.lat.values[0], air.lat.values[2], air.lat.values[4]],
+            ),
+            "lon": (
+                ["station"],
+                [air.lon.values[1], air.lon.values[3], air.lon.values[5]],
+            ),
+            "elevation": (["station"], [100, 250, 500]),
+        }
+    ).chunk({"station": 3})
+    return air, stations

xarray_sql 0.2.2__tar.gz → 0.3.0__tar.gz

xarray_sql 0.2.2.tar.gz → 0.3.0.tar.gz