climdata 0.0.5.tar.gz → 0.0.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of climdata might be problematic.
- {climdata-0.0.5 → climdata-0.0.7}/.gitignore +3 -1
- {climdata-0.0.5 → climdata-0.0.7}/PKG-INFO +2 -1
- {climdata-0.0.5 → climdata-0.0.7}/climdata/__init__.py +4 -1
- {climdata-0.0.5 → climdata-0.0.7}/climdata/conf/config.yaml +3 -2
- climdata-0.0.7/climdata/datasets/CMIPCloud.py +120 -0
- climdata-0.0.5/climdata/datasets/CMIP.py → climdata-0.0.7/climdata/datasets/CMIPlocal.py +1 -1
- climdata-0.0.7/climdata/datasets/ERA5.py +322 -0
- {climdata-0.0.5 → climdata-0.0.7}/climdata/datasets/MSWX.py +2 -0
- {climdata-0.0.5 → climdata-0.0.7}/climdata.egg-info/PKG-INFO +2 -1
- {climdata-0.0.5 → climdata-0.0.7}/climdata.egg-info/SOURCES.txt +4 -1
- {climdata-0.0.5 → climdata-0.0.7}/climdata.egg-info/requires.txt +1 -0
- climdata-0.0.7/examples/extract_dwd_loc.ipynb +549 -0
- climdata-0.0.7/examples/zarr_tas_data/metadata.json +1 -0
- {climdata-0.0.5 → climdata-0.0.7}/pyproject.toml +2 -2
- {climdata-0.0.5 → climdata-0.0.7}/requirements.txt +1 -0
- climdata-0.0.5/examples/extract_dwd_loc.ipynb +0 -1097
- {climdata-0.0.5 → climdata-0.0.7}/.editorconfig +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/.github/workflows/docs-build.yml +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/.github/workflows/docs.yml +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/.github/workflows/installation.yml +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/.github/workflows/macos.yml +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/.github/workflows/pypi.yml +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/.github/workflows/ubuntu.yml +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/.github/workflows/windows.yml +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/LICENSE +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/MANIFEST.in +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/README.md +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/climdata/__main__.py +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/climdata/conf/mappings/parameters.yaml +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/climdata/conf/mappings/variables.yaml +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/climdata/datasets/DWD.py +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/climdata/main.py +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/climdata/utils/__init__.py +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/climdata/utils/config.py +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/climdata/utils/utils_download.py +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/climdata.egg-info/dependency_links.txt +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/climdata.egg-info/entry_points.txt +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/climdata.egg-info/top_level.txt +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/docs/changelog.md +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/docs/climdata.md +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/docs/common.md +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/docs/contributing.md +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/docs/faq.md +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/docs/index.md +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/docs/installation.md +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/docs/overrides/main.html +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/docs/usage.md +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/dwd_tas_LAT52.507_LON14.1372_1989-01-01_2020-12-31.csv +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/mkdocs.yml +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/requirements_dev.txt +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/setup.cfg +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/tests/__init__.py +0 -0
- {climdata-0.0.5 → climdata-0.0.7}/tests/test_climdata.py +0 -0
{climdata-0.0.5 → climdata-0.0.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: climdata
-Version: 0.0.5
+Version: 0.0.7
 Summary: This project automates the fetching and extraction of weather data from multiple sources — such as MSWX, DWD HYRAS, ERA5-Land, NASA-NEX-GDDP, and more — for a given location and time range.
 Author-email: Kaushik Muduchuru <kaushik.reddy.m@gmail.com>
 License: MIT License
@@ -37,6 +37,7 @@ Requires-Dist: zarr
 Requires-Dist: ipyleaflet
 Requires-Dist: wetterdienst
 Requires-Dist: pint-pandas
+Requires-Dist: cdsapi
 Requires-Dist: hydra-core
 Requires-Dist: intake
 Requires-Dist: intake-esm
{climdata-0.0.5 → climdata-0.0.7}/climdata/__init__.py

@@ -2,10 +2,13 @@
 
 __author__ = """Kaushik Muduchuru"""
 __email__ = "kaushik.reddy.m@gmail.com"
-__version__ = "0.0.5"
+__version__ = "0.0.7"
 
 from .utils.utils_download import * # etc.
 from .utils.config import load_config
 from .datasets.DWD import DWDmirror as DWD
 from .datasets.MSWX import MSWXmirror as MSWX
+from .datasets.ERA5 import ERA5Mirror as ERA5
+from .datasets.CMIPlocal import CMIPmirror as CMIPlocal
+from .datasets.CMIPCloud import CMIPCloud as CMIP
 
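With these re-exports, every mirror is reachable from the package root. A minimal sketch of the resulting top-level API (constructor arguments follow the class definitions shown later in this diff; the concrete values are illustrative, not package defaults):

import climdata

# Aliases newly exported in 0.0.7
era5 = climdata.ERA5(base_path="./zarr_tas_data")  # ERA5Mirror
cmip = climdata.CMIP(                              # CMIPCloud
    experiment_id="historical",  # illustrative value
    source_id="MPI-ESM1-2-HR",   # illustrative value
    table_id="day",
    variables=["tas"],
)
# climdata.CMIPlocal (CMIPmirror) is also exported; its constructor is not shown in this diff.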
{climdata-0.0.5 → climdata-0.0.7}/climdata/conf/config.yaml

@@ -4,7 +4,7 @@ defaults:
   - mappings/parameters
   - mappings/variables
 dataset: dwd
-data_dir:
+data_dir: ./data
 weather:
   parameter: tas # standardized variable name (e.g., tas, pr, rsds)
 
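climdata ships this file as a Hydra config tree (hydra-core is a declared dependency), so one way to materialize it is Hydra's compose API. A sketch, assuming the relative config_path resolves to the packaged climdata/conf directory:

from hydra import compose, initialize

# config_path is resolved relative to the calling module; adjust as needed.
with initialize(version_base=None, config_path="climdata/conf"):
    cfg = compose(config_name="config")

print(cfg.data_dir)           # "./data" as of 0.0.7 (was empty in 0.0.5)
print(cfg.weather.parameter)  # "tas"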
@@ -33,5 +33,6 @@ time_range:
 
 output:
   out_dir: "./climdata/data/"
-
+  filename_csv: "{provider}_{parameter}_LAT_{lat}_LON_{lon}_{start}_{end}.csv"
+  filename_zarr: "{provider}_{parameter}_LAT{lat_range}_LON{lon_range}_{start}_{end}.zarr"
 fmt: 'standard' # 'standard', 'ICASA', 'simplace', 'monica'
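The brace placeholders suggest these are plain str.format templates. A quick sketch of how filename_csv would expand, assuming that is how the package fills them (the values mirror the sample CSV at the repository root):

tmpl = "{provider}_{parameter}_LAT_{lat}_LON_{lon}_{start}_{end}.csv"
print(tmpl.format(provider="dwd", parameter="tas", lat=52.507, lon=14.1372,
                  start="1989-01-01", end="2020-12-31"))
# dwd_tas_LAT_52.507_LON_14.1372_1989-01-01_2020-12-31.csv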
climdata-0.0.7/climdata/datasets/CMIPCloud.py (new file)

@@ -0,0 +1,120 @@
+import intake
+import xarray as xr
+import pandas as pd
+
+class CMIPCloud:
+    def __init__(self, experiment_id, source_id, table_id, variables, region_bounds=None):
+        self.experiment_id = experiment_id
+        self.source_id = source_id
+        self.table_id = table_id
+        self.variables = variables
+        self.region_bounds = region_bounds
+        self.col_subsets = []
+        self.ds = None
+
+    def fetch(self):
+        """Collect intake catalog subsets for each variable."""
+        col = intake.open_esm_datastore("https://storage.googleapis.com/cmip6/pangeo-cmip6.json")
+        self.col_subsets = []
+        for var in self.variables:
+            query = dict(
+                experiment_id=[self.experiment_id],
+                source_id=self.source_id,
+                table_id=self.table_id,
+                variable_id=var,
+            )
+            col_subset = col.search(require_all_on=["source_id"], **query)
+            if len(col_subset.df) == 0:
+                continue
+            self.col_subsets.append(col_subset)
+        return self.col_subsets
+
+    def load(self):
+        """Load and merge datasets from collected col_subsets."""
+        datasets = []
+        for col_subset in self.col_subsets:
+            zstore_path = col_subset.df.zstore.values[0].replace('gs:/', "https://storage.googleapis.com")
+            ds_var = xr.open_zarr(zstore_path)
+            datasets.append(ds_var)
+        if datasets:
+            self.ds = xr.merge(datasets)
+        else:
+            self.ds = None
+        return self.ds
+
+    def extract(self, *, point=None, box=None, shapefile=None, buffer_km=0.0):
+        """
+        Extract a subset of the dataset by point, bounding box (dict), or shapefile.
+        """
+        import geopandas as gpd
+        from shapely.geometry import mapping
+
+        if self.ds is None:
+            raise ValueError("No dataset loaded. Call `load()` first.")
+
+        ds = self.ds
+
+        if point is not None:
+            lon, lat = point
+            if buffer_km > 0:
+                buffer_deg = buffer_km / 111
+                ds_subset = ds.sel(
+                    lon=slice(lon - buffer_deg, lon + buffer_deg),
+                    lat=slice(lat - buffer_deg, lat + buffer_deg)
+                )
+            else:
+                ds_subset = ds.sel(lon=lon, lat=lat, method="nearest")
+
+        elif box is not None:
+            # Accept dict: {'lat_min': ..., 'lat_max': ..., 'lon_min': ..., 'lon_max': ...}
+            ds_subset = ds.sel(
+                lon=slice(box['lon_min'], box['lon_max']),
+                lat=slice(box['lat_min'], box['lat_max'])
+            )
+
+        elif shapefile is not None:
+            if isinstance(shapefile, str):
+                gdf = gpd.read_file(shapefile)
+            else:
+                gdf = shapefile
+            if buffer_km > 0:
+                gdf = gdf.to_crs(epsg=3857)
+                gdf["geometry"] = gdf.buffer(buffer_km * 1000)
+                gdf = gdf.to_crs(epsg=4326)
+            geom = [mapping(g) for g in gdf.geometry]
+            import rioxarray
+            ds = ds.rio.write_crs("EPSG:4326", inplace=False)
+            ds_subset = ds.rio.clip(geom, gdf.crs, drop=True)
+
+        else:
+            raise ValueError("Must provide either point, box, or shapefile.")
+        self.ds = ds_subset
+        return ds_subset
+    def _subset_time(self, start_date, end_date):
+        """
+        Subset the dataset by time range.
+        Dates should be strings in 'YYYY-MM-DD' format.
+        """
+        if self.ds is None:
+            return None
+        ds_time = self.ds.sel(time=slice(start_date, end_date))
+        self.ds = ds_time
+        return ds_time
+
+    def save_netcdf(self, filename):
+        if self.ds is not None:
+            if "time" in self.ds.variables:
+                self.ds["time"].encoding.clear()
+            self.ds.to_netcdf(filename)
+            print(f"Saved NetCDF to {filename}")
+
+    def save_zarr(self, store_path):
+        if self.ds is not None:
+            self.ds.to_zarr(store_path, mode="w")
+            print(f"Saved Zarr to {store_path}")
+
+    def save_csv(self, filename):
+        if self.ds is not None:
+            df = self.ds.to_dataframe().reset_index()
+            df.to_csv(filename, index=False)
+            print(f"Saved CSV to {filename}")
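Read top to bottom, CMIPCloud implies a fetch → load → extract → save call order, with _subset_time as an optional (private) time filter. A usage sketch under that assumption; the scenario, model, and variable values are illustrative, not package defaults:

from climdata import CMIP  # 0.0.7 re-exports CMIPCloud under this alias

cmip = CMIP(
    experiment_id="historical",  # illustrative
    source_id="MPI-ESM1-2-HR",   # illustrative
    table_id="day",
    variables=["tas"],
)
cmip.fetch()                                   # query the Pangeo CMIP6 catalog
cmip.load()                                    # open and merge the Zarr stores
cmip.extract(point=(14.1372, 52.507))          # nearest grid cell; point is (lon, lat)
cmip._subset_time("2000-01-01", "2000-12-31")  # note: a private helper in this diff
cmip.save_csv("cmip_tas.csv")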
climdata-0.0.7/climdata/datasets/ERA5.py (new file)

@@ -0,0 +1,322 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+import cdsapi
+import xarray as xr
+import datetime
+import json
+import dask
+import calendar
+from dask.diagnostics import ProgressBar
+from typing import List, Tuple, Dict, Union
+import urllib3
+import logging
+import numpy as np
+import fsspec
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+
+class ERA5Mirror:
+    """
+    A class to manage downloading ERA5 datasets. The datasets are downloaded from the Copernicus Climate Data Store (CDS) and stored in Zarr format.
+
+    Attributes
+    ----------
+    base_path : Path
+        The path to the Zarr dataset.
+    fs : fsspec.AbstractFileSystem
+        The filesystem to use for the Zarr dataset. If None, the local filesystem will be used.
+    """
+
+    def __init__(self, base_path: str, fs: fsspec.AbstractFileSystem = None):
+        # Get parameters
+        self.base_path = base_path
+        if fs is None:
+            fs = fsspec.filesystem("file")
+        self.fs = fs
+
+        # Create the base path if it doesn't exist
+        if not self.fs.exists(self.base_path):
+            self.fs.makedirs(self.base_path)
+
+        # Create metadata that will be used to track which chunks have been downloaded
+        self.metadata_file = os.path.join(self.base_path, "metadata.json")
+        self.metadata = self.get_metadata()
+
+    def get_metadata(self):
+        """Get metadata"""
+        if self.fs.exists(self.metadata_file):
+            with self.fs.open(self.metadata_file, "r") as f:
+                try:
+                    metadata = json.load(f)
+                except json.decoder.JSONDecodeError:
+                    metadata = {"chunks": []}
+        else:
+            metadata = {"chunks": []}
+        return metadata
+
+    def save_metadata(self):
+        """Save metadata"""
+        with self.fs.open(self.metadata_file, "w") as f:
+            json.dump(self.metadata, f)
+
+    def chunk_exists(self, variable, year, month, pressure_level):
+        """Check if chunk exists"""
+        for chunk in self.metadata["chunks"]:
+            if (
+                chunk["variable"] == variable
+                and chunk["year"] == year
+                and chunk["month"] == month
+                and chunk["pressure_level"] == pressure_level
+            ):
+                return True
+        return False
+
+    def download_chunk(
+        self,
+        variable: str,
+        year: int,
+        month: int,
+        pressure_level: int = None,
+    ):
+        """
+        Download ERA5 data for the specified variable, date range, and pressure levels.
+
+        Parameters
+        ----------
+        variable : str
+            The ERA5 variable to download, e.g. 'tisr' for solar radiation or 'z' for geopotential.
+        year : int
+            The year to download.
+        month : int
+            The month to download.
+        pressure_level : int, optional
+            A pressure level to include in the download, by default None. If None, the single-level data will be downloaded.
+
+        Returns
+        -------
+        xr.Dataset
+            An xarray Dataset containing the downloaded data.
+        """
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Get all days in the month
+            days_in_month = calendar.monthrange(year, month)[1]
+
+            # Make tmpfile to store the data
+            output_file = os.path.join(
+                tmpdir,
+                f"{variable}_{year}_{month:02d}_{str(pressure_level)}.nc",
+            )
+
+            # start the CDS API client (maybe need to move this outside the loop?)
+            c = cdsapi.Client(quiet=True)
+
+            # Setup the request parameters
+            request_params = {
+                "product_type": "reanalysis",
+                "variable": [variable],
+                "year": str(year),
+                "month": str(month),
+                "day": [f"{day:02d}" for day in range(1, days_in_month + 1)],
+                "time_zone": "utc+00:00",
+                "frequency": "6_hourly",
+                "daily_statistic": "daily_mean",
+                "data_format": "netcdf"
+            }
+            if pressure_level:
+                request_params["pressure_level"] = [str(pressure_level)]
+                dataset_name = "derived-era5-pressure-levels-daily-statistics"
+            else:
+                dataset_name = "derived-era5-single-levels-daily-statistics"
+
+            # Download the data
+            c.retrieve(
+                dataset_name,
+                request_params,
+                output_file,
+            )
+
+            # Open the downloaded data
+            ds = xr.open_dataset(output_file)
+            return ds
+
+    def variable_to_zarr_name(self, variable: str, pressure_level: int = None):
+        """convert variable to zarr name"""
+        # create zarr path for variable
+        zarr_path = f"{self.base_path}/{variable}"
+        if pressure_level:
+            zarr_path += f"_pressure_level_{pressure_level}"
+        zarr_path += ".zarr"
+        return zarr_path
+
+    def download_and_upload_chunk(
+        self,
+        variable: str,
+        year: int,
+        month: int,
+        pressure_level: int = None,
+    ):
+        """
+        Downloads a chunk of ERA5 data for a specific variable and date range, and uploads it to a Zarr array.
+        This downloads a 1-month chunk of data.
+
+        Parameters
+        ----------
+        variable : str
+            The variable to download.
+        year : int
+            The year to download.
+        month : int
+            The month to download.
+        pressure_level : int, optional
+            Pressure levels to download, if applicable.
+        """
+
+        # Download the data
+        ds = self.download_chunk(variable, year, month, pressure_level)
+        if "valid_time" in ds.dims:
+            ds = ds.rename({"valid_time": "time"})
+
+        # Create the Zarr path
+        zarr_path = self.variable_to_zarr_name(variable, pressure_level)
+
+        # Specify the chunking options
+        chunking = {"time": 1, "latitude": 721, "longitude": 1440}
+        if "level" in ds.dims:
+            chunking["level"] = 1
+
+        # Re-chunk the dataset
+        ds = ds.chunk(chunking)
+
+        # Check if the Zarr dataset exists
+        if self.fs.exists(zarr_path):
+            mode = "a"
+            append_dim = "time"
+            create = False
+        else:
+            mode = "w"
+            append_dim = None
+            create = True
+
+        # Upload the data to the Zarr dataset
+        mapper = self.fs.get_mapper(zarr_path, create=create)
+        ds.to_zarr(mapper, mode=mode, consolidated=True, append_dim=append_dim)
+
+        # Update the metadata
+        self.metadata["chunks"].append(
+            {
+                "variable": variable,
+                "year": year,
+                "month": month,
+                "pressure_level": pressure_level,
+            }
+        )
+        self.save_metadata()
+
+    def download(
+        self,
+        variables: List[Union[str, Tuple[str, int]]],
+        date_range: Tuple[datetime.date, datetime.date],
+    ):
+        """
+        Start the process of mirroring the specified ERA5 variables for the given date range.
+
+        Parameters
+        ----------
+        variables : List[Union[str, Tuple[str, List[int]]]]
+            A list of variables to mirror, where each element can either be a string (single-level variable)
+            or a tuple (variable with pressure level).
+        date_range : Tuple[datetime.date, datetime.date]
+            A tuple containing the start and end dates for the data to be mirrored. This will download and store every month in the range.
+
+        Returns
+        -------
+        zarr_paths : List[str]
+            A list of Zarr paths for each of the variables.
+        """
+
+        start_date, end_date = date_range
+
+        # Reformat the variables list so all elements are tuples
+        reformated_variables = []
+        for variable in variables:
+            if isinstance(variable, str):
+                reformated_variables.append(tuple([variable, None]))
+            else:
+                reformated_variables.append(variable)
+
+        # Start Downloading
+        with ProgressBar():
+            # Round dates to months
+            current_date = start_date.replace(day=1)
+            end_date = end_date.replace(day=1)
+
+            while current_date <= end_date:
+                # Create a list of tasks to download the data
+                tasks = []
+                for variable, pressure_level in reformated_variables:
+                    if not self.chunk_exists(
+                        variable,
+                        current_date.year,
+                        current_date.month,
+                        pressure_level,
+                    ):
+                        task = dask.delayed(self.download_and_upload_chunk)(
+                            variable,
+                            current_date.year,
+                            current_date.month,
+                            pressure_level,
+                        )
+                        tasks.append(task)
+                    else:
+                        print(
+                            f"Chunk for {variable} {pressure_level} {current_date.year}-{current_date.month} already exists. Skipping."
+                        )
+
+                # Execute the tasks with Dask
+                print(f"Downloading data for {current_date.year}-{current_date.month}")
+                if tasks:
+                    dask.compute(*tasks)
+
+                # Update the metadata
+                self.save_metadata()
+
+                # Update the current date
+                days_in_month = calendar.monthrange(
+                    year=current_date.year, month=current_date.month
+                )[1]
+                current_date += datetime.timedelta(days=days_in_month)
+
+        # Return the Zarr paths
+        zarr_paths = []
+        for variable, pressure_level in reformated_variables:
+            zarr_path = self.variable_to_zarr_name(variable, pressure_level)
+            zarr_paths.append(zarr_path)
+
+        # Check that Zarr arrays have correct dt for time dimension
+        for zarr_path in zarr_paths:
+            ds = xr.open_zarr(zarr_path)
+            time_stamps = ds.time.values
+            dt = time_stamps[1:] - time_stamps[:-1]
+            assert np.all(dt == dt[0]), (
+                f"Zarr array {zarr_path} has incorrect dt for time dimension. An error may have occurred during download. Please delete the Zarr array and try again."
+            )
+
+        return zarr_paths
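ERA5Mirror.download() is the public entry point; the chunk-level methods support it. A sketch of mirroring one single-level variable to a local Zarr store — the variable name and dates are illustrative, and valid CDS credentials in ~/.cdsapirc are assumed, since cdsapi.Client() reads them from there:

import datetime
from climdata import ERA5  # 0.0.7 re-exports ERA5Mirror under this alias

mirror = ERA5(base_path="./zarr_tas_data")
zarr_paths = mirror.download(
    variables=["2m_temperature"],  # use (name, pressure_level) tuples for pressure-level data
    date_range=(datetime.date(2020, 1, 1), datetime.date(2020, 3, 31)),
)
# zarr_paths -> ["./zarr_tas_data/2m_temperature.zarr"]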
{climdata-0.0.5 → climdata-0.0.7}/climdata/datasets/MSWX.py

@@ -23,6 +23,8 @@ from google.oauth2 import service_account
 from googleapiclient.discovery import build
 from googleapiclient.http import MediaIoBaseDownload
 
+from climdata.utils.utils_download import list_drive_files, download_drive_file
+
 import io
 import requests
 from scipy.spatial import cKDTree
{climdata-0.0.5 → climdata-0.0.7}/climdata.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: climdata
-Version: 0.0.5
+Version: 0.0.7
 Summary: This project automates the fetching and extraction of weather data from multiple sources — such as MSWX, DWD HYRAS, ERA5-Land, NASA-NEX-GDDP, and more — for a given location and time range.
 Author-email: Kaushik Muduchuru <kaushik.reddy.m@gmail.com>
 License: MIT License
@@ -37,6 +37,7 @@ Requires-Dist: zarr
 Requires-Dist: ipyleaflet
 Requires-Dist: wetterdienst
 Requires-Dist: pint-pandas
+Requires-Dist: cdsapi
 Requires-Dist: hydra-core
 Requires-Dist: intake
 Requires-Dist: intake-esm
{climdata-0.0.5 → climdata-0.0.7}/climdata.egg-info/SOURCES.txt

@@ -30,8 +30,10 @@ climdata.egg-info/top_level.txt
 climdata/conf/config.yaml
 climdata/conf/mappings/parameters.yaml
 climdata/conf/mappings/variables.yaml
-climdata/datasets/CMIP.py
+climdata/datasets/CMIPCloud.py
+climdata/datasets/CMIPlocal.py
 climdata/datasets/DWD.py
+climdata/datasets/ERA5.py
 climdata/datasets/MSWX.py
 climdata/utils/__init__.py
 climdata/utils/config.py
@@ -46,5 +48,6 @@ docs/installation.md
 docs/usage.md
 docs/overrides/main.html
 examples/extract_dwd_loc.ipynb
+examples/zarr_tas_data/metadata.json
 tests/__init__.py
 tests/test_climdata.py