lcd-v2-data 1.1.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lcd_v2_data-1.1.13/LICENSE +13 -0
- lcd_v2_data-1.1.13/MANIFEST.in +1 -0
- lcd_v2_data-1.1.13/PKG-INFO +210 -0
- lcd_v2_data-1.1.13/README.md +184 -0
- lcd_v2_data-1.1.13/pyproject.toml +94 -0
- lcd_v2_data-1.1.13/setup.cfg +4 -0
- lcd_v2_data-1.1.13/src/lcd_data/__init__.py +16 -0
- lcd_v2_data-1.1.13/src/lcd_data/build_lcd_dataset.py +389 -0
- lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/README.txt +31 -0
- lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.cpg +1 -0
- lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.dbf +0 -0
- lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.prj +1 -0
- lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.shp +0 -0
- lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.shp.ea.iso.xml +621 -0
- lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.shp.iso.xml +834 -0
- lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.shx +0 -0
- lcd_v2_data-1.1.13/src/lcd_data/data/EIA/RTO_ISO_regions.README +23 -0
- lcd_v2_data-1.1.13/src/lcd_data/data/EIA/RTO_ISO_regions.geojson +117 -0
- lcd_v2_data-1.1.13/src/lcd_data/ncei.py +438 -0
- lcd_v2_data-1.1.13/src/lcd_data/region_codes.py +316 -0
- lcd_v2_data-1.1.13/src/lcd_data/rto_iso.py +107 -0
- lcd_v2_data-1.1.13/src/lcd_data/saturation.py +91 -0
- lcd_v2_data-1.1.13/src/lcd_data/stations.py +2000 -0
- lcd_v2_data-1.1.13/src/lcd_data/test.py +5 -0
- lcd_v2_data-1.1.13/src/lcd_v2_data.egg-info/PKG-INFO +210 -0
- lcd_v2_data-1.1.13/src/lcd_v2_data.egg-info/SOURCES.txt +29 -0
- lcd_v2_data-1.1.13/src/lcd_v2_data.egg-info/dependency_links.txt +1 -0
- lcd_v2_data-1.1.13/src/lcd_v2_data.egg-info/entry_points.txt +2 -0
- lcd_v2_data-1.1.13/src/lcd_v2_data.egg-info/requires.txt +16 -0
- lcd_v2_data-1.1.13/src/lcd_v2_data.egg-info/top_level.txt +1 -0
- lcd_v2_data-1.1.13/tests/test_test.py +5 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
BSD 3-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025, Jan Kazil
|
|
4
|
+
|
|
5
|
+
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
|
6
|
+
|
|
7
|
+
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
|
8
|
+
|
|
9
|
+
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
|
|
10
|
+
|
|
11
|
+
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
|
|
12
|
+
|
|
13
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
recursive-include src/lcd_data/data *
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lcd-v2-data
|
|
3
|
+
Version: 1.1.13
|
|
4
|
+
Summary: Python toolkit for downloading and processing Local Climatological Data version 2 (LCDv2) data.
|
|
5
|
+
Author: Jan Kazil
|
|
6
|
+
License-Expression: BSD-3-Clause
|
|
7
|
+
Requires-Python: <=3.12.9,>=3.11
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: geopandas
|
|
11
|
+
Requires-Dist: matplotlib
|
|
12
|
+
Requires-Dist: numpy
|
|
13
|
+
Requires-Dist: netcdf4
|
|
14
|
+
Requires-Dist: pandas
|
|
15
|
+
Requires-Dist: requests
|
|
16
|
+
Requires-Dist: shapely
|
|
17
|
+
Requires-Dist: timezonefinder
|
|
18
|
+
Requires-Dist: xarray
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
21
|
+
Requires-Dist: pytest-cov>=5; extra == "dev"
|
|
22
|
+
Requires-Dist: mypy>=1.11; extra == "dev"
|
|
23
|
+
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
24
|
+
Requires-Dist: pre-commit>=3.7; extra == "dev"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# lcd-v2-data
|
|
28
|
+
|
|
29
|
+
**lcd-v2-data** is a Python toolkit for downloading and processing [Local Climatological Data version 2 (LCDv2)](https://www.ncei.noaa.gov/products/land-based-station/local-climatological-data) data.
|
|
30
|
+
|
|
31
|
+
It provides:
|
|
32
|
+
|
|
33
|
+
- A top-level command-line tool that
|
|
34
|
+
|
|
35
|
+
- automates the download of LCD v2 station observations for
|
|
36
|
+
- individual stations
|
|
37
|
+
- U.S. states and territories
|
|
38
|
+
- Regional Transmission Organization (RTO) / Independent System Operator (ISO) regions
|
|
39
|
+
|
|
40
|
+
- constructs full-hourly UTC time series of
|
|
41
|
+
- temperature at 2 m
|
|
42
|
+
- dew point temperature at 2 m
|
|
43
|
+
- relative humidity at 2 m
|
|
44
|
+
- wind speed at 10 m
|
|
45
|
+
|
|
46
|
+
from the irregularly spaced, local-time LCD v2 station observation time series, for a selected station or for stations in the selected U.S. state/territory or RTO/ISO region, and a user-specified time range. The time series are saved in a NetCDF file.
|
|
47
|
+
|
|
48
|
+
- Modules for downloading and processing LCD v2 station observations.
|
|
49
|
+
|
|
50
|
+
LCD v2 is provided by the [National Centers for Environmental Information (NCEI)](https://www.ncei.noaa.gov/).
|
|
51
|
+
|
|
52
|
+
## Installation
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
mamba install -c jan.kazil -c conda-forge lcd-v2-data
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Overview
|
|
59
|
+
|
|
60
|
+
The package provides a command-line tool that selects stations by geography (a single station by GHCNh identifier, a U.S. state or territory, RTO/ISO regions, and the special region CONUS representing the contiguous U.S.), checks data availability, downloads LCD v2 observation files for a given year range, constructs full-hourly UTC time series for the observables, and saves them in a NetCDF file. It optionally generates plots showing the original and the interpolated time series.
|
|
61
|
+
|
|
62
|
+
Geospatial region selection is based on U.S. Energy Information Administration definitions of RTO/ISO footprints, and U.S. Census Bureau state/territory boundaries, included with the package.
|
|
63
|
+
|
|
64
|
+
The list of GHCNh station identifiers is available [here](https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/doc/ghcnh-station-list.txt). LCD v2 contains only U.S. stations.
|
|
65
|
+
|
|
66
|
+
## Workflow
|
|
67
|
+
|
|
68
|
+
The following describes the internal workflow performed by the command-line tool:
|
|
69
|
+
|
|
70
|
+
1. Load the region geometry (RTO/ISO polygons or U.S. state/territory boundaries) if a region is specified; skip this step if a station ID is provided.
|
|
71
|
+
2. Retrieve the station list from NCEI and either filter it spatially by region or select the specified station.
|
|
72
|
+
3. Filter the stations by data availability for the requested year range, either online by probing NCEI or offline by checking local files.
|
|
73
|
+
4. Save the filtered station list for reference.
|
|
74
|
+
5. Download LCD v2 observation files from NCEI for the selected stations and years, skipping files already present that match by ETag.
|
|
75
|
+
6. Create full-hourly UTC time series for temperature (T), dew point temperature (Td), relative humidity (RH), and wind speed by converting local observation time to UTC and interpolating the data to full hours. Remove temperatures above 60 °C. Perform interpolation only across gaps of up to 2 hours. Derive RH from T and Td.
|
|
76
|
+
7. Optionally create comparison plots for the original and interpolated series.
|
|
77
|
+
8. Save the full-hourly UTC time series in a NetCDF file, for the given station or the stations in the state/region.
|
|
78
|
+
|
|
79
|
+
**Notes:** Interpolation of station observation time series across many years and/or many stations can be slow due to inherent limitations of Python. Creating plots is very slow and recommended only for individual stations (as opposed to regions).
|
|
80
|
+
|
|
81
|
+
## Command-line interface (CLI)
|
|
82
|
+
|
|
83
|
+
The CLI is exposed as `build-lcd-dataset` when installed.
|
|
84
|
+
|
|
85
|
+
**Usage:**
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
build-lcd-dataset START_YEAR END_YEAR REGION DATA_DIR [-n N_JOBS] [-o] [-p PLOT_DIR] [-r] [-v]
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
**Positional arguments**
|
|
92
|
+
|
|
93
|
+
- `START_YEAR` and `END_YEAR`: Inclusive range of years
|
|
94
|
+
- `REGION`: Region or station selector. Use a two-letter U.S. state or territory code, `'CONUS'`, one of the RTO/ISO codes, or a GHCNh station identifier
|
|
95
|
+
- `DATA_DIR`: Directory into which data will be downloaded
|
|
96
|
+
|
|
97
|
+
**Options**
|
|
98
|
+
|
|
99
|
+
- `-n, --n N_JOBS`: Number of parallel download processes. Values greater than 1 accelerate downloads but may increase the risk of network errors.
|
|
100
|
+
- `-o, --offline`: Work offline. All required files must have been downloaded to `DATA_DIR` in a previous call without this flag.
|
|
101
|
+
- `-p, --plotdir PLOT_DIR`: Directory where plots of the original and interpolated full-hourly time series will be created. Very slow. If omitted, no plots are generated.
|
|
102
|
+
- `-r, --refresh`: Download and process files even if they already exist in `DATA_DIR`.
|
|
103
|
+
- `-v, --verbose`: Print progress information.
|
|
104
|
+
|
|
105
|
+
**Examples:**
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
# Show usage information, valid region codes, and RTO/ISO region names:
|
|
109
|
+
build-lcd-dataset --help
|
|
110
|
+
|
|
111
|
+
# Download LCDv2 data and build a dataset as a NetCDF file for station USW00003017 for the years 2020–2025,
|
|
112
|
+
# in the directory /path/to/data, and create plots in /path/to/plots:
|
|
113
|
+
build-lcd-dataset -v 2020 2025 USW00003017 /path/to/data -p /path/to/plots
|
|
114
|
+
|
|
115
|
+
# Download LCDv2 data and build a dataset as a NetCDF file for the RTO region ERCOT for the year 2022
|
|
116
|
+
# in the directory /path/to/data, using 32 parallel download processes:
|
|
117
|
+
build-lcd-dataset -v 2022 2022 ERCOT /path/to/data -n 32
|
|
118
|
+
|
|
119
|
+
# Build a dataset as a NetCDF file for the state of Colorado for the year 2021, offline from data
|
|
120
|
+
# previously downloaded to /path/to/data:
|
|
121
|
+
build-lcd-dataset -v 2021 2021 CO /path/to/data --offline
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Sample results
|
|
125
|
+
|
|
126
|
+
Original and interpolated full-hourly UTC time series in November 2024, Twentynine Palms, CA:
|
|
127
|
+
|
|
128
|
+

|
|
129
|
+
|
|
130
|
+
## Public API
|
|
131
|
+
|
|
132
|
+
### Modules
|
|
133
|
+
|
|
134
|
+
#### `lcd_data.build_lcd_dataset`
|
|
135
|
+
|
|
136
|
+
Provides a programmatic API equivalent to the command-line interface for building LCD datasets from NOAA NCEI observations.
|
|
137
|
+
|
|
138
|
+
- `run_build(start_year, end_year, region_name, data_dir, plot_dir=None, n_jobs=1, offline=False, refresh=False, verbose=False)`:
|
|
139
|
+
|
|
140
|
+
Downloads, processes, and assembles NOAA NCEI Local Climatological Data (LCD) into a NetCDF file containing full-hourly UTC time series for a specified geographic region or individual station over an inclusive range of years.
|
|
141
|
+
Operates both online (with automatic downloads) and offline (using pre-downloaded files).
|
|
142
|
+
If `plot_dir` is provided, diagnostic plots of original and interpolated time series are generated.
|
|
143
|
+
Returns the `Path` to the generated NetCDF file.
|
|
144
|
+
|
|
145
|
+
#### `lcd_data.ncei`
|
|
146
|
+
Utilities for station metadata and LCD v2 downloads.
|
|
147
|
+
|
|
148
|
+
- `download_stations_meta_files(local_dir)`: Download GHCNh and LCD v2 station meta documents.
|
|
149
|
+
- `lcd_data_file_name(year, station_id)`: Construct LCD v2 observation file name.
|
|
150
|
+
- `lcd_data_file_paths(start_year, end_year, station_ids, local_dir)`: Build local paths for all expected files.
|
|
151
|
+
- `lcd_data_url(year, station_id)`: Build the absolute URL to an LCD v2 observation file.
|
|
152
|
+
- `lcd_data_urls(station_ids, start_year, end_year, n_jobs)`: Probe NCEI server to list existing files.
|
|
153
|
+
- `download_many(...)` and `download_threaded(...)`: Concurrent file downloads with optional refresh behavior.
|
|
154
|
+
- `download_file(url, local_dir, refresh=False, verbose=False)`: Robust download with ETag checking and retries.
|
|
155
|
+
|
|
156
|
+
#### `lcd_data.rto_iso`
|
|
157
|
+
Helpers to work with RTO/ISO region polygons.
|
|
158
|
+
|
|
159
|
+
- `REGION_NAMES`: `['CAISO', 'ERCOT', 'ISONE', 'NYISO', 'MISO', 'PJM', 'SPP']`.
|
|
160
|
+
- `regions(rto_iso_geojson)`: Read GeoJSON and return a GeoDataFrame with merged geometries for each region.
|
|
161
|
+
- `region(rto_iso_geojson, region_name)`: Return a GeoDataFrame for the requested region.
|
|
162
|
+
|
|
163
|
+
#### `lcd_data.saturation`
|
|
164
|
+
Saturation vapor pressure and relative humidity utilities.
|
|
165
|
+
|
|
166
|
+
- `esatw(T)`: Saturation vapor pressure over liquid water (hPa) using an 8th‑order polynomial fit.
|
|
167
|
+
- `rh(T, Td)`: Relative humidity (%) computed from temperature and dew point.
|
|
168
|
+
|
|
169
|
+
#### `lcd_data.region_codes`
|
|
170
|
+
Provides
|
|
171
|
+
|
|
172
|
+
- `lcd_data.region_codes.countries`: Three-letter ISO 3166-1 alpha-3 country codes
|
|
173
|
+
- `lcd_data.region_codes.us_states_territories`: Two-letter U.S. state or territory codes
|
|
174
|
+
- `lcd_data.region_codes.conus`: The special `CONUS` region code
|
|
175
|
+
- `lcd_data.region_codes.rto_iso_regions`: RTO/ISO region codes
|
|
176
|
+
|
|
177
|
+
#### `lcd_data.stations`
|
|
178
|
+
Station catalog handling, filtering, reading, interpolation, and writing.
|
|
179
|
+
|
|
180
|
+
- `Stations.from_url()` / `Stations.from_file(path)`: Build the station catalog from GHCNh-format metadata.
|
|
181
|
+
- Spatial selection by region geometry with `filter_by_region(region_gdf)` and by bounding box with `filter_by_coordinates(...)`.
|
|
182
|
+
- Availability filters: `filter_by_data_availability_online(start_time, end_time, n_jobs, verbose)` and `filter_by_data_availability_offline(data_dir, start_time, end_time, verbose)`.
|
|
183
|
+
- Station utilities: `filter_by_id(station_id)`, `ids()`, `save_station_list(path)`.
|
|
184
|
+
- `read_station_observations(...)`: Read and clean per‑station LCD v2 observation files; convert times to UTC, coerce numeric columns, correct Celsius-with-18.3° base fields, drop non-observational report types, limit unrealistic temperatures, and compute hourly RH.
|
|
185
|
+
- `construct_hourly(...)`: Build full-hourly UTC series for `T`, `Td`, `RH`, and `windspeed`, with optional plotting and gap-limited interpolation.
|
|
186
|
+
- `write_utc_hourly_netcdf(path)`: Save the hourly dataset to NetCDF with safe encodings.
|
|
187
|
+
|
|
188
|
+
## Development
|
|
189
|
+
|
|
190
|
+
### Code Quality and Testing Commands
|
|
191
|
+
|
|
192
|
+
- `make fmt` - Runs ruff format, which automatically reformats Python files according to the style rules in `pyproject.toml`
|
|
193
|
+
- `make lint` - Runs `ruff check --fix`, which lints the code (checks for style errors, bugs, outdated patterns, etc.) and auto-fixes what it can.
|
|
194
|
+
- `make check` - Runs fmt and lint.
|
|
195
|
+
- `make type` - Currently disabled. Runs mypy, the static type checker, using the strictness settings from `pyproject.toml`. Mypy is a static type checker for Python, a dynamically typed language. Because static analysis cannot account for all dynamic runtime behaviors, mypy may report false positives which do not reflect actual runtime issues.
|
|
196
|
+
- `make test` - Runs pytest with reporting (configured in `pyproject.toml`).
|
|
197
|
+
|
|
198
|
+
## Disclaimers
|
|
199
|
+
|
|
200
|
+
The LCD v2 data accessed by this software are publicly available from NOAA's National Centers for Environmental Information (NCEI) and are subject to their terms of use. This project is not affiliated with or endorsed by NOAA.
|
|
201
|
+
|
|
202
|
+
This software uses U.S. Census Bureau and U.S. Energy Information Administration data, but is neither endorsed nor certified by the U.S. Census Bureau or the U.S. Energy Information Administration.
|
|
203
|
+
|
|
204
|
+
## Author
|
|
205
|
+
|
|
206
|
+
Jan Kazil - jan.kazil.dev@gmail.com - [jankazil.com](https://jankazil.com)
|
|
207
|
+
|
|
208
|
+
## License
|
|
209
|
+
|
|
210
|
+
BSD-3-Clause
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# lcd-v2-data
|
|
2
|
+
|
|
3
|
+
**lcd-v2-data** is a Python toolkit for downloading and processing [Local Climatological Data version 2 (LCDv2)](https://www.ncei.noaa.gov/products/land-based-station/local-climatological-data) data.
|
|
4
|
+
|
|
5
|
+
It provides:
|
|
6
|
+
|
|
7
|
+
- A top-level command-line tool that
|
|
8
|
+
|
|
9
|
+
- automates the download of LCD v2 station observations for
|
|
10
|
+
- individual stations
|
|
11
|
+
- U.S. states and territories
|
|
12
|
+
- Regional Transmission Organization (RTO) / Independent System Operator (ISO) regions
|
|
13
|
+
|
|
14
|
+
- constructs full-hourly UTC time series of
|
|
15
|
+
- temperature at 2 m
|
|
16
|
+
- dew point temperature at 2 m
|
|
17
|
+
- relative humidity at 2 m
|
|
18
|
+
- wind speed at 10 m
|
|
19
|
+
|
|
20
|
+
from the irregularly spaced, local-time LCD v2 station observation time series, for a selected station or for stations in the selected U.S. state/territory or RTO/ISO region, and a user-specified time range. The time series are saved in a NetCDF file.
|
|
21
|
+
|
|
22
|
+
- Modules for downloading and processing LCD v2 station observations.
|
|
23
|
+
|
|
24
|
+
LCD v2 is provided by the [National Centers for Environmental Information (NCEI)](https://www.ncei.noaa.gov/).
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
mamba install -c jan.kazil -c conda-forge lcd-v2-data
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Overview
|
|
33
|
+
|
|
34
|
+
The package provides a command-line tool that selects stations by geography (a single station by GHCNh identifier, a U.S. state or territory, RTO/ISO regions, and the special region CONUS representing the contiguous U.S.), checks data availability, downloads LCD v2 observation files for a given year range, constructs full-hourly UTC time series for the observables, and saves them in a NetCDF file. It optionally generates plots showing the original and the interpolated time series.
|
|
35
|
+
|
|
36
|
+
Geospatial region selection is based on U.S. Energy Information Administration definitions of RTO/ISO footprints, and U.S. Census Bureau state/territory boundaries, included with the package.
|
|
37
|
+
|
|
38
|
+
The list of GHCNh station identifiers is available [here](https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/doc/ghcnh-station-list.txt). LCD v2 contains only U.S. stations.
|
|
39
|
+
|
|
40
|
+
## Workflow
|
|
41
|
+
|
|
42
|
+
The following describes the internal workflow performed by the command-line tool:
|
|
43
|
+
|
|
44
|
+
1. Load the region geometry (RTO/ISO polygons or U.S. state/territory boundaries) if a region is specified; skip this step if a station ID is provided.
|
|
45
|
+
2. Retrieve the station list from NCEI and either filter it spatially by region or select the specified station.
|
|
46
|
+
3. Filter the stations by data availability for the requested year range, either online by probing NCEI or offline by checking local files.
|
|
47
|
+
4. Save the filtered station list for reference.
|
|
48
|
+
5. Download LCD v2 observation files from NCEI for the selected stations and years, skipping files already present that match by ETag.
|
|
49
|
+
6. Create full-hourly UTC time series for temperature (T), dew point temperature (Td), relative humidity (RH), and wind speed by converting local observation time to UTC and interpolating the data to full hours. Remove temperatures above 60 °C. Perform interpolation only across gaps of up to 2 hours. Derive RH from T and Td.
|
|
50
|
+
7. Optionally create comparison plots for the original and interpolated series.
|
|
51
|
+
8. Save the full-hourly UTC time series in a NetCDF file, for the given station or the stations in the state/region.
|
|
52
|
+
|
|
53
|
+
**Notes:** Interpolation of station observation time series across many years and/or many stations can be slow due to inherent limitations of Python. Creating plots is very slow and recommended only for individual stations (as opposed to regions).
|
|
54
|
+
|
|
55
|
+
## Command-line interface (CLI)
|
|
56
|
+
|
|
57
|
+
The CLI is exposed as `build-lcd-dataset` when installed.
|
|
58
|
+
|
|
59
|
+
**Usage:**
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
build-lcd-dataset START_YEAR END_YEAR REGION DATA_DIR [-n N_JOBS] [-o] [-p PLOT_DIR] [-r] [-v]
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
**Positional arguments**
|
|
66
|
+
|
|
67
|
+
- `START_YEAR` and `END_YEAR`: Inclusive range of years
|
|
68
|
+
- `REGION`: Region or station selector. Use a two-letter U.S. state or territory code, `'CONUS'`, one of the RTO/ISO codes, or a GHCNh station identifier
|
|
69
|
+
- `DATA_DIR`: Directory into which data will be downloaded
|
|
70
|
+
|
|
71
|
+
**Options**
|
|
72
|
+
|
|
73
|
+
- `-n, --n N_JOBS`: Number of parallel download processes. Values greater than 1 accelerate downloads but may increase the risk of network errors.
|
|
74
|
+
- `-o, --offline`: Work offline. All required files must have been downloaded to `DATA_DIR` in a previous call without this flag.
|
|
75
|
+
- `-p, --plotdir PLOT_DIR`: Directory where plots of the original and interpolated full-hourly time series will be created. Very slow. If omitted, no plots are generated.
|
|
76
|
+
- `-r, --refresh`: Download and process files even if they already exist in `DATA_DIR`.
|
|
77
|
+
- `-v, --verbose`: Print progress information.
|
|
78
|
+
|
|
79
|
+
**Examples:**
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
# Show usage information, valid region codes, and RTO/ISO region names:
|
|
83
|
+
build-lcd-dataset --help
|
|
84
|
+
|
|
85
|
+
# Download LCDv2 data and build a dataset as a NetCDF file for station USW00003017 for the years 2020–2025,
|
|
86
|
+
# in the directory /path/to/data, and create plots in /path/to/plots:
|
|
87
|
+
build-lcd-dataset -v 2020 2025 USW00003017 /path/to/data -p /path/to/plots
|
|
88
|
+
|
|
89
|
+
# Download LCDv2 data and build a dataset as a NetCDF file for the RTO region ERCOT for the year 2022
|
|
90
|
+
# in the directory /path/to/data, using 32 parallel download processes:
|
|
91
|
+
build-lcd-dataset -v 2022 2022 ERCOT /path/to/data -n 32
|
|
92
|
+
|
|
93
|
+
# Build a dataset as a NetCDF file for the state of Colorado for the year 2021, offline from data
|
|
94
|
+
# previously downloaded to /path/to/data:
|
|
95
|
+
build-lcd-dataset -v 2021 2021 CO /path/to/data --offline
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Sample results
|
|
99
|
+
|
|
100
|
+
Original and interpolated full-hourly UTC time series in November 2024, Twentynine Palms, CA:
|
|
101
|
+
|
|
102
|
+

|
|
103
|
+
|
|
104
|
+
## Public API
|
|
105
|
+
|
|
106
|
+
### Modules
|
|
107
|
+
|
|
108
|
+
#### `lcd_data.build_lcd_dataset`
|
|
109
|
+
|
|
110
|
+
Provides a programmatic API equivalent to the command-line interface for building LCD datasets from NOAA NCEI observations.
|
|
111
|
+
|
|
112
|
+
- `run_build(start_year, end_year, region_name, data_dir, plot_dir=None, n_jobs=1, offline=False, refresh=False, verbose=False)`:
|
|
113
|
+
|
|
114
|
+
Downloads, processes, and assembles NOAA NCEI Local Climatological Data (LCD) into a NetCDF file containing full-hourly UTC time series for a specified geographic region or individual station over an inclusive range of years.
|
|
115
|
+
Operates both online (with automatic downloads) and offline (using pre-downloaded files).
|
|
116
|
+
If `plot_dir` is provided, diagnostic plots of original and interpolated time series are generated.
|
|
117
|
+
Returns the `Path` to the generated NetCDF file.
|
|
118
|
+
|
|
119
|
+
#### `lcd_data.ncei`
|
|
120
|
+
Utilities for station metadata and LCD v2 downloads.
|
|
121
|
+
|
|
122
|
+
- `download_stations_meta_files(local_dir)`: Download GHCNh and LCD v2 station meta documents.
|
|
123
|
+
- `lcd_data_file_name(year, station_id)`: Construct LCD v2 observation file name.
|
|
124
|
+
- `lcd_data_file_paths(start_year, end_year, station_ids, local_dir)`: Build local paths for all expected files.
|
|
125
|
+
- `lcd_data_url(year, station_id)`: Build the absolute URL to an LCD v2 observation file.
|
|
126
|
+
- `lcd_data_urls(station_ids, start_year, end_year, n_jobs)`: Probe NCEI server to list existing files.
|
|
127
|
+
- `download_many(...)` and `download_threaded(...)`: Concurrent file downloads with optional refresh behavior.
|
|
128
|
+
- `download_file(url, local_dir, refresh=False, verbose=False)`: Robust download with ETag checking and retries.
|
|
129
|
+
|
|
130
|
+
#### `lcd_data.rto_iso`
|
|
131
|
+
Helpers to work with RTO/ISO region polygons.
|
|
132
|
+
|
|
133
|
+
- `REGION_NAMES`: `['CAISO', 'ERCOT', 'ISONE', 'NYISO', 'MISO', 'PJM', 'SPP']`.
|
|
134
|
+
- `regions(rto_iso_geojson)`: Read GeoJSON and return a GeoDataFrame with merged geometries for each region.
|
|
135
|
+
- `region(rto_iso_geojson, region_name)`: Return a GeoDataFrame for the requested region.
|
|
136
|
+
|
|
137
|
+
#### `lcd_data.saturation`
|
|
138
|
+
Saturation vapor pressure and relative humidity utilities.
|
|
139
|
+
|
|
140
|
+
- `esatw(T)`: Saturation vapor pressure over liquid water (hPa) using an 8th‑order polynomial fit.
|
|
141
|
+
- `rh(T, Td)`: Relative humidity (%) computed from temperature and dew point.
|
|
142
|
+
|
|
143
|
+
#### `lcd_data.region_codes`
|
|
144
|
+
Provides
|
|
145
|
+
|
|
146
|
+
- `lcd_data.region_codes.countries`: Three-letter ISO 3166-1 alpha-3 country codes
|
|
147
|
+
- `lcd_data.region_codes.us_states_territories`: Two-letter U.S. state or territory codes
|
|
148
|
+
- `lcd_data.region_codes.conus`: The special `CONUS` region code
|
|
149
|
+
- `lcd_data.region_codes.rto_iso_regions`: RTO/ISO region codes
|
|
150
|
+
|
|
151
|
+
#### `lcd_data.stations`
|
|
152
|
+
Station catalog handling, filtering, reading, interpolation, and writing.
|
|
153
|
+
|
|
154
|
+
- `Stations.from_url()` / `Stations.from_file(path)`: Build the station catalog from GHCNh-format metadata.
|
|
155
|
+
- Spatial selection by region geometry with `filter_by_region(region_gdf)` and by bounding box with `filter_by_coordinates(...)`.
|
|
156
|
+
- Availability filters: `filter_by_data_availability_online(start_time, end_time, n_jobs, verbose)` and `filter_by_data_availability_offline(data_dir, start_time, end_time, verbose)`.
|
|
157
|
+
- Station utilities: `filter_by_id(station_id)`, `ids()`, `save_station_list(path)`.
|
|
158
|
+
- `read_station_observations(...)`: Read and clean per‑station LCD v2 observation files; convert times to UTC, coerce numeric columns, correct Celsius-with-18.3° base fields, drop non-observational report types, limit unrealistic temperatures, and compute hourly RH.
|
|
159
|
+
- `construct_hourly(...)`: Build full-hourly UTC series for `T`, `Td`, `RH`, and `windspeed`, with optional plotting and gap-limited interpolation.
|
|
160
|
+
- `write_utc_hourly_netcdf(path)`: Save the hourly dataset to NetCDF with safe encodings.
|
|
161
|
+
|
|
162
|
+
## Development
|
|
163
|
+
|
|
164
|
+
### Code Quality and Testing Commands
|
|
165
|
+
|
|
166
|
+
- `make fmt` - Runs ruff format, which automatically reformats Python files according to the style rules in `pyproject.toml`
|
|
167
|
+
- `make lint` - Runs `ruff check --fix`, which lints the code (checks for style errors, bugs, outdated patterns, etc.) and auto-fixes what it can.
|
|
168
|
+
- `make check` - Runs fmt and lint.
|
|
169
|
+
- `make type` - Currently disabled. Runs mypy, the static type checker, using the strictness settings from `pyproject.toml`. Mypy is a static type checker for Python, a dynamically typed language. Because static analysis cannot account for all dynamic runtime behaviors, mypy may report false positives which do not reflect actual runtime issues.
|
|
170
|
+
- `make test` - Runs pytest with reporting (configured in `pyproject.toml`).
|
|
171
|
+
|
|
172
|
+
## Disclaimers
|
|
173
|
+
|
|
174
|
+
The LCD v2 data accessed by this software are publicly available from NOAA's National Centers for Environmental Information (NCEI) and are subject to their terms of use. This project is not affiliated with or endorsed by NOAA.
|
|
175
|
+
|
|
176
|
+
This software uses U.S. Census Bureau and U.S. Energy Information Administration data, but is neither endorsed nor certified by the U.S. Census Bureau or the U.S. Energy Information Administration.
|
|
177
|
+
|
|
178
|
+
## Author
|
|
179
|
+
|
|
180
|
+
Jan Kazil - jan.kazil.dev@gmail.com - [jankazil.com](https://jankazil.com)
|
|
181
|
+
|
|
182
|
+
## License
|
|
183
|
+
|
|
184
|
+
BSD-3-Clause
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "lcd-v2-data"
|
|
7
|
+
version = "1.1.13"
|
|
8
|
+
description = "Python toolkit for downloading and processing Local Climatological Data version 2 (LCDv2) data."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11,<=3.12.9"
|
|
11
|
+
license = "BSD-3-Clause"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
authors = [{ name = "Jan Kazil" }]
|
|
14
|
+
dependencies = [
|
|
15
|
+
"geopandas",
|
|
16
|
+
"matplotlib",
|
|
17
|
+
"numpy",
|
|
18
|
+
"netcdf4",
|
|
19
|
+
"pandas",
|
|
20
|
+
"requests",
|
|
21
|
+
"shapely",
|
|
22
|
+
"timezonefinder",
|
|
23
|
+
"xarray",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.scripts]
|
|
27
|
+
build-lcd-dataset = "lcd_data.build_lcd_dataset:main" # Connects name of executable in $PATH to the python module containing main()
|
|
28
|
+
|
|
29
|
+
[project.optional-dependencies]
|
|
30
|
+
dev = [
|
|
31
|
+
"pytest>=8",
|
|
32
|
+
"pytest-cov>=5",
|
|
33
|
+
"mypy>=1.11",
|
|
34
|
+
"ruff>=0.5",
|
|
35
|
+
"pre-commit>=3.7",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[tool.setuptools]
|
|
39
|
+
package-dir = {"" = "src"}
|
|
40
|
+
|
|
41
|
+
[tool.setuptools.packages.find]
|
|
42
|
+
where = ["src"]
|
|
43
|
+
|
|
44
|
+
# Ensure non-Python assets ship inside the wheel/sdist
|
|
45
|
+
[tool.setuptools.package-data]
|
|
46
|
+
lcd_data = [
|
|
47
|
+
"data/EIA/*.geojson",
|
|
48
|
+
"data/CensusBureau/US_states/*",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
[tool.ruff]
|
|
52
|
+
line-length = 128
|
|
53
|
+
target-version = "py311"  # lowest interpreter allowed by requires-python (>=3.11); keeps UP rewrites runnable on 3.11
|
|
54
|
+
extend-exclude = [
|
|
55
|
+
"dist",
|
|
56
|
+
"build",
|
|
57
|
+
"data",
|
|
58
|
+
"demos",
|
|
59
|
+
"docs",
|
|
60
|
+
"experiments",
|
|
61
|
+
"notebooks",
|
|
62
|
+
"plots",
|
|
63
|
+
"results",
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
[tool.ruff.lint]
|
|
67
|
+
select = ["E", "F", "I", "UP", "B", "SIM"]
|
|
68
|
+
ignore = ["E501", "F841", "SIM108"]
|
|
69
|
+
|
|
70
|
+
[tool.ruff.format]
|
|
71
|
+
quote-style = "preserve"
|
|
72
|
+
|
|
73
|
+
#
|
|
74
|
+
# MyPy is disabled because static analysis cannot account for all
|
|
75
|
+
# dynamic runtime behaviors, mypy may report false positives which
|
|
76
|
+
# do not reflect actual runtime issues.
|
|
77
|
+
#
|
|
78
|
+
#[tool.mypy]
|
|
79
|
+
#python_version = "3.11"
|
|
80
|
+
#warn_unused_configs = true
|
|
81
|
+
#disallow_untyped_defs = true
|
|
82
|
+
#disallow_incomplete_defs = true
|
|
83
|
+
#no_implicit_optional = true
|
|
84
|
+
#check_untyped_defs = true
|
|
85
|
+
#strict_optional = true
|
|
86
|
+
#pretty = true
|
|
87
|
+
#namespace_packages = true
|
|
88
|
+
#mypy_path = "src"
|
|
89
|
+
#files = ["src/lcd_data", "scripts"]
|
|
90
|
+
|
|
91
|
+
[tool.pytest.ini_options]
|
|
92
|
+
testpaths = ["tests"]
|
|
93
|
+
python_files = ["test_*.py"]
|
|
94
|
+
python_functions = ["test_*"]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
2
|
+
|
|
3
|
+
# Distribution name as published (matches [project].name in pyproject.toml)
|
|
4
|
+
_DIST_NAME = "lcd-data"
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
__version__ = version(_DIST_NAME)
|
|
8
|
+
except PackageNotFoundError:
|
|
9
|
+
# Fallback to import package name; if still not installed, use local tag
|
|
10
|
+
pkg = __package__ or __name__.split(".", 1)[0]
|
|
11
|
+
try:
|
|
12
|
+
__version__ = version(pkg)
|
|
13
|
+
except PackageNotFoundError:
|
|
14
|
+
__version__ = "0.0.0+local"
|
|
15
|
+
|
|
16
|
+
__all__ = ["__version__"]
|