lcd-v2-data 1.1.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. lcd_v2_data-1.1.13/LICENSE +13 -0
  2. lcd_v2_data-1.1.13/MANIFEST.in +1 -0
  3. lcd_v2_data-1.1.13/PKG-INFO +210 -0
  4. lcd_v2_data-1.1.13/README.md +184 -0
  5. lcd_v2_data-1.1.13/pyproject.toml +94 -0
  6. lcd_v2_data-1.1.13/setup.cfg +4 -0
  7. lcd_v2_data-1.1.13/src/lcd_data/__init__.py +16 -0
  8. lcd_v2_data-1.1.13/src/lcd_data/build_lcd_dataset.py +389 -0
  9. lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/README.txt +31 -0
  10. lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.cpg +1 -0
  11. lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.dbf +0 -0
  12. lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.prj +1 -0
  13. lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.shp +0 -0
  14. lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.shp.ea.iso.xml +621 -0
  15. lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.shp.iso.xml +834 -0
  16. lcd_v2_data-1.1.13/src/lcd_data/data/CensusBureau/US_states/tl_2024_us_state.shx +0 -0
  17. lcd_v2_data-1.1.13/src/lcd_data/data/EIA/RTO_ISO_regions.README +23 -0
  18. lcd_v2_data-1.1.13/src/lcd_data/data/EIA/RTO_ISO_regions.geojson +117 -0
  19. lcd_v2_data-1.1.13/src/lcd_data/ncei.py +438 -0
  20. lcd_v2_data-1.1.13/src/lcd_data/region_codes.py +316 -0
  21. lcd_v2_data-1.1.13/src/lcd_data/rto_iso.py +107 -0
  22. lcd_v2_data-1.1.13/src/lcd_data/saturation.py +91 -0
  23. lcd_v2_data-1.1.13/src/lcd_data/stations.py +2000 -0
  24. lcd_v2_data-1.1.13/src/lcd_data/test.py +5 -0
  25. lcd_v2_data-1.1.13/src/lcd_v2_data.egg-info/PKG-INFO +210 -0
  26. lcd_v2_data-1.1.13/src/lcd_v2_data.egg-info/SOURCES.txt +29 -0
  27. lcd_v2_data-1.1.13/src/lcd_v2_data.egg-info/dependency_links.txt +1 -0
  28. lcd_v2_data-1.1.13/src/lcd_v2_data.egg-info/entry_points.txt +2 -0
  29. lcd_v2_data-1.1.13/src/lcd_v2_data.egg-info/requires.txt +16 -0
  30. lcd_v2_data-1.1.13/src/lcd_v2_data.egg-info/top_level.txt +1 -0
  31. lcd_v2_data-1.1.13/tests/test_test.py +5 -0
@@ -0,0 +1,13 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2025, Jan Kazil
4
+
5
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
6
+
7
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
8
+
9
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
10
+
11
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
12
+
13
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1 @@
1
+ recursive-include src/lcd_data/data *
@@ -0,0 +1,210 @@
1
+ Metadata-Version: 2.4
2
+ Name: lcd-v2-data
3
+ Version: 1.1.13
4
+ Summary: Python toolkit for downloading and processing Local Climatological Data version 2 (LCDv2) data.
5
+ Author: Jan Kazil
6
+ License-Expression: BSD-3-Clause
7
+ Requires-Python: <=3.12.9,>=3.11
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: geopandas
11
+ Requires-Dist: matplotlib
12
+ Requires-Dist: numpy
13
+ Requires-Dist: netcdf4
14
+ Requires-Dist: pandas
15
+ Requires-Dist: requests
16
+ Requires-Dist: shapely
17
+ Requires-Dist: timezonefinder
18
+ Requires-Dist: xarray
19
+ Provides-Extra: dev
20
+ Requires-Dist: pytest>=8; extra == "dev"
21
+ Requires-Dist: pytest-cov>=5; extra == "dev"
22
+ Requires-Dist: mypy>=1.11; extra == "dev"
23
+ Requires-Dist: ruff>=0.5; extra == "dev"
24
+ Requires-Dist: pre-commit>=3.7; extra == "dev"
25
+ Dynamic: license-file
26
+
27
+ # lcd-v2-data
28
+
29
+ **lcd-v2-data** is a Python toolkit for downloading and processing [Local Climatological Data version 2 (LCDv2)](https://www.ncei.noaa.gov/products/land-based-station/local-climatological-data) data.
30
+
31
+ It provides:
32
+
33
+ - A top-level command-line tool that
34
+
35
+ - automates the download of LCD v2 station observations for
36
+ - individual stations
37
+ - U.S. states and territories
38
+ - Regional Transmission Organization (RTO) / Independent System Operator (ISO) regions
39
+
40
+ - constructs full-hourly UTC time series of
41
+ - temperature at 2 m
42
+ - dew point temperature at 2 m
43
+ - relative humidity at 2 m
44
+ - wind speed at 10 m
45
+
46
+ from the irregularly spaced, local time LCD v2 station observation time series, for a selected station or for stations in the selected U.S. state/territory or RTO/ISO region, and a user-specified time range. The time series are saved in a netCDF file.
47
+
48
+ - Modules for downloading and processing LCD v2 station observations.
49
+
50
+ LCD v2 is provided by the [National Centers for Environmental Information (NCEI)](https://www.ncei.noaa.gov/).
51
+
52
+ ## Installation
53
+
54
+ ```bash
55
+ mamba install -c jan.kazil -c conda-forge lcd-v2-data
56
+ ```
57
+
58
+ ## Overview
59
+
60
+ The package provides a command-line tool that selects stations by geography (a single station by GHCNh identifier, a U.S. state or territory, RTO/ISO regions, and the special region CONUS representing the contiguous U.S.), checks data availability, downloads LCD v2 observation files for a given year range, constructs full-hourly UTC time series for the observables, and saves them in a NetCDF file. It optionally generates plots showing the original and the interpolated time series.
61
+
62
+ Geospatial region selection is based on U.S. Energy Information Administration definitions of RTO/ISO footprints, and U.S. Census Bureau state/territory boundaries, included with the package.
63
+
64
+ The list of GHCNh station identifiers is available [here](https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/doc/ghcnh-station-list.txt). LCD v2 contains only U.S. stations.
65
+
66
+ ## Workflow
67
+
68
+ The following describes the internal workflow performed by the command-line tool:
69
+
70
+ 1. Load the region geometry (RTO/ISO polygons or U.S. state/territory boundaries) if a region is specified; skip this step if a station ID is provided.
71
+ 2. Retrieve the station list from NCEI and either filter it spatially by region or select the specified station.
72
+ 3. Filter the stations by data availability for the requested year range, either online by probing NCEI or offline by checking local files.
73
+ 4. Save the filtered station list for reference.
74
+ 5. Download LCD v2 observation files from NCEI for the selected stations and years, skipping files already present that match by ETag.
75
+ 6. Create full-hourly UTC time series for temperature (T), dew point temperature (Td), relative humidity (RH), and wind speed by converting local observation time to UTC and interpolating the data to full hours. Remove temperatures above 60 °C. Perform interpolation only across gaps of up to 2 hours. Derive RH from T and Td.
76
+ 7. Optionally create comparison plots for the original and interpolated series.
77
+ 8. Save the full-hourly UTC time series in a NetCDF file, for the given station or the stations in the state/region.
78
+
79
+ **Notes:** Interpolation of station observation time series across many years and/or many stations can be slow due to inherent limitations of Python. Creating plots is very slow and recommended only for individual stations (as opposed to regions).
80
+
81
+ ## Command-line interface (CLI)
82
+
83
+ The CLI is exposed as `"build-lcd-dataset"` when installed.
84
+
85
+ **Usage:**
86
+
87
+ ```bash
88
+ build-lcd-dataset START_YEAR END_YEAR REGION DATA_DIR [-n N_JOBS] [-o] [-p PLOT_DIR] [-r] [-v]
89
+ ```
90
+
91
+ **Positional arguments**
92
+
93
+ - `START_YEAR` and `END_YEAR`: Inclusive range of years
94
+ - `REGION`: Region or station selector. Use a two-letter U.S. state or territory code, `'CONUS'`, one of the RTO/ISO codes, or a GHCNh station identifier
95
+ - `DATA_DIR`: Directory into which data will be downloaded
96
+
97
+ **Options**
98
+
99
+ - `-n, --n N_JOBS`: Number of parallel download processes. Values greater than 1 accelerate downloads but may increase the risk of network errors.
100
+ - `-o, --offline`: Work offline. All required files must have been downloaded to `DATA_DIR` in a previous call without this flag.
101
+ - `-p, --plotdir PLOT_DIR`: Directory where plots of the original and interpolated full-hourly time series will be created. Very slow. If omitted, no plots are generated.
102
+ - `-r, --refresh`: Download and process files even if they already exist in `DATA_DIR`.
103
+ - `-v, --verbose`: Print progress information.
104
+
105
+ **Examples:**
106
+
107
+ ```bash
108
+ # Show usage information, valid region codes, and RTO/ISO region names:
109
+ build-lcd-dataset --help
110
+
111
+ # Download LCDv2 data and build a dataset as a NetCDF file for station USW00003017 for the years 2020–2025,
112
+ # in the directory /path/to/data, and create plots in /path/to/plots:
113
+ build-lcd-dataset -v 2020 2025 USW00003017 /path/to/data -p /path/to/plots
114
+
115
+ # Download LCDv2 data and build a dataset as a NetCDF file for the RTO region ERCOT for the year 2022
116
+ # in the directory /path/to/data, using 32 parallel download processes:
117
+ build-lcd-dataset -v 2022 2022 ERCOT /path/to/data -n 32
118
+
119
+ # Build a dataset as a NetCDF file for the state of Colorado for the year 2021, offline from data
120
+ # previously downloaded to /path/to/data:
121
+ build-lcd-dataset -v 2021 2021 CO /path/to/data --offline
122
+ ```
123
+
124
+ ## Sample results
125
+
126
+ Original and interpolated full-hourly UTC time series in November 2024, Twentynine Palms, CA:
127
+
128
+ ![LCD station USW00093121 time series, November 2024](plots/USW00093121.Nov-2024.png)
129
+
130
+ ## Public API
131
+
132
+ ### Modules
133
+
134
+ #### `lcd_data.build_lcd_dataset`
135
+
136
+ Provides a programmatic API equivalent to the command-line interface for building LCD datasets from NOAA NCEI observations.
137
+
138
+ - `run_build(start_year, end_year, region_name, data_dir, plot_dir=None, n_jobs=1, offline=False, refresh=False, verbose=False)`:
139
+
140
+ Downloads, processes, and assembles NOAA NCEI Local Climatological Data (LCD) into a NetCDF file containing full-hourly UTC time series for a specified geographic region or individual station over an inclusive range of years.
141
+ Operates both online (with automatic downloads) and offline (using pre-downloaded files).
142
+ If `plot_dir` is provided, diagnostic plots of original and interpolated time series are generated.
143
+ Returns the `Path` to the generated NetCDF file.
144
+
145
+ #### `lcd_data.ncei`
146
+ Utilities for station metadata and LCD v2 downloads.
147
+
148
+ - `download_stations_meta_files(local_dir)`: Download GHCNh and LCD v2 station meta documents.
149
+ - `lcd_data_file_name(year, station_id)`: Construct LCD v2 observation file name.
150
+ - `lcd_data_file_paths(start_year, end_year, station_ids, local_dir)`: Build local paths for all expected files.
151
+ - `lcd_data_url(year, station_id)`: Build the absolute URL to an LCD v2 observation file.
152
+ - `lcd_data_urls(station_ids, start_year, end_year, n_jobs)`: Probe NCEI server to list existing files.
153
+ - `download_many(...)` and `download_threaded(...)`: Concurrent file downloads with optional refresh behavior.
154
+ - `download_file(url, local_dir, refresh=False, verbose=False)`: Robust download with ETag checking and retries.
155
+
156
+ #### `lcd_data.rto_iso`
157
+ Helpers to work with RTO/ISO region polygons.
158
+
159
+ - `REGION_NAMES`: `['CAISO', 'ERCOT', 'ISONE', 'NYISO', 'MISO', 'PJM', 'SPP']`.
160
+ - `regions(rto_iso_geojson)`: Read GeoJSON and return a GeoDataFrame with merged geometries for each region.
161
+ - `region(rto_iso_geojson, region_name)`: Return a GeoDataFrame for the requested region.
162
+
163
+ #### `lcd_data.saturation`
164
+ Saturation vapor pressure and relative humidity utilities.
165
+
166
+ - `esatw(T)`: Saturation vapor pressure over liquid water (hPa) using an 8th‑order polynomial fit.
167
+ - `rh(T, Td)`: Relative humidity (%) computed from temperature and dew point.
168
+
169
+ #### `lcd_data.region_codes`
170
+ Provides
171
+
172
+ - `lcd_data.region_codes.countries`: Three-letter ISO 3166-1 alpha-3 country codes
173
+ - `lcd_data.region_codes.us_states_territories`: Two-letter U.S. state or territory codes
174
+ - `lcd_data.region_codes.conus`: The special `CONUS` region code
175
+ - `lcd_data.region_codes.rto_iso_regions`: RTO/ISO region codes
176
+
177
+ #### `lcd_data.stations`
178
+ Station catalog handling, filtering, reading, interpolation, and writing.
179
+
180
+ - `Stations.from_url()` / `Stations.from_file(path)`: Build the station catalog from GHCNh-format metadata.
181
+ - Spatial selection by region geometry with `filter_by_region(region_gdf)` and by bounding box with `filter_by_coordinates(...)`.
182
+ - Availability filters: `filter_by_data_availability_online(start_time, end_time, n_jobs, verbose)` and `filter_by_data_availability_offline(data_dir, start_time, end_time, verbose)`.
183
+ - Station utilities: `filter_by_id(station_id)`, `ids()`, `save_station_list(path)`.
184
+ - `read_station_observations(...)`: Read and clean per‑station LCD v2 observation files; convert times to UTC, coerce numeric columns, correct Celsius-with-18.3° base fields, drop non-observational report types, limit unrealistic temperatures, and compute hourly RH.
185
+ - `construct_hourly(...)`: Build full-hourly UTC series for `T`, `Td`, `RH`, and `windspeed`, with optional plotting and gap-limited interpolation.
186
+ - `write_utc_hourly_netcdf(path)`: Save the hourly dataset to NetCDF with safe encodings.
187
+
188
+ ## Development
189
+
190
+ ### Code Quality and Testing Commands
191
+
192
+ - `make fmt` - Runs ruff format, which automatically reformats Python files according to the style rules in `pyproject.toml`
193
+ - `make lint` - Runs `ruff check --fix`, which lints the code (checks for style errors, bugs, outdated patterns, etc.) and auto-fixes what it can.
194
+ - `make check` - Runs fmt and lint.
195
+ - `make type` - Currently disabled. Runs mypy, a static type checker for Python (a dynamically typed language), using the strictness settings from `pyproject.toml`. Because static analysis cannot account for all dynamic runtime behaviors, mypy may report false positives that do not reflect actual runtime issues.
196
+ - `make test` - Runs pytest with reporting (configured in `pyproject.toml`).
197
+
198
+ ## Disclaimers
199
+
200
+ The LCD v2 data accessed by this software are publicly available from NOAA's National Centers for Environmental Information (NCEI) and are subject to their terms of use. This project is not affiliated with or endorsed by NOAA.
201
+
202
+ This software uses U.S. Census Bureau and U.S. Energy Information Administration data, but is neither endorsed nor certified by the U.S. Census Bureau or the U.S. Energy Information Administration.
203
+
204
+ ## Author
205
+
206
+ Jan Kazil - jan.kazil.dev@gmail.com - [jankazil.com](https://jankazil.com)
207
+
208
+ ## License
209
+
210
+ BSD-3-Clause
@@ -0,0 +1,184 @@
1
+ # lcd-v2-data
2
+
3
+ **lcd-v2-data** is a Python toolkit for downloading and processing [Local Climatological Data version 2 (LCDv2)](https://www.ncei.noaa.gov/products/land-based-station/local-climatological-data) data.
4
+
5
+ It provides:
6
+
7
+ - A top-level command-line tool that
8
+
9
+ - automates the download of LCD v2 station observations for
10
+ - individual stations
11
+ - U.S. states and territories
12
+ - Regional Transmission Organization (RTO) / Independent System Operator (ISO) regions
13
+
14
+ - constructs full-hourly UTC time series of
15
+ - temperature at 2 m
16
+ - dew point temperature at 2 m
17
+ - relative humidity at 2 m
18
+ - wind speed at 10 m
19
+
20
+ from the irregularly spaced, local time LCD v2 station observation time series, for a selected station or for stations in the selected U.S. state/territory or RTO/ISO region, and a user-specified time range. The time series are saved in a netCDF file.
21
+
22
+ - Modules for downloading and processing LCD v2 station observations.
23
+
24
+ LCD v2 is provided by the [National Centers for Environmental Information (NCEI)](https://www.ncei.noaa.gov/).
25
+
26
+ ## Installation
27
+
28
+ ```bash
29
+ mamba install -c jan.kazil -c conda-forge lcd-v2-data
30
+ ```
31
+
32
+ ## Overview
33
+
34
+ The package provides a command-line tool that selects stations by geography (a single station by GHCNh identifier, a U.S. state or territory, RTO/ISO regions, and the special region CONUS representing the contiguous U.S.), checks data availability, downloads LCD v2 observation files for a given year range, constructs full-hourly UTC time series for the observables, and saves them in a NetCDF file. It optionally generates plots showing the original and the interpolated time series.
35
+
36
+ Geospatial region selection is based on U.S. Energy Information Administration definitions of RTO/ISO footprints, and U.S. Census Bureau state/territory boundaries, included with the package.
37
+
38
+ The list of GHCNh station identifiers is available [here](https://www.ncei.noaa.gov/oa/global-historical-climatology-network/hourly/doc/ghcnh-station-list.txt). LCD v2 contains only U.S. stations.
39
+
40
+ ## Workflow
41
+
42
+ The following describes the internal workflow performed by the command-line tool:
43
+
44
+ 1. Load the region geometry (RTO/ISO polygons or U.S. state/territory boundaries) if a region is specified; skip this step if a station ID is provided.
45
+ 2. Retrieve the station list from NCEI and either filter it spatially by region or select the specified station.
46
+ 3. Filter the stations by data availability for the requested year range, either online by probing NCEI or offline by checking local files.
47
+ 4. Save the filtered station list for reference.
48
+ 5. Download LCD v2 observation files from NCEI for the selected stations and years, skipping files already present that match by ETag.
49
+ 6. Create full-hourly UTC time series for temperature (T), dew point temperature (Td), relative humidity (RH), and wind speed by converting local observation time to UTC and interpolating the data to full hours. Remove temperatures above 60 °C. Perform interpolation only across gaps of up to 2 hours. Derive RH from T and Td.
50
+ 7. Optionally create comparison plots for the original and interpolated series.
51
+ 8. Save the full-hourly UTC time series in a NetCDF file, for the given station or the stations in the state/region.
52
+
53
+ **Notes:** Interpolation of station observation time series across many years and/or many stations can be slow due to inherent limitations of Python. Creating plots is very slow and recommended only for individual stations (as opposed to regions).
54
+
55
+ ## Command-line interface (CLI)
56
+
57
+ The CLI is exposed as `"build-lcd-dataset"` when installed.
58
+
59
+ **Usage:**
60
+
61
+ ```bash
62
+ build-lcd-dataset START_YEAR END_YEAR REGION DATA_DIR [-n N_JOBS] [-o] [-p PLOT_DIR] [-r] [-v]
63
+ ```
64
+
65
+ **Positional arguments**
66
+
67
+ - `START_YEAR` and `END_YEAR`: Inclusive range of years
68
+ - `REGION`: Region or station selector. Use a two-letter U.S. state or territory code, `'CONUS'`, one of the RTO/ISO codes, or a GHCNh station identifier
69
+ - `DATA_DIR`: Directory into which data will be downloaded
70
+
71
+ **Options**
72
+
73
+ - `-n, --n N_JOBS`: Number of parallel download processes. Values greater than 1 accelerate downloads but may increase the risk of network errors.
74
+ - `-o, --offline`: Work offline. All required files must have been downloaded to `DATA_DIR` in a previous call without this flag.
75
+ - `-p, --plotdir PLOT_DIR`: Directory where plots of the original and interpolated full-hourly time series will be created. Very slow. If omitted, no plots are generated.
76
+ - `-r, --refresh`: Download and process files even if they already exist in `DATA_DIR`.
77
+ - `-v, --verbose`: Print progress information.
78
+
79
+ **Examples:**
80
+
81
+ ```bash
82
+ # Show usage information, valid region codes, and RTO/ISO region names:
83
+ build-lcd-dataset --help
84
+
85
+ # Download LCDv2 data and build a dataset as a NetCDF file for station USW00003017 for the years 2020–2025,
86
+ # in the directory /path/to/data, and create plots in /path/to/plots:
87
+ build-lcd-dataset -v 2020 2025 USW00003017 /path/to/data -p /path/to/plots
88
+
89
+ # Download LCDv2 data and build a dataset as a NetCDF file for the RTO region ERCOT for the year 2022
90
+ # in the directory /path/to/data, using 32 parallel download processes:
91
+ build-lcd-dataset -v 2022 2022 ERCOT /path/to/data -n 32
92
+
93
+ # Build a dataset as a NetCDF file for the state of Colorado for the year 2021, offline from data
94
+ # previously downloaded to /path/to/data:
95
+ build-lcd-dataset -v 2021 2021 CO /path/to/data --offline
96
+ ```
97
+
98
+ ## Sample results
99
+
100
+ Original and interpolated full-hourly UTC time series in November 2024, Twentynine Palms, CA:
101
+
102
+ ![LCD station USW00093121 time series, November 2024](plots/USW00093121.Nov-2024.png)
103
+
104
+ ## Public API
105
+
106
+ ### Modules
107
+
108
+ #### `lcd_data.build_lcd_dataset`
109
+
110
+ Provides a programmatic API equivalent to the command-line interface for building LCD datasets from NOAA NCEI observations.
111
+
112
+ - `run_build(start_year, end_year, region_name, data_dir, plot_dir=None, n_jobs=1, offline=False, refresh=False, verbose=False)`:
113
+
114
+ Downloads, processes, and assembles NOAA NCEI Local Climatological Data (LCD) into a NetCDF file containing full-hourly UTC time series for a specified geographic region or individual station over an inclusive range of years.
115
+ Operates both online (with automatic downloads) and offline (using pre-downloaded files).
116
+ If `plot_dir` is provided, diagnostic plots of original and interpolated time series are generated.
117
+ Returns the `Path` to the generated NetCDF file.
118
+
119
+ #### `lcd_data.ncei`
120
+ Utilities for station metadata and LCD v2 downloads.
121
+
122
+ - `download_stations_meta_files(local_dir)`: Download GHCNh and LCD v2 station meta documents.
123
+ - `lcd_data_file_name(year, station_id)`: Construct LCD v2 observation file name.
124
+ - `lcd_data_file_paths(start_year, end_year, station_ids, local_dir)`: Build local paths for all expected files.
125
+ - `lcd_data_url(year, station_id)`: Build the absolute URL to an LCD v2 observation file.
126
+ - `lcd_data_urls(station_ids, start_year, end_year, n_jobs)`: Probe NCEI server to list existing files.
127
+ - `download_many(...)` and `download_threaded(...)`: Concurrent file downloads with optional refresh behavior.
128
+ - `download_file(url, local_dir, refresh=False, verbose=False)`: Robust download with ETag checking and retries.
129
+
130
+ #### `lcd_data.rto_iso`
131
+ Helpers to work with RTO/ISO region polygons.
132
+
133
+ - `REGION_NAMES`: `['CAISO', 'ERCOT', 'ISONE', 'NYISO', 'MISO', 'PJM', 'SPP']`.
134
+ - `regions(rto_iso_geojson)`: Read GeoJSON and return a GeoDataFrame with merged geometries for each region.
135
+ - `region(rto_iso_geojson, region_name)`: Return a GeoDataFrame for the requested region.
136
+
137
+ #### `lcd_data.saturation`
138
+ Saturation vapor pressure and relative humidity utilities.
139
+
140
+ - `esatw(T)`: Saturation vapor pressure over liquid water (hPa) using an 8th‑order polynomial fit.
141
+ - `rh(T, Td)`: Relative humidity (%) computed from temperature and dew point.
142
+
143
+ #### `lcd_data.region_codes`
144
+ Provides
145
+
146
+ - `lcd_data.region_codes.countries`: Three-letter ISO 3166-1 alpha-3 country codes
147
+ - `lcd_data.region_codes.us_states_territories`: Two-letter U.S. state or territory codes
148
+ - `lcd_data.region_codes.conus`: The special `CONUS` region code
149
+ - `lcd_data.region_codes.rto_iso_regions`: RTO/ISO region codes
150
+
151
+ #### `lcd_data.stations`
152
+ Station catalog handling, filtering, reading, interpolation, and writing.
153
+
154
+ - `Stations.from_url()` / `Stations.from_file(path)`: Build the station catalog from GHCNh-format metadata.
155
+ - Spatial selection by region geometry with `filter_by_region(region_gdf)` and by bounding box with `filter_by_coordinates(...)`.
156
+ - Availability filters: `filter_by_data_availability_online(start_time, end_time, n_jobs, verbose)` and `filter_by_data_availability_offline(data_dir, start_time, end_time, verbose)`.
157
+ - Station utilities: `filter_by_id(station_id)`, `ids()`, `save_station_list(path)`.
158
+ - `read_station_observations(...)`: Read and clean per‑station LCD v2 observation files; convert times to UTC, coerce numeric columns, correct Celsius-with-18.3° base fields, drop non-observational report types, limit unrealistic temperatures, and compute hourly RH.
159
+ - `construct_hourly(...)`: Build full-hourly UTC series for `T`, `Td`, `RH`, and `windspeed`, with optional plotting and gap-limited interpolation.
160
+ - `write_utc_hourly_netcdf(path)`: Save the hourly dataset to NetCDF with safe encodings.
161
+
162
+ ## Development
163
+
164
+ ### Code Quality and Testing Commands
165
+
166
+ - `make fmt` - Runs ruff format, which automatically reformats Python files according to the style rules in `pyproject.toml`
167
+ - `make lint` - Runs `ruff check --fix`, which lints the code (checks for style errors, bugs, outdated patterns, etc.) and auto-fixes what it can.
168
+ - `make check` - Runs fmt and lint.
169
+ - `make type` - Currently disabled. Runs mypy, a static type checker for Python (a dynamically typed language), using the strictness settings from `pyproject.toml`. Because static analysis cannot account for all dynamic runtime behaviors, mypy may report false positives that do not reflect actual runtime issues.
170
+ - `make test` - Runs pytest with reporting (configured in `pyproject.toml`).
171
+
172
+ ## Disclaimers
173
+
174
+ The LCD v2 data accessed by this software are publicly available from NOAA's National Centers for Environmental Information (NCEI) and are subject to their terms of use. This project is not affiliated with or endorsed by NOAA.
175
+
176
+ This software uses U.S. Census Bureau and U.S. Energy Information Administration data, but is neither endorsed nor certified by the U.S. Census Bureau or the U.S. Energy Information Administration.
177
+
178
+ ## Author
179
+
180
+ Jan Kazil - jan.kazil.dev@gmail.com - [jankazil.com](https://jankazil.com)
181
+
182
+ ## License
183
+
184
+ BSD-3-Clause
@@ -0,0 +1,94 @@
1
+ [build-system]
2
+ requires = ["setuptools>=77", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "lcd-v2-data"
7
+ version = "1.1.13"
8
+ description = "Python toolkit for downloading and processing Local Climatological Data version 2 (LCDv2) data."
9
+ readme = "README.md"
10
+ requires-python = ">=3.11,<=3.12.9"
11
+ license = "BSD-3-Clause"
12
+ license-files = ["LICENSE"]
13
+ authors = [{ name = "Jan Kazil" }]
14
+ dependencies = [
15
+ "geopandas",
16
+ "matplotlib",
17
+ "numpy",
18
+ "netcdf4",
19
+ "pandas",
20
+ "requests",
21
+ "shapely",
22
+ "timezonefinder",
23
+ "xarray",
24
+ ]
25
+
26
+ [project.scripts]
27
+ build-lcd-dataset = "lcd_data.build_lcd_dataset:main" # Connects name of executable in $PATH to the python module containing main()
28
+
29
+ [project.optional-dependencies]
30
+ dev = [
31
+ "pytest>=8",
32
+ "pytest-cov>=5",
33
+ "mypy>=1.11",
34
+ "ruff>=0.5",
35
+ "pre-commit>=3.7",
36
+ ]
37
+
38
+ [tool.setuptools]
39
+ package-dir = {"" = "src"}
40
+
41
+ [tool.setuptools.packages.find]
42
+ where = ["src"]
43
+
44
+ # Ensure non-Python assets ship inside the wheel/sdist
45
+ [tool.setuptools.package-data]
46
+ lcd_data = [
47
+ "data/EIA/*.geojson",
48
+ "data/CensusBureau/US_states/*",
49
+ ]
50
+
51
+ [tool.ruff]
52
+ line-length = 128
53
+ target-version = "py313"
54
+ extend-exclude = [
55
+ "dist",
56
+ "build",
57
+ "data",
58
+ "demos",
59
+ "docs",
60
+ "experiments",
61
+ "notebooks",
62
+ "plots",
63
+ "results",
64
+ ]
65
+
66
+ [tool.ruff.lint]
67
+ select = ["E", "F", "I", "UP", "B", "SIM"]
68
+ ignore = ["E501", "F841", "SIM108"]
69
+
70
+ [tool.ruff.format]
71
+ quote-style = "preserve"
72
+
73
+ #
74
+ # MyPy is disabled because static analysis cannot account for all
75
+ # dynamic runtime behaviors, mypy may report false positives which
76
+ # do not reflect actual runtime issues.
77
+ #
78
+ #[tool.mypy]
79
+ #python_version = "3.11"
80
+ #warn_unused_configs = true
81
+ #disallow_untyped_defs = true
82
+ #disallow_incomplete_defs = true
83
+ #no_implicit_optional = true
84
+ #check_untyped_defs = true
85
+ #strict_optional = true
86
+ #pretty = true
87
+ #namespace_packages = true
88
+ #mypy_path = "src"
89
+ #files = ["src/lcd_data", "scripts"]
90
+
91
+ [tool.pytest.ini_options]
92
+ testpaths = ["tests"]
93
+ python_files = ["test_*.py"]
94
+ python_functions = ["test_*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,16 @@
"""Package initialisation for ``lcd_data``.

Exposes the installed distribution's version string as ``__version__``.
"""

from importlib.metadata import PackageNotFoundError, version

# Distribution name as published (matches [project].name in pyproject.toml).
# This must be the *distribution* name, not the import package name
# ("lcd_data"); otherwise the metadata lookup fails for an installed package
# and the version silently degrades to the local placeholder below.
_DIST_NAME = "lcd-v2-data"

try:
    __version__ = version(_DIST_NAME)
except PackageNotFoundError:
    # Fallback to the import package name; if that is not an installed
    # distribution either (e.g. running from a source checkout), use a
    # local placeholder tag.
    pkg = __package__ or __name__.split(".", 1)[0]
    try:
        __version__ = version(pkg)
    except PackageNotFoundError:
        __version__ = "0.0.0+local"

__all__ = ["__version__"]