lcd-v2-data 1.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lcd_data/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ from importlib.metadata import PackageNotFoundError, version
2
+
3
# Name under which the distribution is published ([project].name in pyproject.toml)
_DIST_NAME = "lcd-data"

# Marker returned by the lookup helper when a distribution is not installed
_NOT_FOUND = object()


def _installed_version(dist_name):
    """Return the installed version of *dist_name*, or ``_NOT_FOUND`` if it is not installed."""
    try:
        return version(dist_name)
    except PackageNotFoundError:
        return _NOT_FOUND


# First choice: the published distribution name
__version__ = _installed_version(_DIST_NAME)

if __version__ is _NOT_FOUND:
    # Second choice: the import package name
    pkg = __package__ or __name__.split(".", 1)[0]
    __version__ = _installed_version(pkg)

    if __version__ is _NOT_FOUND:
        # Not installed at all (e.g. running from a source checkout): use a local tag
        __version__ = "0.0.0+local"

__all__ = ["__version__"]
@@ -0,0 +1,389 @@
1
+ #!/usr/bin/env python
2
+
3
+ '''
4
+
5
+ Builds Local Climatological Data (LCD) datasets from NOAA NCEI observations for a specified U.S. state,
6
+ territory, RTO/ISO region, CONUS, or individual station. The module automates the complete workflow of
7
+ downloading station metadata and LCD observations, filtering stations by region and data availability,
8
+ constructing full-hourly UTC time series, saving results as NetCDF files, and optionally generating
9
+ diagnostic plots. It can be executed as a command-line tool or used programmatically through the
10
+ `run_build()` function.
11
+
12
+ Workflow:
13
+
14
+ 1) Load the requested region geometry (RTO/ISO polygons or U.S. state boundaries) if a region is specified.
15
+ If a station ID (GHCNh station identifier) is provided, geometry loading is skipped.
16
+ 2) Retrieve the full LCD station catalog and spatially filter stations to the region,
17
+ or select the specified station by ID.
18
+ 3) Filter stations by data availability over [start_year, end_year].
19
+ 4) Save the resulting station list to a given data directory (for regions) or proceed with the
20
+ specified station (for individual station mode).
21
+ 5) Download LCD observation files for the filtered station IDs. Files already present
22
+ in the download directory and unchanged on the NOAA NCEI server are not re-downloaded.
23
+ - Downloads can be run in parallel with the -n option.
24
+ - The --offline flag disables network access and expects all required files to be present locally.
25
+ 6) Construct full-hourly UTC time series for the selected region or station and period.
26
+ 7) Optionally create plots of original and interpolated full-hourly UTC time series
27
+ if a plot directory is provided (plot generation is slow).
28
+ 8) Save the full-hourly UTC time series as a NetCDF file.
29
+
30
+ Output files:
31
+
32
+ - A text file listing the stations used (for regions).
33
+ - The downloaded LCD files (unless offline mode is selected).
34
+ - Optional diagnostic plots, if a plot directory is specified.
35
+ - A NetCDF file containing the full-hourly UTC time series.
36
+
37
+ Assumptions:
38
+
39
+ - Network access to NOAA NCEI is available unless --offline is specified.
40
+
41
+ Example usage:
42
+
43
+ build-lcd-dataset 2020 2025 USW00003017 /path/to/data -p /path/to/plots
44
+ build-lcd-dataset 2022 2022 CAISO /path/to/data -n 32
45
+ build-lcd-dataset 2021 2021 CO /path/to/data --offline
46
+ '''
47
+
48
+ import argparse
49
+ import os
50
+ import sys
51
+ from datetime import datetime
52
+ from importlib.resources import as_file, files
53
+ from pathlib import Path
54
+
55
+ import geopandas as gpd
56
+
57
+ from lcd_data import ncei, region_codes, rto_iso, stations
58
+
59
+
60
def _load_us_states():
    '''
    Read the US states/territories shapefile bundled with the lcd_data package.

    Returns
    -------
    geopandas.GeoDataFrame
        Contents of 'tl_2024_us_state.shp' as returned by geopandas.read_file().
    '''

    us_states_dir_res = files('lcd_data') / 'data' / 'CensusBureau' / 'US_states'

    # The file is read while the as_file() context is still open, because the path
    # it yields is only guaranteed to be usable inside the context.
    with as_file(us_states_dir_res) as us_states_dir_path:
        return gpd.read_file(us_states_dir_path / 'tl_2024_us_state.shp')


def run_build(
    start_year: int,
    end_year: int,
    region_name: str,
    data_dir: Path,
    plot_dir: Path,
    n_jobs: int = 1,
    offline: bool = False,
    refresh: bool = False,
    verbose: bool = False,
) -> Path:
    '''
    Download, process, and assemble NOAA NCEI Local Climatological Data (LCD) into a netCDF file with
    full-hourly UTC time series for a specified geographic region or individual station over a given
    range of years.

    Parameters
    ----------
    start_year : int
        Inclusive start year of the data range to process.
    end_year : int
        Inclusive end year of the data range to process.
    region_name : str
        Region selector. A two-character value is treated as a U.S. state or territory code and must be
        listed in region_codes.us_states_territories. Otherwise the value is matched against the RTO/ISO
        codes in region_codes.rto_iso_regions and against region_codes.conus. Any other value is treated
        as an individual station ID (GHCNh station identifier).
    data_dir : Path
        Directory where station lists, downloaded LCD files, and outputs are stored. Created if it does
        not exist. A string is accepted and converted to a Path.
    plot_dir : Path
        Directory for diagnostic plots; forwarded unchanged to the hourly time series construction.
        Pass None to skip plot generation.
    n_jobs : int, optional
        Number of parallel download processes, forwarded to ncei.download_many(). If 1, downloads are
        performed serially. Default is 1.
    offline : bool, optional
        If True, the station catalog is read from data_dir, data availability is determined from the
        local files, and nothing is downloaded. Default is False.
    refresh : bool, optional
        Forwarded to ncei.download_many(). In addition, if True the full-hourly UTC time series and the
        netCDF file are rebuilt even if the netCDF file already exists on disk; if False an existing
        netCDF file is left untouched. Default is False.
    verbose : bool, optional
        If True, prints progress messages. Default is False.

    Returns
    -------
    Path
        Path to the NetCDF file containing the full-hourly UTC LCD time series for the selected region
        or station.

    Raises
    ------
    ValueError
        If region_name has two characters but is not a known U.S. state/territory code.

    Notes
    -----
    A station list named '<region_name>.<start_year>-<end_year>.txt' and a netCDF file named
    '<region_name>.<start_year>-<end_year>.nc' are written to data_dir.
    '''

    # Allow callers to pass the data directory as a plain string
    data_dir = Path(data_dir)

    if verbose:
        if offline:
            print('Working offline. All required files must have been downloaded to ' + str(data_dir) + ' in a previous call.')
        else:
            print(
                'Working online. Will download files to '
                + str(data_dir)
                + ' unless they are already present and identical with their version on the NCEI server.'
            )

    # Create data directory unless it exists
    data_dir.mkdir(parents=True, exist_ok=True)

    # Construct datetime objects from the start and end year
    start_date = datetime(year=start_year, month=1, day=1)
    end_date = datetime(year=end_year, month=12, day=31)

    #
    # Identify stations in the selected US state, territory, region, or handle an individual station
    #

    working_on_region = False
    region_gdf = None

    if len(region_name) == 2:
        # Explicit exception instead of assert: asserts are removed when Python runs with -O,
        # which would let an unknown code through and select an empty region.
        if region_name not in region_codes.us_states_territories:
            raise ValueError('US state/territory code ' + region_name + ' is not available.')
        working_on_region = True
        us_gdf = _load_us_states()
        region_gdf = us_gdf[us_gdf['STUSPS'].isin([region_name])]

    elif region_name in region_codes.rto_iso_regions:
        working_on_region = True
        # Load RTO/ISO region GeoJSON from installed distribution using importlib.resources
        rto_iso_geojson_res = files('lcd_data') / 'data' / 'EIA' / 'RTO_ISO_regions.geojson'
        with as_file(rto_iso_geojson_res) as rto_iso_geojson_path:
            region_gdf = rto_iso.region(rto_iso_geojson_path, region_name)

    elif region_name == region_codes.conus:
        working_on_region = True
        us_gdf = _load_us_states()
        # CONUS = everything in the shapefile except these state/territory codes
        exclude_codes = ['AK', 'HI', 'PR', 'GU', 'VI', 'AS', 'MP']
        region_gdf = us_gdf[~us_gdf['STUSPS'].isin(exclude_codes)]

    # File with list of all stations and metadata (named like the file on the NCEI server)

    all_stations_file = data_dir / os.path.basename(ncei.ghcnh_station_list_url)

    if offline:
        # Load the file from disk - it must have been downloaded previously to the data directory
        all_stations = stations.Stations.from_file(all_stations_file)
    else:
        # Load the file from the NCEI server and save it for later use
        all_stations = stations.Stations.from_url()
        all_stations.save_station_list(all_stations_file, verbose=verbose)

    # Select the stations of the region, or the individual station

    if working_on_region:
        region_stations = all_stations.filter_by_region(region_gdf)
    else:
        region_stations = all_stations.filter_by_id(region_name)

    # Filter by data availability

    if offline:
        region_stations = region_stations.filter_by_data_availability_offline(data_dir, start_date, end_date, verbose=verbose)
    else:
        region_stations = region_stations.filter_by_data_availability_online(start_date, end_date, verbose=verbose)

    # Common stem of the output files: <region_name>.<start_year>-<end_year>

    output_stem = region_name + '.' + str(start_date.year) + '-' + str(end_date.year)

    # Save the metadata file for these stations

    region_stations_file = data_dir / (output_stem + '.txt')

    region_stations.save_station_list(region_stations_file)

    # Download LCD station data

    if not offline:
        ncei.download_many(
            start_date.year, end_date.year, region_stations.ids(), data_dir, n_jobs=n_jobs, refresh=refresh, verbose=verbose
        )

    # Construct full-hourly UTC time series from the LCD station data, unless the
    # netCDF file is already present and no refresh was requested

    lcd_netcdf_file = data_dir / (output_stem + '.nc')

    if refresh or not lcd_netcdf_file.exists():
        region_stations.construct_hourly(
            data_dir, start_date.year, end_date.year, region=region_name, plot_dir=plot_dir, verbose=verbose
        )

        # Save full-hourly UTC time series as a netCDF file

        region_stations.write_utc_hourly_netcdf(lcd_netcdf_file, verbose=verbose)

    return lcd_netcdf_file
222
+
223
+
224
def arg_parse(argv=None):
    '''

    Command line argument parser.

    Parses command-line arguments and returns normalized values used by the script.

    Parameters
    ----------
    argv : list[str] or None
        Sequence of argument tokens to parse (excluding the program name). If None,
        arguments are taken from sys.argv[1:].

    Returns
    -------
    tuple
        A 9-tuple (start_year, end_year, region_name, data_dir, plot_dir, n_jobs,
        offline, refresh, verbose) with:

        start_year : int
            Start year of the time range.

        end_year : int
            End year of the time range (inclusive).

        region_name : str
            Region selector, returned as given on the command line. Expected to be a
            two-letter U.S. state or territory code, the special region 'CONUS', an
            RTO/ISO region code, or an individual station ID (GHCNh station identifier).

        data_dir : Path
            Directory into which the data will be downloaded.

        plot_dir : Path | None
            Directory where plots of the original and the interpolated full-hourly time
            series will be created. None if -p/--plotdir was not given.

        n_jobs : int | None
            Number of parallel download processes. None if -n/--n was not given.

        offline : bool
            True if -o/--offline was given.

        refresh : bool
            True if -r/--refresh was given.

        verbose : bool
            True if -v/--verbose was given.

    Raises
    ------
    SystemExit
        If the provided arguments fail validation performed by argparse, or if the
        value given with -n/--n is smaller than 1.
    '''

    code_description = (
        "Download NOAA NCEI Local Climatological Data (LCD) observations for stations located "
        "within a selected U.S. state, territory, the contiguous United States (CONUS), an "
        "RTO/ISO region, or for an individual station by station ID (GHCNh station identifier), "
        "over an inclusive range of years. The script filters stations spatially and by data "
        "availability (for regions) or selects the given station, saves the station list when "
        "applicable, downloads observations, constructs full-hourly UTC time series, optionally "
        "creates diagnostic plots, and writes a NetCDF file.\n\n"
        "Valid region or station arguments:\n\n"
        f"  - US states/territories: {', '.join(region_codes.us_states_territories)}\n\n"
        f"  - Special region: {region_codes.conus}\n\n"
        f"  - RTO/ISO regions: {', '.join(region_codes.rto_iso_regions)}\n\n"
        "  - Individual station: provide a station ID (GHCNh station identifier)\n\n"
        "Parallel downloads can be enabled with -n.\n\n"
        "LCD observation files already present in the download directory and unchanged on the NOAA NCEI server are not re-downloaded."
    )

    # RawDescriptionHelpFormatter keeps the explicit line breaks of the description
    parser = argparse.ArgumentParser(description=code_description, formatter_class=argparse.RawDescriptionHelpFormatter)

    # Mandatory arguments

    parser.add_argument('start_year', type=int, help='Start year of time range.')

    parser.add_argument('end_year', type=int, help='End year of time range (inclusive).')

    parser.add_argument(
        'region_name',
        type=str,
        help=(
            "Region or station selector. Use a two-letter U.S. state or territory code, 'CONUS', "
            "one of the RTO/ISO codes "
            f"({', '.join(region_codes.rto_iso_regions)}), or a station ID (GHCNh station identifier)."
        ),
    )

    parser.add_argument('data_dir', type=str, help='Directory path into which the data will be downloaded.')

    # Optional arguments

    parser.add_argument(
        '-n',
        '--n',
        type=int,
        help=(
            'Number of parallel download processes. n > 1 accelerates downloads significantly, '
            'but can result in network errors or in the server refusing to cooperate.'
        ),
    )

    parser.add_argument(
        '-o',
        '--offline',
        action='store_true',
        help=('Work offline. All required files must have been downloaded to data_dir in a previous call without this flag.'),
    )

    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help=('Print progress information.'),
    )

    parser.add_argument(
        '-r',
        '--refresh',
        action='store_true',
        help=('Download and process files even if they already exist in the data directory'),
    )

    parser.add_argument(
        '-p',
        '--plotdir',
        type=str,
        help=(
            'Directory where plots of the original and the interpolated full-hourly time series '
            'will be created. Very slow. If omitted, no plots are generated.'
        ),
    )

    args = parser.parse_args(argv)

    # A process count below 1 is meaningless; reject it here with a usage message
    # (parser.error exits via SystemExit) instead of failing later in the download step.
    if args.n is not None and args.n < 1:
        parser.error('-n/--n must be a positive integer.')

    # Directory arguments are handed out as Path objects; an omitted plot directory stays None
    data_dir = Path(args.data_dir)
    plot_dir = Path(args.plotdir) if args.plotdir is not None else None

    return (
        args.start_year,
        args.end_year,
        args.region_name,
        data_dir,
        plot_dir,
        args.n,
        args.offline,
        args.refresh,
        args.verbose,
    )
372
+
373
+
374
def main(argv=None):
    '''
    Command line interface entry point.

    Parses the command line with arg_parse() and runs the build with run_build().

    Parameters
    ----------
    argv : list[str] or None
        Argument tokens excluding the program name. If None, sys.argv[1:] is used.

    Returns
    -------
    None
        The path returned by run_build() is deliberately not returned, so that a
        wrapper calling sys.exit(main()) exits with status 0.
    '''

    (start_year, end_year, region_name, data_dir, plot_dir, n_jobs, offline, refresh, verbose) = arg_parse(
        argv if argv is not None else sys.argv[1:]
    )

    # arg_parse() yields None when -n is omitted; run_build() documents an integer
    # with 1 meaning serial downloads, so fall back to that default explicitly.
    if n_jobs is None:
        n_jobs = 1

    run_build(
        start_year, end_year, region_name, data_dir, plot_dir, n_jobs=n_jobs, offline=offline, refresh=refresh, verbose=verbose
    )


if __name__ == '__main__':
    main()
@@ -0,0 +1,31 @@
1
+ This directory contains data from the U.S. Census Bureau.
2
+
3
+ This product uses Census Bureau data but is not endorsed
4
+ or certified by the Census Bureau, and Census Bureau data
5
+ are not covered by the license of the lcd-data package.
6
+
7
+ Dataset:
8
+
9
+ TIGER/Line Shapefiles
10
+
11
+ US_states/tl_2024_us_state.cpg
12
+ US_states/tl_2024_us_state.dbf
13
+ US_states/tl_2024_us_state.prj
14
+ US_states/tl_2024_us_state.shp
15
+ US_states/tl_2024_us_state.shp.ea.iso.xml
16
+ US_states/tl_2024_us_state.shp.iso.xml
17
+ US_states/tl_2024_us_state.shx
18
+
19
+ Source:
20
+
21
+ https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html, accessed September 20, 2025
22
+
23
+ These data are public domain products provided by the U.S. Census Bureau.
24
+ There are no restrictions on their use or redistribution. For more
25
+ information, see the Census Bureau’s terms of service:
26
+
27
+ https://www.census.gov/data/developers/about/terms-of-service.html
28
+
29
+ https://ask.census.gov/prweb/PRServletCustom?pyActivity=pyMobileSnapStart&ArticleID=KCP-4928
30
+
31
+ Attribution: U.S. Census Bureau (September 2025)
@@ -0,0 +1 @@
1
+ GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137,298.257222101]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]