goesgcp 1.0.8__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- goesgcp/main.py +158 -18
- {goesgcp-1.0.8.dist-info → goesgcp-2.0.0.dist-info}/METADATA +23 -2
- goesgcp-2.0.0.dist-info/RECORD +8 -0
- {goesgcp-1.0.8.dist-info → goesgcp-2.0.0.dist-info}/WHEEL +1 -1
- goesgcp-1.0.8.dist-info/RECORD +0 -8
- {goesgcp-1.0.8.dist-info → goesgcp-2.0.0.dist-info}/LICENSE +0 -0
- {goesgcp-1.0.8.dist-info → goesgcp-2.0.0.dist-info}/entry_points.txt +0 -0
- {goesgcp-1.0.8.dist-info → goesgcp-2.0.0.dist-info}/top_level.txt +0 -0
goesgcp/main.py
CHANGED
|
@@ -5,6 +5,7 @@ import xarray as xr
|
|
|
5
5
|
import argparse
|
|
6
6
|
import sys
|
|
7
7
|
import tqdm
|
|
8
|
+
import pandas as pd
|
|
8
9
|
from distutils.util import strtobool
|
|
9
10
|
from multiprocessing import Pool
|
|
10
11
|
from google.cloud import storage
|
|
@@ -21,12 +22,90 @@ def list_blobs(connection, bucket_name, prefix):
|
|
|
21
22
|
bucket = connection.bucket(bucket_name)
|
|
22
23
|
|
|
23
24
|
blobs = bucket.list_blobs(prefix=prefix)
|
|
25
|
+
|
|
24
26
|
return blobs
|
|
25
27
|
|
|
26
28
|
def get_directory_prefix(year, julian_day, hour):
|
|
27
29
|
"""Generates the directory path based on year, Julian day, and hour."""
|
|
28
30
|
return f"{year}/{julian_day}/{str(hour).zfill(2)}/"
|
|
29
31
|
|
|
32
|
+
|
|
33
|
+
def get_files_period(connection, bucket_name, base_prefix, pattern,
|
|
34
|
+
start, end, bt_hour=[0, 23], bt_min=[0, 60], freq='10 min'):
|
|
35
|
+
"""
|
|
36
|
+
Fetches files from a GCP bucket within a specified time period and returns them as a DataFrame.
|
|
37
|
+
|
|
38
|
+
:param connection: The GCP storage client connection.
|
|
39
|
+
:param bucket_name: Name of the GCP bucket.
|
|
40
|
+
:param base_prefix: Base directory prefix for the files.
|
|
41
|
+
:param pattern: Search pattern for file names.
|
|
42
|
+
:param start: Start datetime (inclusive).
|
|
43
|
+
:param end: End datetime (exclusive).
|
|
44
|
+
:return: DataFrame containing the file names and their metadata.
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
print(f"GOESGCP: Fetching files between {start} and {end}...")
|
|
48
|
+
|
|
49
|
+
# Ensure datetime objects
|
|
50
|
+
start = pd.to_datetime(start).tz_localize('UTC')
|
|
51
|
+
end = pd.to_datetime(end).tz_localize('UTC')
|
|
52
|
+
|
|
53
|
+
# Initialize list to store file metadata
|
|
54
|
+
files_metadata = []
|
|
55
|
+
|
|
56
|
+
# Generate the list of dates from start to end
|
|
57
|
+
current_time = start
|
|
58
|
+
while current_time < end:
|
|
59
|
+
year = current_time.year
|
|
60
|
+
julian_day = str(current_time.timetuple().tm_yday).zfill(3) # Julian day
|
|
61
|
+
hour = current_time.hour
|
|
62
|
+
|
|
63
|
+
# Generate the directory prefix
|
|
64
|
+
prefix = f"{base_prefix}/{get_directory_prefix(year, julian_day, hour)}"
|
|
65
|
+
|
|
66
|
+
# List blobs in the bucket for the current prefix
|
|
67
|
+
blobs = list_blobs(connection, bucket_name, prefix)
|
|
68
|
+
|
|
69
|
+
# Filter blobs by pattern
|
|
70
|
+
for blob in blobs:
|
|
71
|
+
if pattern in blob.name:
|
|
72
|
+
files_metadata.append({
|
|
73
|
+
'file_name': blob.name,
|
|
74
|
+
'last_modified': blob.updated
|
|
75
|
+
})
|
|
76
|
+
|
|
77
|
+
# Move to the next hour
|
|
78
|
+
current_time += timedelta(hours=1)
|
|
79
|
+
|
|
80
|
+
# Create a DataFrame from the list of files
|
|
81
|
+
df = pd.DataFrame(files_metadata)
|
|
82
|
+
|
|
83
|
+
if df.empty:
|
|
84
|
+
print("No files found matching the pattern.")
|
|
85
|
+
return pd.DataFrame()
|
|
86
|
+
|
|
87
|
+
# Ensure 'last_modified' is in the correct datetime format without timezone
|
|
88
|
+
df['last_modified'] = pd.to_datetime(df['last_modified']).dt.tz_localize(None)
|
|
89
|
+
start = pd.to_datetime(start).tz_localize(None)
|
|
90
|
+
end = pd.to_datetime(end).tz_localize(None)
|
|
91
|
+
|
|
92
|
+
# Filter the DataFrame based on the date range
|
|
93
|
+
df = df[(df['last_modified'] >= start) & (df['last_modified'] < end)]
|
|
94
|
+
|
|
95
|
+
# Filter the DataFrame based on the hour range
|
|
96
|
+
df['hour'] = df['last_modified'].dt.hour
|
|
97
|
+
df = df[(df['hour'] >= bt_hour[0]) & (df['hour'] <= bt_hour[1])]
|
|
98
|
+
|
|
99
|
+
# Filter the DataFrame based on the minute range
|
|
100
|
+
df['minute'] = df['last_modified'].dt.minute
|
|
101
|
+
df = df[(df['minute'] >= bt_min[0]) & (df['minute'] <= bt_min[1])]
|
|
102
|
+
|
|
103
|
+
# Filter the DataFrame based on the frequency
|
|
104
|
+
df['freq'] = df['last_modified'].dt.floor(freq)
|
|
105
|
+
df = df.groupby('freq').first().reset_index()
|
|
106
|
+
|
|
107
|
+
return df['file_name'].tolist()
|
|
108
|
+
|
|
30
109
|
def get_recent_files(connection, bucket_name, base_prefix, pattern, min_files):
|
|
31
110
|
"""
|
|
32
111
|
Fetches the most recent files in a GCP bucket.
|
|
@@ -44,6 +123,8 @@ def get_recent_files(connection, bucket_name, base_prefix, pattern, min_files):
|
|
|
44
123
|
while len(files) < min_files:
|
|
45
124
|
year = current_time.year
|
|
46
125
|
julian_day = current_time.timetuple().tm_yday # Get the Julian day
|
|
126
|
+
# Add 3 digits to the Julian day
|
|
127
|
+
julian_day = str(julian_day).zfill(3)
|
|
47
128
|
hour = current_time.hour
|
|
48
129
|
|
|
49
130
|
# Generate the directory prefix for the current date and time
|
|
@@ -145,12 +226,31 @@ def crop_reproject(args):
|
|
|
145
226
|
# Add global metadata comments
|
|
146
227
|
ds.attrs['comments'] = "Data processed by goesgcp, author: Helvecio B. L. Neto (helvecioblneto@gmail.com)"
|
|
147
228
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
229
|
+
if save_format == 'by_date':
|
|
230
|
+
file_datetime = datetime.strptime(ds.time_coverage_start,
|
|
231
|
+
"%Y-%m-%dT%H:%M:%S.%fZ")
|
|
232
|
+
year = file_datetime.strftime("%Y")
|
|
233
|
+
month = file_datetime.strftime("%m")
|
|
234
|
+
day = file_datetime.strftime("%d")
|
|
235
|
+
output_directory = f"{output}{year}/{month}/{day}/"
|
|
236
|
+
elif save_format == 'julian':
|
|
237
|
+
file_datetime = datetime.strptime(ds.time_coverage_start,
|
|
238
|
+
"%Y-%m-%dT%H:%M:%S.%fZ")
|
|
239
|
+
year = file_datetime.strftime("%Y")
|
|
240
|
+
julian_day = file_datetime.timetuple().tm_yday
|
|
241
|
+
output_directory = f"{output}{year}/{julian_day}/"
|
|
242
|
+
else:
|
|
243
|
+
output_directory = output
|
|
244
|
+
|
|
245
|
+
# Create the output directory
|
|
246
|
+
pathlib.Path(output_directory).mkdir(parents=True, exist_ok=True)
|
|
247
|
+
|
|
248
|
+
# Save the file
|
|
249
|
+
output_file = f"{output_directory}{file.split('/')[-1]}"
|
|
250
|
+
ds.to_netcdf(output_file, mode='w', format='NETCDF4_CLASSIC')
|
|
251
|
+
|
|
252
|
+
# Fechar o dataset
|
|
152
253
|
ds.close()
|
|
153
|
-
|
|
154
254
|
return
|
|
155
255
|
|
|
156
256
|
|
|
@@ -194,14 +294,20 @@ def main():
|
|
|
194
294
|
|
|
195
295
|
global output_path, var_name, \
|
|
196
296
|
lat_min, lat_max, lon_min, lon_max, \
|
|
197
|
-
max_attempts, parallel, recent, resolution, storage_client
|
|
297
|
+
max_attempts, parallel, recent, resolution, storage_client, \
|
|
298
|
+
satellite, product, domain, op_mode, channel, save_format
|
|
198
299
|
|
|
199
300
|
epilog = """
|
|
200
301
|
Example usage:
|
|
201
302
|
|
|
202
|
-
- To download recent
|
|
303
|
+
- To download recent 3 files from the GOES-16 satellite for the ABI-L2-CMIPF product:
|
|
304
|
+
|
|
305
|
+
goesgcp --satellite goes16 --product ABI-L2-CMIP --recent 3"
|
|
306
|
+
|
|
307
|
+
- To download files from the GOES-16 satellite for the ABI-L2-CMIPF product between 2022-12-15 and 2022-12-20:
|
|
308
|
+
|
|
309
|
+
goesgcp --start '2022-12-15 00:00:00' --end '2022-12-20 10:00:00' --bt_hour 5 6 --save_format by_date --resolution 0.045 --lat_min -35 --lat_max 5 --lon_min -80 --lon_max -30
|
|
203
310
|
|
|
204
|
-
goesgcp --satellite goes16 --product ABI-L2-CMIP --recent 10 --output_path "output/"
|
|
205
311
|
"""
|
|
206
312
|
|
|
207
313
|
|
|
@@ -216,7 +322,17 @@ def main():
|
|
|
216
322
|
parser.add_argument('--var_name', type=str, default='CMI', help='Variable name to extract (e.g., CMI)')
|
|
217
323
|
parser.add_argument('--channel', type=int, default=13, help='Channel to use (e.g., 13)')
|
|
218
324
|
parser.add_argument('--domain', type=str, default='F', help='Domain to use (e.g., F or C)')
|
|
219
|
-
parser.add_argument('--
|
|
325
|
+
parser.add_argument('--op_mode', type=str, default='M6C', help='Operational mode to use (e.g., M6C)')
|
|
326
|
+
|
|
327
|
+
# Recent files settings
|
|
328
|
+
parser.add_argument('--recent', type=int, help='Number of recent files to download (e.g., 3)')
|
|
329
|
+
|
|
330
|
+
# Date and time settings
|
|
331
|
+
parser.add_argument('--start', type=str, help='Start date in YYYY-MM-DD format')
|
|
332
|
+
parser.add_argument('--end', type=str, help='End date in YYYY-MM-DD format')
|
|
333
|
+
parser.add_argument('--freq', type=str, default='10 min', help='Frequency for the time range (e.g., "10 min")')
|
|
334
|
+
parser.add_argument('--bt_hour', nargs=2, type=int, default=[0, 23], help='Filter data between these hours (e.g., 0 23)')
|
|
335
|
+
parser.add_argument('--bt_min', nargs=2, type=int, default=[0, 60], help='Filter data between these minutes (e.g., 0 60)')
|
|
220
336
|
|
|
221
337
|
# Geographic bounding box
|
|
222
338
|
parser.add_argument('--lat_min', type=float, default=-81.3282, help='Minimum latitude of the bounding box')
|
|
@@ -230,6 +346,9 @@ def main():
|
|
|
230
346
|
parser.add_argument('--parallel', type=lambda x: bool(strtobool(x)), default=True, help='Use parallel processing')
|
|
231
347
|
parser.add_argument('--processes', type=int, default=4, help='Number of processes for parallel execution')
|
|
232
348
|
parser.add_argument('--max_attempts', type=int, default=3, help='Number of attempts to download a file')
|
|
349
|
+
parser.add_argument('--save_format', type=str, default='flat', choices=['flat', 'by_date','julian'],
|
|
350
|
+
help="Save the files in a flat structure or by date")
|
|
351
|
+
|
|
233
352
|
|
|
234
353
|
# Parse arguments
|
|
235
354
|
args = parser.parse_args()
|
|
@@ -243,6 +362,7 @@ def main():
|
|
|
243
362
|
satellite = args.satellite
|
|
244
363
|
product = args.product
|
|
245
364
|
domain = args.domain
|
|
365
|
+
op_mode = args.op_mode
|
|
246
366
|
channel = str(args.channel).zfill(2)
|
|
247
367
|
var_name = args.var_name
|
|
248
368
|
lat_min = args.lat_min
|
|
@@ -252,11 +372,22 @@ def main():
|
|
|
252
372
|
resolution = args.resolution
|
|
253
373
|
max_attempts = args.max_attempts
|
|
254
374
|
parallel = args.parallel
|
|
375
|
+
recent = args.recent
|
|
376
|
+
start = args.start
|
|
377
|
+
end = args.end
|
|
378
|
+
freq = args.freq
|
|
379
|
+
bt_hour = args.bt_hour
|
|
380
|
+
bt_min = args.bt_min
|
|
381
|
+
save_format = args.save_format
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
# Check mandatory arguments
|
|
385
|
+
if not args.recent and not (args.start and args.end):
|
|
386
|
+
print("You must provide either the --recent or --start and --end arguments. Exiting...")
|
|
387
|
+
sys.exit(1)
|
|
255
388
|
|
|
256
389
|
# Set bucket name and pattern
|
|
257
390
|
bucket_name = "gcp-public-data-" + satellite
|
|
258
|
-
pattern = "OR_"+product+domain+"-M6C"+channel+"_G" + satellite[-2:]
|
|
259
|
-
min_files = args.recent
|
|
260
391
|
|
|
261
392
|
# Create output directory
|
|
262
393
|
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
|
|
@@ -271,11 +402,20 @@ def main():
|
|
|
271
402
|
print(f"Bucket {bucket_name} not found. Exiting...")
|
|
272
403
|
sys.exit(1)
|
|
273
404
|
|
|
274
|
-
#
|
|
275
|
-
|
|
405
|
+
# Set pattern for the files
|
|
406
|
+
pattern = "OR_"+product+domain+"-"+op_mode+channel+"_G" + satellite[-2:]
|
|
407
|
+
|
|
408
|
+
# Check operational mode if is recent or specific date
|
|
409
|
+
if start and end:
|
|
410
|
+
files_list = get_files_period(storage_client, bucket_name,
|
|
411
|
+
product + domain, pattern, start, end,
|
|
412
|
+
bt_hour, bt_min, freq)
|
|
413
|
+
else:
|
|
414
|
+
# Get recent files
|
|
415
|
+
files_list = get_recent_files(storage_client, bucket_name, product + domain, pattern, recent)
|
|
276
416
|
|
|
277
417
|
# Check if any files were found
|
|
278
|
-
if not
|
|
418
|
+
if not files_list:
|
|
279
419
|
print(f"No files found with the pattern {pattern}. Exiting...")
|
|
280
420
|
sys.exit(1)
|
|
281
421
|
|
|
@@ -283,14 +423,14 @@ def main():
|
|
|
283
423
|
pathlib.Path('tmp/').mkdir(parents=True, exist_ok=True)
|
|
284
424
|
|
|
285
425
|
# Download files
|
|
286
|
-
print(f"GOESGCP: Downloading and processing {len(
|
|
287
|
-
loading_bar = tqdm.tqdm(total=len(
|
|
426
|
+
print(f"GOESGCP: Downloading and processing {len(files_list)} files...")
|
|
427
|
+
loading_bar = tqdm.tqdm(total=len(files_list), ncols=100, position=0, leave=True,
|
|
288
428
|
bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} + \
|
|
289
429
|
[Elapsed:{elapsed} Remaining:<{remaining}]')
|
|
290
430
|
|
|
291
431
|
if parallel: # Run in parallel
|
|
292
432
|
# Create a list of tasks
|
|
293
|
-
tasks = [(bucket_name, file, f"tmp/{file.split('/')[-1]}") for file in
|
|
433
|
+
tasks = [(bucket_name, file, f"tmp/{file.split('/')[-1]}") for file in files_list]
|
|
294
434
|
|
|
295
435
|
# Download files in parallel
|
|
296
436
|
with Pool(processes=args.processes) as pool:
|
|
@@ -298,7 +438,7 @@ def main():
|
|
|
298
438
|
loading_bar.update(1)
|
|
299
439
|
loading_bar.close()
|
|
300
440
|
else: # Run in serial
|
|
301
|
-
for file in
|
|
441
|
+
for file in files_list:
|
|
302
442
|
local_path = f"tmp/{file.split('/')[-1]}"
|
|
303
443
|
process_file((bucket_name, file, local_path))
|
|
304
444
|
loading_bar.update(1)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: goesgcp
|
|
3
|
-
Version:
|
|
3
|
+
Version: 2.0.0
|
|
4
4
|
Summary: A package to download and process GOES-16/17 data
|
|
5
5
|
Home-page: https://github.com/helvecioneto/goesgcp
|
|
6
6
|
Author: Helvecio B. L. Neto
|
|
@@ -22,6 +22,15 @@ Requires-Dist: pyproj
|
|
|
22
22
|
Requires-Dist: xarray
|
|
23
23
|
Requires-Dist: netcdf4
|
|
24
24
|
Requires-Dist: rioxarray
|
|
25
|
+
Dynamic: author
|
|
26
|
+
Dynamic: author-email
|
|
27
|
+
Dynamic: classifier
|
|
28
|
+
Dynamic: description
|
|
29
|
+
Dynamic: description-content-type
|
|
30
|
+
Dynamic: home-page
|
|
31
|
+
Dynamic: license
|
|
32
|
+
Dynamic: requires-dist
|
|
33
|
+
Dynamic: summary
|
|
25
34
|
|
|
26
35
|
# goesgcp
|
|
27
36
|
|
|
@@ -58,6 +67,12 @@ goesgcp [OPTIONS]
|
|
|
58
67
|
| `--lon_min` | Minimum longitude of the bounding box (default: `-116`). |
|
|
59
68
|
| `--lon_max` | Maximum longitude of the bounding box (default: `-25`). |
|
|
60
69
|
| `--resolution` | Set the resolution of the reprojected data, in degrees (default: `-0.045`). |
|
|
70
|
+
| `--recent` | Number of most recent files to download (no default; required unless `--start` and `--end` are given). |
|
|
71
|
+
| `--start` | Start date for downloading data (default: `None`). |
|
|
72
|
+
| `--end` | End date for downloading data (default: `None`). |
|
|
73
|
+
| `--bt_hour` | Hour range of the day to download data, given as two values: first and last hour, inclusive (default: `0 23`). |
|
|
74
|
+
| `--bt_min` | Minute range of the hour to download data, given as two values: first and last minute, inclusive (default: `0 60`). |
|
|
75
|
+
| `--save_format` | Directory layout for saved output files: `flat`, `by_date`, or `julian` (default: `flat`). |
|
|
61
76
|
|
|
62
77
|
### Examples
|
|
63
78
|
|
|
@@ -67,5 +82,11 @@ To download most 3 recent data for the GOES-16 satellite, ABI-L2-CMIPF product,
|
|
|
67
82
|
goesgcp --satellite goes16 --product ABI-L2-CMIPF --var_name CMI --channel 13 --recent 3 --output "output/"
|
|
68
83
|
```
|
|
69
84
|
|
|
85
|
+
To download data for a specific date range, use the `--start` and `--end` options:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
goesgcp --start '2022-12-15 00:00:00' --end '2022-12-20 10:00:00' --bt_hour 5 6 --save_format by_date --resolution 0.045 --lat_min -35 --lat_max 5 --lon_min -80 --lon_max -30
|
|
89
|
+
```
|
|
90
|
+
|
|
70
91
|
### Credits
|
|
71
92
|
And this is an optimization by Helvecio Neto - 2025
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
goesgcp/__init__.py,sha256=MigXIT7A1M9YZuH2MyjKReSziFwzbZX2boVYsLosR6s,22
|
|
2
|
+
goesgcp/main.py,sha256=5vCAcSuMgFRzTLKZL4IUGgXVBxM3Buw19l-2MrpGR9M,16837
|
|
3
|
+
goesgcp-2.0.0.dist-info/LICENSE,sha256=AHeZifD4UyBZI61Ug5lETXgX3Anp_XfAvFXQqrW9AnU,1078
|
|
4
|
+
goesgcp-2.0.0.dist-info/METADATA,sha256=fgCGPkdI51BKwv7kpgiZgHP9WZbXhSdWcxkPhaZ2fKk,4066
|
|
5
|
+
goesgcp-2.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
6
|
+
goesgcp-2.0.0.dist-info/entry_points.txt,sha256=6afMW51WnUR9VZ_xvDoiB8JQb2OFiLuzRtV6dPL__OQ,46
|
|
7
|
+
goesgcp-2.0.0.dist-info/top_level.txt,sha256=C-C3vipI0AwEDW9nWFkJ6D0TkcKkIYlyyM15LMskUEc,8
|
|
8
|
+
goesgcp-2.0.0.dist-info/RECORD,,
|
goesgcp-1.0.8.dist-info/RECORD
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
goesgcp/__init__.py,sha256=MigXIT7A1M9YZuH2MyjKReSziFwzbZX2boVYsLosR6s,22
|
|
2
|
-
goesgcp/main.py,sha256=_7QyMp7MRfAvCb5ChqTc2dyeyQwc5ftH5nJJz6HiD4Y,11100
|
|
3
|
-
goesgcp-1.0.8.dist-info/LICENSE,sha256=AHeZifD4UyBZI61Ug5lETXgX3Anp_XfAvFXQqrW9AnU,1078
|
|
4
|
-
goesgcp-1.0.8.dist-info/METADATA,sha256=IlkX413bUXozaKP2s65cj8aq2HbAPRbdFNEkPwiaA0o,2993
|
|
5
|
-
goesgcp-1.0.8.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
|
6
|
-
goesgcp-1.0.8.dist-info/entry_points.txt,sha256=6afMW51WnUR9VZ_xvDoiB8JQb2OFiLuzRtV6dPL__OQ,46
|
|
7
|
-
goesgcp-1.0.8.dist-info/top_level.txt,sha256=C-C3vipI0AwEDW9nWFkJ6D0TkcKkIYlyyM15LMskUEc,8
|
|
8
|
-
goesgcp-1.0.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|