goesgcp 1.0.8__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: goesgcp
3
- Version: 1.0.8
3
+ Version: 2.0.0
4
4
  Summary: A package to download and process GOES-16/17 data
5
5
  Home-page: https://github.com/helvecioneto/goesgcp
6
6
  Author: Helvecio B. L. Neto
@@ -22,6 +22,15 @@ Requires-Dist: pyproj
22
22
  Requires-Dist: xarray
23
23
  Requires-Dist: netcdf4
24
24
  Requires-Dist: rioxarray
25
+ Dynamic: author
26
+ Dynamic: author-email
27
+ Dynamic: classifier
28
+ Dynamic: description
29
+ Dynamic: description-content-type
30
+ Dynamic: home-page
31
+ Dynamic: license
32
+ Dynamic: requires-dist
33
+ Dynamic: summary
25
34
 
26
35
  # goesgcp
27
36
 
@@ -58,6 +67,12 @@ goesgcp [OPTIONS]
58
67
  | `--lon_min` | Minimum longitude of the bounding box (default: `-116`). |
59
68
  | `--lon_max` | Maximum longitude of the bounding box (default: `-25`). |
60
69
  | `--resolution` | Set the reprojected data resolution in degrees (default: `-0.045`). |
70
+ | `--recent` | Number of most recent files to download (no default; required unless `--start` and `--end` are given). |
71
+ | `--start` | Start date and time for downloading data, e.g. `'2022-12-15 00:00:00'` (default: `None`). |
72
+ | `--end` | End date and time (exclusive) for downloading data (default: `None`). |
73
+ | `--bt_hour` | First and last hour of the day (inclusive) between which data are downloaded (default: `0 23`). |
74
+ | `--bt_min` | First and last minute of the hour (inclusive) between which data are downloaded (default: `0 60`). |
75
+ | `--save_format` | Directory layout for saved files: `flat`, `by_date`, or `julian` (default: `flat`). |
61
76
 
62
77
  ### Examples
63
78
 
@@ -67,5 +82,11 @@ To download most 3 recent data for the GOES-16 satellite, ABI-L2-CMIPF product,
67
82
  goesgcp --satellite goes16 --product ABI-L2-CMIPF --var_name CMI --channel 13 --recent 3 --output "output/"
68
83
  ```
69
84
 
85
+ To download data for a specific date range, use the `--start` and `--end` options:
86
+
87
+ ```bash
88
+ goesgcp --start '2022-12-15 00:00:00' --end '2022-12-20 10:00:00' --bt_hour 5 6 --save_format by_date --resolution 0.045 --lat_min -35 --lat_max 5 --lon_min -80 --lon_max -30
89
+ ```
90
+
70
91
  ### Credits
71
92
  And this is an optimization by Helvecio Neto - 2025
@@ -33,6 +33,12 @@ goesgcp [OPTIONS]
33
33
  | `--lon_min` | Minimum longitude of the bounding box (default: `-116`). |
34
34
  | `--lon_max` | Maximum longitude of the bounding box (default: `-25`). |
35
35
  | `--resolution` | Set the reprojected data resolution in degrees (default: `-0.045`). |
36
+ | `--recent` | Number of most recent files to download (no default; required unless `--start` and `--end` are given). |
37
+ | `--start` | Start date and time for downloading data, e.g. `'2022-12-15 00:00:00'` (default: `None`). |
38
+ | `--end` | End date and time (exclusive) for downloading data (default: `None`). |
39
+ | `--bt_hour` | First and last hour of the day (inclusive) between which data are downloaded (default: `0 23`). |
40
+ | `--bt_min` | First and last minute of the hour (inclusive) between which data are downloaded (default: `0 60`). |
41
+ | `--save_format` | Directory layout for saved files: `flat`, `by_date`, or `julian` (default: `flat`). |
36
42
 
37
43
  ### Examples
38
44
 
@@ -42,5 +48,11 @@ To download most 3 recent data for the GOES-16 satellite, ABI-L2-CMIPF product,
42
48
  goesgcp --satellite goes16 --product ABI-L2-CMIPF --var_name CMI --channel 13 --recent 3 --output "output/"
43
49
  ```
44
50
 
51
+ To download data for a specific date range, use the `--start` and `--end` options:
52
+
53
+ ```bash
54
+ goesgcp --start '2022-12-15 00:00:00' --end '2022-12-20 10:00:00' --bt_hour 5 6 --save_format by_date --resolution 0.045 --lat_min -35 --lat_max 5 --lon_min -80 --lon_max -30
55
+ ```
56
+
45
57
  ### Credits
46
58
  And this is an optimization by Helvecio Neto - 2025
@@ -5,6 +5,7 @@ import xarray as xr
5
5
  import argparse
6
6
  import sys
7
7
  import tqdm
8
+ import pandas as pd
8
9
  from distutils.util import strtobool
9
10
  from multiprocessing import Pool
10
11
  from google.cloud import storage
@@ -21,12 +22,90 @@ def list_blobs(connection, bucket_name, prefix):
21
22
  bucket = connection.bucket(bucket_name)
22
23
 
23
24
  blobs = bucket.list_blobs(prefix=prefix)
25
+
24
26
  return blobs
25
27
 
26
28
def get_directory_prefix(year, julian_day, hour):
    """Generates the directory path based on year, Julian day, and hour.

    The day-of-year segment is zero-padded to three digits and the hour segment
    to two digits, so callers may pass either plain integers or strings that are
    already padded (padding an already-padded string leaves it unchanged).

    :param year: Year of the directory (e.g., 2022).
    :param julian_day: Day of the year, as an int or a string.
    :param hour: Hour of the day, as an int or a string.
    :return: Prefix of the form ``"YYYY/DDD/HH/"`` (with a trailing slash).
    """
    # Callers pad the day of year to 3 digits before building the prefix;
    # doing it here as well keeps an un-padded integer from producing a
    # different path such as "2022/5/03/".
    return f"{year}/{str(julian_day).zfill(3)}/{str(hour).zfill(2)}/"
29
31
 
32
+
33
def get_files_period(connection, bucket_name, base_prefix, pattern,
                     start, end, bt_hour=(0, 23), bt_min=(0, 60), freq='10 min'):
    """
    Fetches the names of files in a GCP bucket within a specified time period.

    Every hourly directory between ``start`` and ``end`` is listed, blobs whose
    name contains ``pattern`` are collected, and the candidates are then filtered
    on each blob's ``updated`` timestamp: by the date range, by the hour range,
    by the minute range, and finally reduced to one file per ``freq`` bin.

    :param connection: The GCP storage client connection.
    :param bucket_name: Name of the GCP bucket.
    :param base_prefix: Base directory prefix for the files.
    :param pattern: Substring that a blob name must contain to be selected.
    :param start: Start datetime (inclusive). Naive values are treated as UTC.
    :param end: End datetime (exclusive). Naive values are treated as UTC.
    :param bt_hour: Pair ``(first, last)`` of hours, both inclusive.
    :param bt_min: Pair ``(first, last)`` of minutes, both inclusive.
    :param freq: pandas frequency string; at most one file is kept per bin.
    :return: List of blob names, ordered by time bin. An empty list is returned
        when nothing matches, so callers can test the result with ``not``.
    """

    print(f"GOESGCP: Fetching files between {start} and {end}...")

    # Normalise both bounds to UTC-aware timestamps
    start = _to_utc_timestamp(start)
    end = _to_utc_timestamp(end)

    # Initialize list to store file metadata
    files_metadata = []

    # Directories are organised per whole hour, so start the walk at the top of
    # the starting hour. Stepping from an un-truncated start (e.g. 00:30) would
    # overshoot ``end`` one iteration early and never list the last hour.
    current_time = start.replace(minute=0, second=0, microsecond=0, nanosecond=0)
    one_hour = pd.Timedelta(hours=1)
    while current_time < end:
        julian_day = str(current_time.dayofyear).zfill(3)  # Julian day
        directory = get_directory_prefix(current_time.year, julian_day, current_time.hour)
        prefix = f"{base_prefix}/{directory}"

        # List blobs under the current prefix and keep those matching the pattern
        for blob in list_blobs(connection, bucket_name, prefix):
            if pattern in blob.name:
                files_metadata.append({
                    'file_name': blob.name,
                    'last_modified': blob.updated
                })

        # Move to the next hour
        current_time += one_hour

    if not files_metadata:
        print("No files found matching the pattern.")
        # Return a list (not a DataFrame): the truth value of a DataFrame is
        # ambiguous and raises ValueError in an ``if not result`` check.
        return []

    df = pd.DataFrame(files_metadata)

    # NOTE(review): filtering uses the blob's ``updated`` time, not a time parsed
    # from the file name -- confirm this matches the intended observation time.
    modified = pd.to_datetime(df['last_modified'], utc=True)

    # Build one boolean mask instead of adding helper columns to a filtered
    # slice, which avoids pandas' chained-assignment warning.
    keep = (
        (modified >= start) & (modified < end)
        & modified.dt.hour.between(bt_hour[0], bt_hour[1])
        & modified.dt.minute.between(bt_min[0], bt_min[1])
    )

    # Keep the first listed file of each frequency bin
    bins = modified.loc[keep].dt.floor(freq)
    return df.loc[keep].groupby(bins)['file_name'].first().tolist()


def _to_utc_timestamp(value):
    """Parse ``value`` with pandas and return it as a UTC-aware Timestamp."""
    stamp = pd.to_datetime(value)
    if stamp.tzinfo is None:
        # Naive input is interpreted as UTC
        return stamp.tz_localize('UTC')
    # tz_localize raises on aware input, so convert instead
    return stamp.tz_convert('UTC')
108
+
30
109
  def get_recent_files(connection, bucket_name, base_prefix, pattern, min_files):
31
110
  """
32
111
  Fetches the most recent files in a GCP bucket.
@@ -44,6 +123,8 @@ def get_recent_files(connection, bucket_name, base_prefix, pattern, min_files):
44
123
  while len(files) < min_files:
45
124
  year = current_time.year
46
125
  julian_day = current_time.timetuple().tm_yday # Get the Julian day
126
+ # Add 3 digits to the Julian day
127
+ julian_day = str(julian_day).zfill(3)
47
128
  hour = current_time.hour
48
129
 
49
130
  # Generate the directory prefix for the current date and time
@@ -145,12 +226,31 @@ def crop_reproject(args):
145
226
  # Add global metadata comments
146
227
  ds.attrs['comments'] = "Data processed by goesgcp, author: Helvecio B. L. Neto (helvecioblneto@gmail.com)"
147
228
 
148
- # Save as netcdf overwriting the original file
149
- ds.to_netcdf(f'{output}{file.split("/")[-1]}', mode='w', format='NETCDF4_CLASSIC')
150
-
151
- # Close the dataset
229
+ if save_format == 'by_date':
230
+ file_datetime = datetime.strptime(ds.time_coverage_start,
231
+ "%Y-%m-%dT%H:%M:%S.%fZ")
232
+ year = file_datetime.strftime("%Y")
233
+ month = file_datetime.strftime("%m")
234
+ day = file_datetime.strftime("%d")
235
+ output_directory = f"{output}{year}/{month}/{day}/"
236
+ elif save_format == 'julian':
237
+ file_datetime = datetime.strptime(ds.time_coverage_start,
238
+ "%Y-%m-%dT%H:%M:%S.%fZ")
239
+ year = file_datetime.strftime("%Y")
240
+ julian_day = file_datetime.timetuple().tm_yday
241
+ output_directory = f"{output}{year}/{julian_day}/"
242
+ else:
243
+ output_directory = output
244
+
245
+ # Create the output directory
246
+ pathlib.Path(output_directory).mkdir(parents=True, exist_ok=True)
247
+
248
+ # Save the file
249
+ output_file = f"{output_directory}{file.split('/')[-1]}"
250
+ ds.to_netcdf(output_file, mode='w', format='NETCDF4_CLASSIC')
251
+
252
+ # Close the dataset
152
253
  ds.close()
153
-
154
254
  return
155
255
 
156
256
 
@@ -194,14 +294,20 @@ def main():
194
294
 
195
295
  global output_path, var_name, \
196
296
  lat_min, lat_max, lon_min, lon_max, \
197
- max_attempts, parallel, recent, resolution, storage_client
297
+ max_attempts, parallel, recent, resolution, storage_client, \
298
+ satellite, product, domain, op_mode, channel, save_format
198
299
 
199
300
  epilog = """
200
301
  Example usage:
201
302
 
202
- - To download recent 10 files from the GOES-16 satellite for the ABI-L2-CMIPF product:
303
+ - To download recent 3 files from the GOES-16 satellite for the ABI-L2-CMIPF product:
304
+
305
+ goesgcp --satellite goes16 --product ABI-L2-CMIP --recent 3"
306
+
307
+ - To download files from the GOES-16 satellite for the ABI-L2-CMIPF product between 2022-12-15 and 2022-12-20:
308
+
309
+ goesgcp --start '2022-12-15 00:00:00' --end '2022-12-20 10:00:00' --bt_hour 5 6 --save_format by_date --resolution 0.045 --lat_min -35 --lat_max 5 --lon_min -80 --lon_max -30
203
310
 
204
- goesgcp --satellite goes16 --product ABI-L2-CMIP --recent 10 --output_path "output/"
205
311
  """
206
312
 
207
313
 
@@ -216,7 +322,17 @@ def main():
216
322
  parser.add_argument('--var_name', type=str, default='CMI', help='Variable name to extract (e.g., CMI)')
217
323
  parser.add_argument('--channel', type=int, default=13, help='Channel to use (e.g., 13)')
218
324
  parser.add_argument('--domain', type=str, default='F', help='Domain to use (e.g., F or C)')
219
- parser.add_argument('--recent', type=int, default=3, help='Number of recent files to download')
325
+ parser.add_argument('--op_mode', type=str, default='M6C', help='Operational mode to use (e.g., M6C)')
326
+
327
+ # Recent files settings
328
+ parser.add_argument('--recent', type=int, help='Number of recent files to download (e.g., 3)')
329
+
330
+ # Date and time settings
331
+ parser.add_argument('--start', type=str, help='Start date in YYYY-MM-DD format')
332
+ parser.add_argument('--end', type=str, help='End date in YYYY-MM-DD format')
333
+ parser.add_argument('--freq', type=str, default='10 min', help='Frequency for the time range (e.g., "10 min")')
334
+ parser.add_argument('--bt_hour', nargs=2, type=int, default=[0, 23], help='Filter data between these hours (e.g., 0 23)')
335
+ parser.add_argument('--bt_min', nargs=2, type=int, default=[0, 60], help='Filter data between these minutes (e.g., 0 60)')
220
336
 
221
337
  # Geographic bounding box
222
338
  parser.add_argument('--lat_min', type=float, default=-81.3282, help='Minimum latitude of the bounding box')
@@ -230,6 +346,9 @@ def main():
230
346
  parser.add_argument('--parallel', type=lambda x: bool(strtobool(x)), default=True, help='Use parallel processing')
231
347
  parser.add_argument('--processes', type=int, default=4, help='Number of processes for parallel execution')
232
348
  parser.add_argument('--max_attempts', type=int, default=3, help='Number of attempts to download a file')
349
+ parser.add_argument('--save_format', type=str, default='flat', choices=['flat', 'by_date','julian'],
350
+ help="Save the files in a flat structure or by date")
351
+
233
352
 
234
353
  # Parse arguments
235
354
  args = parser.parse_args()
@@ -243,6 +362,7 @@ def main():
243
362
  satellite = args.satellite
244
363
  product = args.product
245
364
  domain = args.domain
365
+ op_mode = args.op_mode
246
366
  channel = str(args.channel).zfill(2)
247
367
  var_name = args.var_name
248
368
  lat_min = args.lat_min
@@ -252,11 +372,22 @@ def main():
252
372
  resolution = args.resolution
253
373
  max_attempts = args.max_attempts
254
374
  parallel = args.parallel
375
+ recent = args.recent
376
+ start = args.start
377
+ end = args.end
378
+ freq = args.freq
379
+ bt_hour = args.bt_hour
380
+ bt_min = args.bt_min
381
+ save_format = args.save_format
382
+
383
+
384
+ # Check mandatory arguments
385
+ if not args.recent and not (args.start and args.end):
386
+ print("You must provide either the --recent or --start and --end arguments. Exiting...")
387
+ sys.exit(1)
255
388
 
256
389
  # Set bucket name and pattern
257
390
  bucket_name = "gcp-public-data-" + satellite
258
- pattern = "OR_"+product+domain+"-M6C"+channel+"_G" + satellite[-2:]
259
- min_files = args.recent
260
391
 
261
392
  # Create output directory
262
393
  pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
@@ -271,11 +402,20 @@ def main():
271
402
  print(f"Bucket {bucket_name} not found. Exiting...")
272
403
  sys.exit(1)
273
404
 
274
- # Search for recent files
275
- recent_files = get_recent_files(storage_client, bucket_name, product + domain, pattern, min_files)
405
+ # Set pattern for the files
406
+ pattern = "OR_"+product+domain+"-"+op_mode+channel+"_G" + satellite[-2:]
407
+
408
+ # Check operational mode if is recent or specific date
409
+ if start and end:
410
+ files_list = get_files_period(storage_client, bucket_name,
411
+ product + domain, pattern, start, end,
412
+ bt_hour, bt_min, freq)
413
+ else:
414
+ # Get recent files
415
+ files_list = get_recent_files(storage_client, bucket_name, product + domain, pattern, recent)
276
416
 
277
417
  # Check if any files were found
278
- if not recent_files:
418
+ if not files_list:
279
419
  print(f"No files found with the pattern {pattern}. Exiting...")
280
420
  sys.exit(1)
281
421
 
@@ -283,14 +423,14 @@ def main():
283
423
  pathlib.Path('tmp/').mkdir(parents=True, exist_ok=True)
284
424
 
285
425
  # Download files
286
- print(f"GOESGCP: Downloading and processing {len(recent_files)} files...")
287
- loading_bar = tqdm.tqdm(total=len(recent_files), ncols=100, position=0, leave=True,
426
+ print(f"GOESGCP: Downloading and processing {len(files_list)} files...")
427
+ loading_bar = tqdm.tqdm(total=len(files_list), ncols=100, position=0, leave=True,
288
428
  bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} + \
289
429
  [Elapsed:{elapsed} Remaining:<{remaining}]')
290
430
 
291
431
  if parallel: # Run in parallel
292
432
  # Create a list of tasks
293
- tasks = [(bucket_name, file, f"tmp/{file.split('/')[-1]}") for file in recent_files]
433
+ tasks = [(bucket_name, file, f"tmp/{file.split('/')[-1]}") for file in files_list]
294
434
 
295
435
  # Download files in parallel
296
436
  with Pool(processes=args.processes) as pool:
@@ -298,7 +438,7 @@ def main():
298
438
  loading_bar.update(1)
299
439
  loading_bar.close()
300
440
  else: # Run in serial
301
- for file in recent_files:
441
+ for file in files_list:
302
442
  local_path = f"tmp/{file.split('/')[-1]}"
303
443
  process_file((bucket_name, file, local_path))
304
444
  loading_bar.update(1)
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: goesgcp
3
- Version: 1.0.8
3
+ Version: 2.0.0
4
4
  Summary: A package to download and process GOES-16/17 data
5
5
  Home-page: https://github.com/helvecioneto/goesgcp
6
6
  Author: Helvecio B. L. Neto
@@ -22,6 +22,15 @@ Requires-Dist: pyproj
22
22
  Requires-Dist: xarray
23
23
  Requires-Dist: netcdf4
24
24
  Requires-Dist: rioxarray
25
+ Dynamic: author
26
+ Dynamic: author-email
27
+ Dynamic: classifier
28
+ Dynamic: description
29
+ Dynamic: description-content-type
30
+ Dynamic: home-page
31
+ Dynamic: license
32
+ Dynamic: requires-dist
33
+ Dynamic: summary
25
34
 
26
35
  # goesgcp
27
36
 
@@ -58,6 +67,12 @@ goesgcp [OPTIONS]
58
67
  | `--lon_min` | Minimum longitude of the bounding box (default: `-116`). |
59
68
  | `--lon_max` | Maximum longitude of the bounding box (default: `-25`). |
60
69
  | `--resolution` | Set the reprojected data resolution in degrees (default: `-0.045`). |
70
+ | `--recent` | Number of most recent files to download (no default; required unless `--start` and `--end` are given). |
71
+ | `--start` | Start date and time for downloading data, e.g. `'2022-12-15 00:00:00'` (default: `None`). |
72
+ | `--end` | End date and time (exclusive) for downloading data (default: `None`). |
73
+ | `--bt_hour` | First and last hour of the day (inclusive) between which data are downloaded (default: `0 23`). |
74
+ | `--bt_min` | First and last minute of the hour (inclusive) between which data are downloaded (default: `0 60`). |
75
+ | `--save_format` | Directory layout for saved files: `flat`, `by_date`, or `julian` (default: `flat`). |
61
76
 
62
77
  ### Examples
63
78
 
@@ -67,5 +82,11 @@ To download most 3 recent data for the GOES-16 satellite, ABI-L2-CMIPF product,
67
82
  goesgcp --satellite goes16 --product ABI-L2-CMIPF --var_name CMI --channel 13 --recent 3 --output "output/"
68
83
  ```
69
84
 
85
+ To download data for a specific date range, use the `--start` and `--end` options:
86
+
87
+ ```bash
88
+ goesgcp --start '2022-12-15 00:00:00' --end '2022-12-20 10:00:00' --bt_hour 5 6 --save_format by_date --resolution 0.045 --lat_min -35 --lat_max 5 --lon_min -80 --lon_max -30
89
+ ```
90
+
70
91
  ### Credits
71
92
  And this is an optimization by Helvecio Neto - 2025
@@ -13,7 +13,7 @@ with open('requirements.txt') as f:
13
13
 
14
14
  setup(
15
15
  name="goesgcp",
16
- version='1.0.8',
16
+ version='2.0.0',
17
17
  author="Helvecio B. L. Neto",
18
18
  author_email="helvecioblneto@gmail.com",
19
19
  description="A package to download and process GOES-16/17 data",
File without changes
File without changes
File without changes