ngiab-data-preprocess 4.0.2__tar.gz → 4.0.4__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/PKG-INFO +2 -1
  2. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_sources/source_validation.py +72 -72
  3. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/ngiab_data_cli/__main__.py +3 -3
  4. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/ngiab_data_preprocess.egg-info/PKG-INFO +2 -1
  5. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/ngiab_data_preprocess.egg-info/requires.txt +1 -0
  6. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/pyproject.toml +2 -1
  7. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/.github/workflows/build_only.yml +0 -0
  8. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/.github/workflows/publish.yml +0 -0
  9. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/.gitignore +0 -0
  10. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/LICENSE +0 -0
  11. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/README.md +0 -0
  12. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/map.html +0 -0
  13. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_processing/create_realization.py +0 -0
  14. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_processing/dataset_utils.py +0 -0
  15. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_processing/datasets.py +0 -0
  16. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_processing/file_paths.py +0 -0
  17. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_processing/forcings.py +0 -0
  18. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_processing/gpkg_utils.py +0 -0
  19. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_processing/graph_utils.py +0 -0
  20. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_processing/s3fs_utils.py +0 -0
  21. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_processing/subset.py +0 -0
  22. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_sources/cfe-nowpm-realization-template.json +0 -0
  23. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_sources/cfe-template.ini +0 -0
  24. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_sources/em-catchment-template.yml +0 -0
  25. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_sources/em-config.yml +0 -0
  26. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_sources/em-realization-template.json +0 -0
  27. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_sources/forcing_template.nc +0 -0
  28. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_sources/ngen-routing-template.yaml +0 -0
  29. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_sources/noah-owp-modular-init.namelist.input +0 -0
  30. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_sources/template.sql +0 -0
  31. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/data_sources/triggers.sql +0 -0
  32. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/__init__.py +0 -0
  33. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/__main__.py +0 -0
  34. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/static/css/console.css +0 -0
  35. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/static/css/main.css +0 -0
  36. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/static/css/toggle.css +0 -0
  37. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/static/js/console.js +0 -0
  38. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/static/js/data_processing.js +0 -0
  39. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/static/js/main.js +0 -0
  40. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/static/resources/dark-style.json +0 -0
  41. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/static/resources/light-style.json +0 -0
  42. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/static/resources/loading.gif +0 -0
  43. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/static/resources/screenshot.jpg +0 -0
  44. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/templates/index.html +0 -0
  45. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/map_app/views.py +0 -0
  46. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/ngiab_data_cli/arguments.py +0 -0
  47. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/ngiab_data_cli/custom_logging.py +0 -0
  48. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/ngiab_data_cli/forcing_cli.py +0 -0
  49. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/ngiab_data_preprocess.egg-info/SOURCES.txt +0 -0
  50. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/ngiab_data_preprocess.egg-info/dependency_links.txt +0 -0
  51. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/ngiab_data_preprocess.egg-info/entry_points.txt +0 -0
  52. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/modules/ngiab_data_preprocess.egg-info/top_level.txt +0 -0
  53. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/output/.gitkeep +0 -0
  54. {ngiab_data_preprocess-4.0.2 → ngiab_data_preprocess-4.0.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: ngiab_data_preprocess
3
- Version: 4.0.2
3
+ Version: 4.0.4
4
4
  Summary: Graphical Tools for creating Next Gen Water model input data.
5
5
  Author-email: Josh Cunningham <jcunningham8@ua.edu>
6
6
  Project-URL: Homepage, https://github.com/CIROH-UA/NGIAB_data_preprocess
@@ -32,6 +32,7 @@ Requires-Dist: tqdm==4.66.4
32
32
  Requires-Dist: rich==13.7.1
33
33
  Requires-Dist: colorama==0.4.6
34
34
  Requires-Dist: bokeh==3.5.1
35
+ Requires-Dist: boto3
35
36
  Provides-Extra: eval
36
37
  Requires-Dist: ngiab_eval; extra == "eval"
37
38
  Provides-Extra: plot
@@ -3,26 +3,30 @@ import os
3
3
  import tarfile
4
4
  import warnings
5
5
  import json
6
- from concurrent.futures import ThreadPoolExecutor
7
6
  import requests
8
7
  from data_processing.file_paths import file_paths
9
8
  from tqdm import TqdmExperimentalWarning
10
- from tqdm.rich import tqdm
11
9
  from time import sleep
10
+ import boto3
11
+ from botocore.exceptions import ClientError
12
+ from boto3.s3.transfer import TransferConfig
12
13
  from rich.console import Console
13
14
  from rich.prompt import Prompt
14
- from rich.progress import Progress, BarColumn, TextColumn, TimeElapsedColumn, SpinnerColumn, TimeRemainingColumn, DownloadColumn, TransferSpeedColumn
15
- import threading
15
+ from rich.progress import Progress, TextColumn, TimeElapsedColumn, SpinnerColumn
16
16
  import psutil
17
17
 
18
18
  warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)
19
19
 
20
20
  console = Console()
21
-
21
+ S3_BUCKET = "communityhydrofabric"
22
+ S3_KEY = "hydrofabrics/community/conus_nextgen.tar.gz"
23
+ S3_REGION = "us-east-1"
24
+ hydrofabric_url = f"https://{S3_BUCKET}.s3.{S3_REGION}.amazonaws.com/{S3_KEY}"
22
25
 
23
26
  def decompress_gzip_tar(file_path, output_dir):
24
27
  # use rich to display "decompressing" message with a progress bar that just counts down from 30s
25
28
  # actually measuring this is hard and it usually takes ~20s to decompress
29
+ console.print("Decompressing Hydrofabric...", style="bold green")
26
30
  progress = Progress(
27
31
  SpinnerColumn(),
28
32
  TextColumn("[progress.description]{task.description}"),
@@ -40,79 +44,71 @@ def decompress_gzip_tar(file_path, output_dir):
40
44
  progress.stop()
41
45
 
42
46
 
43
- def download_chunk(url, start, end, index, save_path):
44
- headers = {"Range": f"bytes={start}-{end}"}
45
- response = requests.get(url, headers=headers, stream=True)
46
- chunk_path = f"{save_path}.part{index}"
47
- # store the response in memory rather than streaming to disk
48
- # OSX has a limit of 256 open files so this is a workaround
49
- response_bytes = bytes()
50
- for chunk in response.iter_content(chunk_size=8 * 1024):
51
- response_bytes += chunk
52
- with open(chunk_path, "wb") as f_out:
53
- f_out.write(response_bytes)
54
- return chunk_path
55
-
56
- def download_progress_estimate(progress, task, total_size):
57
- network_bytes_start = psutil.net_io_counters().bytes_recv
58
- # make a new progress bar that will be updated by a separate thread
59
- progress.start()
60
- interval = 0.5
61
- while not progress.finished:
62
- current_downloaded = psutil.net_io_counters().bytes_recv
63
- total_downloaded = current_downloaded - network_bytes_start
64
- progress.update(task, completed=total_downloaded)
65
- sleep(interval)
66
- if total_downloaded >= total_size or progress.finished:
67
- break
68
- progress.stop()
69
-
70
-
71
- def download_file(url, save_path, num_threads=150):
47
+ def download_from_s3(save_path, bucket=S3_BUCKET, key=S3_KEY, region=S3_REGION):
48
+ """Download file from S3 with optimal multipart configuration"""
72
49
  if not os.path.exists(os.path.dirname(save_path)):
73
50
  os.makedirs(os.path.dirname(save_path))
74
51
 
75
- response = requests.head(url)
76
- total_size = int(response.headers.get("content-length", 0))
77
- chunk_size = total_size // num_threads
78
-
79
- progress = Progress(
80
- TextColumn("[progress.description]{task.description}"),
81
- BarColumn(),
82
- DownloadColumn(),
83
- TransferSpeedColumn(),
84
- TextColumn(" Elapsed Time:"),
85
- TimeElapsedColumn(),
86
- TextColumn(" Remaining Time:"),
87
- TimeRemainingColumn(),
88
- )
89
- task = progress.add_task("Downloading", total=total_size)
90
-
91
- download_progress_thread = threading.Thread(target=download_progress_estimate, args=(progress, task ,total_size))
92
- download_progress_thread.start()
93
-
94
- with ThreadPoolExecutor(max_workers=num_threads) as executor:
95
- futures = []
96
- for i in range(num_threads):
97
- start = i * chunk_size
98
- end = start + chunk_size - 1 if i < num_threads - 1 else total_size - 1
99
- futures.append(executor.submit(download_chunk, url, start, end, i, save_path))
52
+ # Check if file already exists
53
+ if os.path.exists(save_path):
54
+ console.print(f"File already exists: {save_path}", style="bold yellow")
55
+ os.remove(save_path)
100
56
 
101
- chunk_paths = [
102
- future.result() for future in futures
103
- ]
104
-
105
- with open(save_path, "wb") as f_out:
106
- for chunk_path in chunk_paths:
107
- with open(chunk_path, "rb") as f_in:
108
- f_out.write(f_in.read())
109
- os.remove(chunk_path)
57
+ # Initialize S3 client
58
+ s3_client = boto3.client(
59
+ "s3", aws_access_key_id="", aws_secret_access_key="", region_name=region
60
+ )
61
+ # Disable request signing for public buckets
62
+ s3_client._request_signer.sign = lambda *args, **kwargs: None
110
63
 
111
- progress.update(task, completed=total_size)
112
- download_progress_thread.join()
64
+ # Get object size
65
+ try:
66
+ response = s3_client.head_object(Bucket=bucket, Key=key)
67
+ total_size = int(response.get("ContentLength", 0))
68
+ except ClientError as e:
69
+ console.print(f"Error getting object info: {e}", style="bold red")
70
+ return False
71
+
72
+ # Configure transfer settings for maximum speed
73
+ # Use more CPU cores for parallel processing
74
+ cpu_count = os.cpu_count() or 8
75
+ max_threads = cpu_count * 4
76
+
77
+ # Optimize chunk size based on file size and available memory
78
+ memory = psutil.virtual_memory()
79
+ available_mem_mb = memory.available / (1024 * 1024)
80
+
81
+ # Calculate optimal chunk size (min 8MB, max 100MB)
82
+ # Larger files get larger chunks for better throughput
83
+ optimal_chunk_mb = min(max(8, total_size / (50 * 1024 * 1024)), 100)
84
+ # Ensure we don't use too much memory
85
+ optimal_chunk_mb = min(optimal_chunk_mb, available_mem_mb / (max_threads * 2))
86
+
87
+ # Create transfer config
88
+ config = TransferConfig(
89
+ # multipart_threshold=8 * 1024 * 1024, # 8MB
90
+ max_concurrency=max_threads,
91
+ multipart_chunksize=int(optimal_chunk_mb * 1024 * 1024),
92
+ use_threads=True,
93
+ )
113
94
 
95
+ console.print(f"Downloading {key} to {save_path}...", style="bold green")
96
+ console.print(
97
+ f"The file downloads faster with no progress indicator, this should take around 30s",
98
+ style="bold yellow",
99
+ )
100
+ console.print(
101
+ f"Please use network monitoring on your computer if you wish to track the download",
102
+ style="green",
103
+ )
114
104
 
115
- hydrofabric_url = "https://communityhydrofabric.s3.us-east-1.amazonaws.com/hydrofabrics/community/conus_nextgen.tar.gz"
105
+ try:
106
+ # Download file using optimized transfer config
107
+ s3_client.download_file(Bucket=bucket, Key=key, Filename=save_path, Config=config)
108
+ return True
109
+ except Exception as e:
110
+ console.print(f"Error downloading file: {e}", style="bold red")
111
+ return False
116
112
 
117
113
 
118
114
  def get_headers():
@@ -126,7 +122,11 @@ def get_headers():
126
122
 
127
123
 
128
124
  def download_and_update_hf():
129
- download_file(hydrofabric_url, file_paths.conus_hydrofabric.with_suffix(".tar.gz"))
125
+ download_from_s3(
126
+ file_paths.conus_hydrofabric.with_suffix(".tar.gz"),
127
+ bucket="communityhydrofabric",
128
+ key="hydrofabrics/community/conus_nextgen.tar.gz",
129
+ )
130
130
  status, headers = get_headers()
131
131
 
132
132
  if status == 200:
@@ -194,12 +194,12 @@ def main() -> None:
194
194
  num_partitions = int(f.read())
195
195
 
196
196
  try:
197
- subprocess.run("docker pull joshcu/ngiab", shell=True)
197
+ subprocess.run("docker pull awiciroh/ciroh-ngen-image:latest", shell=True)
198
198
  except:
199
199
  logging.error("Docker is not running, please start Docker and try again.")
200
200
  try:
201
- command = f'docker run --rm -it -v "{str(paths.subset_dir)}:/ngen/ngen/data" joshcu/ngiab /ngen/ngen/data/ auto {num_partitions} local'
202
- # command = f'docker run --rm -it -v "{str(paths.subset_dir)}:/ngen/ngen/data" awiciroh/ciroh-ngen-image:latest-x86 /ngen/ngen/data/ auto {num_partitions}'
201
+ #command = f'docker run --rm -it -v "{str(paths.subset_dir)}:/ngen/ngen/data" joshcu/ngiab /ngen/ngen/data/ auto {num_partitions} local'
202
+ command = f'docker run --rm -it -v "{str(paths.subset_dir)}:/ngen/ngen/data" awiciroh/ciroh-ngen-image:latest /ngen/ngen/data/ auto {num_partitions} local'
203
203
  subprocess.run(command, shell=True)
204
204
  logging.info("Next Gen run complete.")
205
205
  except:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: ngiab_data_preprocess
3
- Version: 4.0.2
3
+ Version: 4.0.4
4
4
  Summary: Graphical Tools for creating Next Gen Water model input data.
5
5
  Author-email: Josh Cunningham <jcunningham8@ua.edu>
6
6
  Project-URL: Homepage, https://github.com/CIROH-UA/NGIAB_data_preprocess
@@ -32,6 +32,7 @@ Requires-Dist: tqdm==4.66.4
32
32
  Requires-Dist: rich==13.7.1
33
33
  Requires-Dist: colorama==0.4.6
34
34
  Requires-Dist: bokeh==3.5.1
35
+ Requires-Dist: boto3
35
36
  Provides-Extra: eval
36
37
  Requires-Dist: ngiab_eval; extra == "eval"
37
38
  Provides-Extra: plot
@@ -19,6 +19,7 @@ tqdm==4.66.4
19
19
  rich==13.7.1
20
20
  colorama==0.4.6
21
21
  bokeh==3.5.1
22
+ boto3
22
23
 
23
24
  [eval]
24
25
  ngiab_eval
@@ -12,7 +12,7 @@ exclude = ["tests*"]
12
12
 
13
13
  [project]
14
14
  name = "ngiab_data_preprocess"
15
- version = "v4.0.2"
15
+ version = "v4.0.4"
16
16
  authors = [{ name = "Josh Cunningham", email = "jcunningham8@ua.edu" }]
17
17
  description = "Graphical Tools for creating Next Gen Water model input data."
18
18
  readme = "README.md"
@@ -44,6 +44,7 @@ dependencies = [
44
44
  "rich==13.7.1",
45
45
  "colorama==0.4.6",
46
46
  "bokeh==3.5.1",
47
+ "boto3"
47
48
  ]
48
49
 
49
50
  [project.optional-dependencies]