ngiab-data-preprocess 4.0.2-py3-none-any.whl → 4.0.4-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
- data_sources/source_validation.py +72 -72
- ngiab_data_cli/__main__.py +3 -3
- {ngiab_data_preprocess-4.0.2.dist-info → ngiab_data_preprocess-4.0.4.dist-info}/METADATA +2 -1
- {ngiab_data_preprocess-4.0.2.dist-info → ngiab_data_preprocess-4.0.4.dist-info}/RECORD +8 -8
- {ngiab_data_preprocess-4.0.2.dist-info → ngiab_data_preprocess-4.0.4.dist-info}/WHEEL +1 -1
- {ngiab_data_preprocess-4.0.2.dist-info → ngiab_data_preprocess-4.0.4.dist-info}/LICENSE +0 -0
- {ngiab_data_preprocess-4.0.2.dist-info → ngiab_data_preprocess-4.0.4.dist-info}/entry_points.txt +0 -0
- {ngiab_data_preprocess-4.0.2.dist-info → ngiab_data_preprocess-4.0.4.dist-info}/top_level.txt +0 -0

data_sources/source_validation.py
CHANGED

@@ -3,26 +3,30 @@ import os
 import tarfile
 import warnings
 import json
-from concurrent.futures import ThreadPoolExecutor
 import requests
 from data_processing.file_paths import file_paths
 from tqdm import TqdmExperimentalWarning
-from tqdm.rich import tqdm
 from time import sleep
+import boto3
+from botocore.exceptions import ClientError
+from boto3.s3.transfer import TransferConfig
 from rich.console import Console
 from rich.prompt import Prompt
-from rich.progress import Progress,
-import threading
+from rich.progress import Progress, TextColumn, TimeElapsedColumn, SpinnerColumn
 import psutil

 warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)

 console = Console()
-
+S3_BUCKET = "communityhydrofabric"
+S3_KEY = "hydrofabrics/community/conus_nextgen.tar.gz"
+S3_REGION = "us-east-1"
+hydrofabric_url = f"https://{S3_BUCKET}.s3.{S3_REGION}.amazonaws.com/{S3_KEY}"

 def decompress_gzip_tar(file_path, output_dir):
     # use rich to display "decompressing" message with a progress bar that just counts down from 30s
     # actually measuring this is hard and it usually takes ~20s to decompress
+    console.print("Decompressing Hydrofabric...", style="bold green")
     progress = Progress(
         SpinnerColumn(),
         TextColumn("[progress.description]{task.description}"),
@@ -40,79 +44,71 @@ def decompress_gzip_tar(file_path, output_dir):
     progress.stop()


-def
-
-    response = requests.get(url, headers=headers, stream=True)
-    chunk_path = f"{save_path}.part{index}"
-    # store the response in memory rather than streaming to disk
-    # OSX has a limit of 256 open files so this is a workaround
-    response_bytes = bytes()
-    for chunk in response.iter_content(chunk_size=8 * 1024):
-        response_bytes += chunk
-    with open(chunk_path, "wb") as f_out:
-        f_out.write(response_bytes)
-    return chunk_path
-
-def download_progress_estimate(progress, task, total_size):
-    network_bytes_start = psutil.net_io_counters().bytes_recv
-    # make a new progress bar that will be updated by a separate thread
-    progress.start()
-    interval = 0.5
-    while not progress.finished:
-        current_downloaded = psutil.net_io_counters().bytes_recv
-        total_downloaded = current_downloaded - network_bytes_start
-        progress.update(task, completed=total_downloaded)
-        sleep(interval)
-        if total_downloaded >= total_size or progress.finished:
-            break
-    progress.stop()
-
-
-def download_file(url, save_path, num_threads=150):
+def download_from_s3(save_path, bucket=S3_BUCKET, key=S3_KEY, region=S3_REGION):
+    """Download file from S3 with optimal multipart configuration"""
     if not os.path.exists(os.path.dirname(save_path)):
         os.makedirs(os.path.dirname(save_path))

-
-
-
-
-    progress = Progress(
-        TextColumn("[progress.description]{task.description}"),
-        BarColumn(),
-        DownloadColumn(),
-        TransferSpeedColumn(),
-        TextColumn(" Elapsed Time:"),
-        TimeElapsedColumn(),
-        TextColumn(" Remaining Time:"),
-        TimeRemainingColumn(),
-    )
-    task = progress.add_task("Downloading", total=total_size)
-
-    download_progress_thread = threading.Thread(target=download_progress_estimate, args=(progress, task ,total_size))
-    download_progress_thread.start()
-
-    with ThreadPoolExecutor(max_workers=num_threads) as executor:
-        futures = []
-        for i in range(num_threads):
-            start = i * chunk_size
-            end = start + chunk_size - 1 if i < num_threads - 1 else total_size - 1
-            futures.append(executor.submit(download_chunk, url, start, end, i, save_path))
+    # Check if file already exists
+    if os.path.exists(save_path):
+        console.print(f"File already exists: {save_path}", style="bold yellow")
+        os.remove(save_path)

-
-
-
-
-
-
-        with open(chunk_path, "rb") as f_in:
-            f_out.write(f_in.read())
-        os.remove(chunk_path)
+    # Initialize S3 client
+    s3_client = boto3.client(
+        "s3", aws_access_key_id="", aws_secret_access_key="", region_name=region
+    )
+    # Disable request signing for public buckets
+    s3_client._request_signer.sign = lambda *args, **kwargs: None

-
-
+    # Get object size
+    try:
+        response = s3_client.head_object(Bucket=bucket, Key=key)
+        total_size = int(response.get("ContentLength", 0))
+    except ClientError as e:
+        console.print(f"Error getting object info: {e}", style="bold red")
+        return False
+
+    # Configure transfer settings for maximum speed
+    # Use more CPU cores for parallel processing
+    cpu_count = os.cpu_count() or 8
+    max_threads = cpu_count * 4
+
+    # Optimize chunk size based on file size and available memory
+    memory = psutil.virtual_memory()
+    available_mem_mb = memory.available / (1024 * 1024)
+
+    # Calculate optimal chunk size (min 8MB, max 100MB)
+    # Larger files get larger chunks for better throughput
+    optimal_chunk_mb = min(max(8, total_size / (50 * 1024 * 1024)), 100)
+    # Ensure we don't use too much memory
+    optimal_chunk_mb = min(optimal_chunk_mb, available_mem_mb / (max_threads * 2))
+
+    # Create transfer config
+    config = TransferConfig(
+        # multipart_threshold=8 * 1024 * 1024, # 8MB
+        max_concurrency=max_threads,
+        multipart_chunksize=int(optimal_chunk_mb * 1024 * 1024),
+        use_threads=True,
+    )

+    console.print(f"Downloading {key} to {save_path}...", style="bold green")
+    console.print(
+        f"The file downloads faster with no progress indicator, this should take around 30s",
+        style="bold yellow",
+    )
+    console.print(
+        f"Please use network monitoring on your computer if you wish to track the download",
+        style="green",
+    )

-
+    try:
+        # Download file using optimized transfer config
+        s3_client.download_file(Bucket=bucket, Key=key, Filename=save_path, Config=config)
+        return True
+    except Exception as e:
+        console.print(f"Error downloading file: {e}", style="bold red")
+        return False


 def get_headers():
@@ -126,7 +122,11 @@ def get_headers():


 def download_and_update_hf():
-
+    download_from_s3(
+        file_paths.conus_hydrofabric.with_suffix(".tar.gz"),
+        bucket="communityhydrofabric",
+        key="hydrofabrics/community/conus_nextgen.tar.gz",
+    )
     status, headers = get_headers()

     if status == 200:
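Note on the change above: 4.0.4 drops the threaded HTTP range-request downloader (download_chunk, download_progress_estimate, download_file) and fetches the hydrofabric through boto3's managed transfer instead; the new hydrofabric_url constant expands to https://communityhydrofabric.s3.us-east-1.amazonaws.com/hydrofabrics/community/conus_nextgen.tar.gz. The sketch below is not part of the package; it shows the same idea using botocore's documented UNSIGNED signature config for anonymous access instead of the private _request_signer override in the diff, and the TransferConfig numbers are illustrative rather than the values the package computes.

import boto3
from botocore import UNSIGNED
from botocore.config import Config
from boto3.s3.transfer import TransferConfig

BUCKET = "communityhydrofabric"
KEY = "hydrofabrics/community/conus_nextgen.tar.gz"

# Anonymous (unsigned) client: a public bucket needs no AWS credentials.
s3 = boto3.client("s3", region_name="us-east-1", config=Config(signature_version=UNSIGNED))

# head_object reports the object size up front; the packaged code uses it to size multipart chunks.
size_bytes = s3.head_object(Bucket=BUCKET, Key=KEY)["ContentLength"]

# Managed transfer: boto3 issues parallel ranged GETs and reassembles the file on disk.
transfer_cfg = TransferConfig(
    max_concurrency=16,                    # illustrative; the package derives this from os.cpu_count()
    multipart_chunksize=16 * 1024 * 1024,  # illustrative 16 MB chunks
    use_threads=True,
)
s3.download_file(Bucket=BUCKET, Key=KEY, Filename="conus_nextgen.tar.gz", Config=transfer_cfg)
print(f"downloaded {size_bytes} bytes")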
ngiab_data_cli/__main__.py
CHANGED

@@ -194,12 +194,12 @@ def main() -> None:
             num_partitions = int(f.read())

         try:
-            subprocess.run("docker pull
+            subprocess.run("docker pull awiciroh/ciroh-ngen-image:latest", shell=True)
         except:
             logging.error("Docker is not running, please start Docker and try again.")
         try:
-            command = f'docker run --rm -it -v "{str(paths.subset_dir)}:/ngen/ngen/data" joshcu/ngiab /ngen/ngen/data/ auto {num_partitions} local'
-
+            #command = f'docker run --rm -it -v "{str(paths.subset_dir)}:/ngen/ngen/data" joshcu/ngiab /ngen/ngen/data/ auto {num_partitions} local'
+            command = f'docker run --rm -it -v "{str(paths.subset_dir)}:/ngen/ngen/data" awiciroh/ciroh-ngen-image:latest /ngen/ngen/data/ auto {num_partitions} local'
             subprocess.run(command, shell=True)
             logging.info("Next Gen run complete.")
         except:
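For orientation, a minimal standalone sketch (not part of the package) of what the 4.0.4 CLI ends up executing after this change; the subset directory and partition count are placeholders for the values the real code takes from paths.subset_dir and the partitions file.

import subprocess

subset_dir = "/path/to/subset"   # placeholder for str(paths.subset_dir)
num_partitions = 2               # placeholder for the value read from the partitions file

# Pull the awiciroh/ciroh-ngen-image:latest image, then run it against the subset directory.
subprocess.run("docker pull awiciroh/ciroh-ngen-image:latest", shell=True)
command = (
    f'docker run --rm -it -v "{subset_dir}:/ngen/ngen/data" '
    f"awiciroh/ciroh-ngen-image:latest /ngen/ngen/data/ auto {num_partitions} local"
)
subprocess.run(command, shell=True)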
{ngiab_data_preprocess-4.0.2.dist-info → ngiab_data_preprocess-4.0.4.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: ngiab_data_preprocess
-Version: 4.0.2
+Version: 4.0.4
 Summary: Graphical Tools for creating Next Gen Water model input data.
 Author-email: Josh Cunningham <jcunningham8@ua.edu>
 Project-URL: Homepage, https://github.com/CIROH-UA/NGIAB_data_preprocess
@@ -32,6 +32,7 @@ Requires-Dist: tqdm==4.66.4
 Requires-Dist: rich==13.7.1
 Requires-Dist: colorama==0.4.6
 Requires-Dist: bokeh==3.5.1
+Requires-Dist: boto3
 Provides-Extra: eval
 Requires-Dist: ngiab_eval; extra == "eval"
 Provides-Extra: plot
{ngiab_data_preprocess-4.0.2.dist-info → ngiab_data_preprocess-4.0.4.dist-info}/RECORD
CHANGED

@@ -15,7 +15,7 @@ data_sources/em-realization-template.json,sha256=DJvB7N8lCeS2vLFenmbTzysBDR-xPaJ
 data_sources/forcing_template.nc,sha256=uRuVAqX3ngdlougZINavtwl_wC2VLD8fHqG7_CLim1s,85284
 data_sources/ngen-routing-template.yaml,sha256=RV28MAbyQNx9U8FAYmZhD2Fv8Yu6o_08Ekoc77KNdH4,4622
 data_sources/noah-owp-modular-init.namelist.input,sha256=Vb7mp40hFpJogruOrXrDHwVW1bKi9h1ciDNyDvTzn20,3045
-data_sources/source_validation.py,sha256=
+data_sources/source_validation.py,sha256=9mC1LiL8TXh0mtHFwOlFkT2jN1DEorgw_K7qccrdFfA,8038
 data_sources/template.sql,sha256=ZnFqAqleEq9wgmAhNO90Wue_L9k0JAn8KF99DYtcxgs,10457
 data_sources/triggers.sql,sha256=G0d_175eNsamKAFhsbphPATvzMPuPL_iCleIhlToduQ,14906
 map_app/__init__.py,sha256=OarJao9X98kcbLyiwewN4ObWNAYkKDichcxbuWywTsA,818
@@ -32,13 +32,13 @@ map_app/static/resources/light-style.json,sha256=DaE52qKpAkjiWSKY_z7LxreqA2rW4Zy
 map_app/static/resources/loading.gif,sha256=ggdkZf1AD7rSwIpSJwfiIqANgmVV1WHlxGuKxQKv7uY,72191
 map_app/static/resources/screenshot.jpg,sha256=Ia358aX-OHM9BP4B8lX05cLnguF2fHUIimno9bnFLYw,253730
 map_app/templates/index.html,sha256=OAPh-EMlk3cAQqFwhSw5oPjUu6PS-H0oLfhyHmnI0_4,6629
-ngiab_data_cli/__main__.py,sha256=
+ngiab_data_cli/__main__.py,sha256=_m0bhAXdyLeZBuh03PKNXlJ6BUJnZDR-xZIbFyRH8iE,10164
 ngiab_data_cli/arguments.py,sha256=7ptImy8tLM1XvjfDr13tZszkjGVtenXo0KqllJeE3Mw,4372
 ngiab_data_cli/custom_logging.py,sha256=iS2XozaxudcxQj17qAsrCgbVK9LJAYAPmarJuVWJo1k,1280
 ngiab_data_cli/forcing_cli.py,sha256=lkcqWDk5H8IPyGv0DwLIZMQldqTUXpfwSX0C_RIuIJ8,3890
-ngiab_data_preprocess-4.0.
-ngiab_data_preprocess-4.0.
-ngiab_data_preprocess-4.0.
-ngiab_data_preprocess-4.0.
-ngiab_data_preprocess-4.0.
-ngiab_data_preprocess-4.0.
+ngiab_data_preprocess-4.0.4.dist-info/LICENSE,sha256=6dMSprwwnsRzEm02mEDbKHD9dUbL8bPIt9Vhrhb0Ulk,1081
+ngiab_data_preprocess-4.0.4.dist-info/METADATA,sha256=Rgmbfo8XC_T9Vw6zUfHGFl2du6HBmL2WCR8S0ds5T2k,9411
+ngiab_data_preprocess-4.0.4.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+ngiab_data_preprocess-4.0.4.dist-info/entry_points.txt,sha256=spwlhKEJ3ZnNETQsJGeTjD7Vwy8O_zGHb9GdX8ACCtw,128
+ngiab_data_preprocess-4.0.4.dist-info/top_level.txt,sha256=CjhYAUZrdveR2fOK6rxffU09VIN2IuPD7hk4V3l3pV0,52
+ngiab_data_preprocess-4.0.4.dist-info/RECORD,,
{ngiab_data_preprocess-4.0.2.dist-info → ngiab_data_preprocess-4.0.4.dist-info}/LICENSE
RENAMED
File without changes

{ngiab_data_preprocess-4.0.2.dist-info → ngiab_data_preprocess-4.0.4.dist-info}/entry_points.txt
RENAMED
File without changes

{ngiab_data_preprocess-4.0.2.dist-info → ngiab_data_preprocess-4.0.4.dist-info}/top_level.txt
RENAMED
File without changes