hydroanomaly-0.4.0-py3-none-any.whl → hydroanomaly-0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hydroanomaly/__init__.py +37 -137
- hydroanomaly/sentinel_bands.py +157 -0
- hydroanomaly/usgs_turbidity.py +150 -0
- hydroanomaly/visualize.py +226 -0
- {hydroanomaly-0.4.0.dist-info → hydroanomaly-0.6.0.dist-info}/METADATA +2 -2
- hydroanomaly-0.6.0.dist-info/RECORD +9 -0
- hydroanomaly/hello.py +0 -29
- hydroanomaly/math_utils.py +0 -50
- hydroanomaly/plotting.py +0 -389
- hydroanomaly/sentinel_data.py +0 -516
- hydroanomaly/usgs_data.py +0 -311
- hydroanomaly-0.4.0.dist-info/RECORD +0 -11
- {hydroanomaly-0.4.0.dist-info → hydroanomaly-0.6.0.dist-info}/WHEEL +0 -0
- {hydroanomaly-0.4.0.dist-info → hydroanomaly-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {hydroanomaly-0.4.0.dist-info → hydroanomaly-0.6.0.dist-info}/top_level.txt +0 -0
hydroanomaly/usgs_data.py
DELETED
@@ -1,311 +0,0 @@
-"""
-USGS Data Retrieval Module
-
-This module provides functionality to retrieve water data from the USGS Water Services API.
-Supports various water quality parameters and time series data.
-"""
-
-import pandas as pd
-import numpy as np
-import requests
-from io import StringIO
-from datetime import datetime
-from typing import Optional, Dict, Any
-import warnings
-
-
-class USGSDataRetriever:
-    """
-    A class to retrieve and process USGS water data.
-
-    This class handles the retrieval of time series data from USGS Water Services
-    and provides methods to clean, process, and validate the data.
-    """
-
-    def __init__(self):
-        """Initialize the USGS Data Retriever."""
-        self.base_url = "https://waterservices.usgs.gov/nwis/iv/"
-        self.last_request_url = None
-        self.last_response = None
-
-    def retrieve_data(
-        self,
-        site_number: str,
-        parameter_code: str,
-        start_date: str,
-        end_date: str,
-        create_synthetic: bool = True
-    ) -> pd.DataFrame:
-        """
-        Retrieve USGS water data for specified parameters.
-
-        Args:
-            site_number (str): USGS site number (e.g., "294643095035200")
-            parameter_code (str): USGS parameter code (e.g., "63680" for turbidity)
-            start_date (str): Start date in YYYY-MM-DD format
-            end_date (str): End date in YYYY-MM-DD format
-            create_synthetic (bool): Whether to create synthetic data if no data found
-
-        Returns:
-            pd.DataFrame: DataFrame with datetime and parameter columns
-
-        Raises:
-            ValueError: If invalid dates or parameters provided
-            requests.RequestException: If API request fails
-        """
-        # Validate inputs
-        self._validate_inputs(site_number, parameter_code, start_date, end_date)
-
-        # Construct URL
-        url = self._build_url(site_number, parameter_code, start_date, end_date)
-        self.last_request_url = url
-
-        print(f"🔄 Requesting data from USGS...")
-        print(f"📍 Site: {site_number}")
-        print(f"📊 Parameter: {parameter_code}")
-        print(f"📅 Period: {start_date} to {end_date}")
-
-        try:
-            # Make API request
-            response = requests.get(url, timeout=30)
-            self.last_response = response
-
-            if response.status_code == 200:
-                data = self._process_response(response.text, parameter_code)
-
-                if len(data) == 0 and create_synthetic:
-                    print("\n⚠️ No USGS data available. Creating synthetic data...")
-                    data = self._create_synthetic_data(start_date, end_date, parameter_code)
-
-                if len(data) > 0:
-                    print(f"✅ Successfully retrieved {len(data)} data points")
-                    return data
-                else:
-                    print("❌ No data available for the specified parameters")
-                    return pd.DataFrame(columns=["datetime", "value"])
-
-            else:
-                raise requests.RequestException(f"HTTP {response.status_code}: {response.reason}")
-
-        except requests.RequestException as e:
-            print(f"❌ Error retrieving data: {e}")
-            if create_synthetic:
-                print("🔄 Creating synthetic data as fallback...")
-                return self._create_synthetic_data(start_date, end_date, parameter_code)
-            else:
-                raise
-
-    def _validate_inputs(self, site_number: str, parameter_code: str, start_date: str, end_date: str):
-        """Validate input parameters."""
-        if not site_number or not isinstance(site_number, str):
-            raise ValueError("Site number must be a non-empty string")
-
-        if not parameter_code or not isinstance(parameter_code, str):
-            raise ValueError("Parameter code must be a non-empty string")
-
-        try:
-            start_dt = datetime.strptime(start_date, "%Y-%m-%d")
-            end_dt = datetime.strptime(end_date, "%Y-%m-%d")
-            if start_dt >= end_dt:
-                raise ValueError("Start date must be before end date")
-        except ValueError as e:
-            raise ValueError(f"Invalid date format. Use YYYY-MM-DD: {e}")
-
-    def _build_url(self, site_number: str, parameter_code: str, start_date: str, end_date: str) -> str:
-        """Build the USGS API URL."""
-        return (
-            f"{self.base_url}?sites={site_number}"
-            f"&parameterCd={parameter_code}"
-            f"&startDT={start_date}&endDT={end_date}"
-            f"&format=rdb"
-        )
-
-    def _process_response(self, content: str, parameter_code: str) -> pd.DataFrame:
-        """Process the USGS API response."""
-        if "No sites found matching" in content or "No data" in content:
-            print("⚠️ No data available for this site/parameter combination")
-            return pd.DataFrame(columns=["datetime", "value"])
-
-        try:
-            # Read the tab-separated data
-            data = pd.read_csv(StringIO(content), sep='\t', comment='#')
-
-            # Drop empty columns
-            data = data.dropna(axis=1, how='all')
-
-            # Clean column names
-            data.columns = data.columns.str.strip()
-
-            # Find datetime and parameter columns
-            datetime_cols = [col for col in data.columns if 'datetime' in col.lower()]
-            parameter_cols = [col for col in data.columns if parameter_code in col]
-
-            if not datetime_cols:
-                raise ValueError("No datetime column found in response")
-            if not parameter_cols:
-                raise ValueError(f"No column found for parameter {parameter_code}")
-
-            datetime_col = datetime_cols[0]
-            parameter_col = parameter_cols[0]
-
-            # Keep only relevant columns
-            data = data[[datetime_col, parameter_col]].copy()
-            data.columns = ['datetime', 'value']
-
-            # Convert and clean data
-            data['datetime'] = pd.to_datetime(data['datetime'], errors='coerce')
-            data['value'] = pd.to_numeric(data['value'], errors='coerce')
-
-            # Remove rows with missing data
-            initial_count = len(data)
-            data = data.dropna()
-            final_count = len(data)
-
-            if initial_count > final_count:
-                print(f"⚠️ Removed {initial_count - final_count} rows with missing data")
-
-            return data
-
-        except Exception as e:
-            print(f"❌ Error parsing USGS response: {e}")
-            return pd.DataFrame(columns=["datetime", "value"])
-
-    def _create_synthetic_data(self, start_date: str, end_date: str, parameter_code: str) -> pd.DataFrame:
-        """Create synthetic data as fallback."""
-        date_range = pd.date_range(start=start_date, end=end_date, freq='D')
-
-        # Create realistic synthetic data based on parameter type
-        if parameter_code == "63680":  # Turbidity
-            base_value = 12
-            noise_std = 3
-            anomaly_range = (5, 15)
-        elif parameter_code == "00060":  # Discharge
-            base_value = 100
-            noise_std = 20
-            anomaly_range = (50, 200)
-        elif parameter_code == "00065":  # Gage height
-            base_value = 5
-            noise_std = 1
-            anomaly_range = (2, 8)
-        else:  # Generic water quality parameter
-            base_value = 10
-            noise_std = 2
-            anomaly_range = (3, 10)
-
-        # Generate base synthetic data
-        synthetic_values = np.random.normal(base_value, noise_std, len(date_range))
-
-        # Add some anomalies (10% of data)
-        anomaly_count = int(len(date_range) * 0.1)
-        anomaly_indices = np.random.choice(len(date_range), size=anomaly_count, replace=False)
-        anomaly_values = np.random.uniform(anomaly_range[0], anomaly_range[1], anomaly_count)
-        synthetic_values[anomaly_indices] += anomaly_values
-
-        # Ensure positive values
-        synthetic_values = np.maximum(synthetic_values, 0.1)
-
-        synthetic_data = pd.DataFrame({
-            'datetime': date_range,
-            'value': synthetic_values
-        })
-
-        print(f"📊 Created {len(synthetic_data)} synthetic data points")
-        print("🔍 Sample synthetic data:")
-        print(synthetic_data.head())
-
-        return synthetic_data
-
-    def save_data(self, data: pd.DataFrame, filename: str, parameter_name: str = "parameter") -> str:
-        """
-        Save data to CSV file.
-
-        Args:
-            data (pd.DataFrame): Data to save
-            filename (str): Output filename
-            parameter_name (str): Name of the parameter for column naming
-
-        Returns:
-            str: Path to saved file
-        """
-        if len(data) == 0:
-            print("⚠️ No data to save")
-            return ""
-
-        # Rename value column to parameter name
-        save_data = data.copy()
-        save_data.columns = ['datetime', parameter_name]
-
-        # Add date column for convenience
-        save_data['date'] = save_data['datetime'].dt.date
-
-        # Save to CSV
-        save_data.to_csv(filename, index=False)
-        print(f"💾 Saved {len(save_data)} records to '{filename}'")
-
-        return filename
-
-    def get_data_summary(self, data: pd.DataFrame) -> Dict[str, Any]:
-        """
-        Get summary statistics of the data.
-
-        Args:
-            data (pd.DataFrame): Data to summarize
-
-        Returns:
-            dict: Summary statistics
-        """
-        if len(data) == 0:
-            return {"error": "No data available"}
-
-        summary = {
-            "record_count": len(data),
-            "date_range": {
-                "start": data['datetime'].min(),
-                "end": data['datetime'].max()
-            },
-            "value_stats": {
-                "min": data['value'].min(),
-                "max": data['value'].max(),
-                "mean": data['value'].mean(),
-                "median": data['value'].median(),
-                "std": data['value'].std()
-            },
-            "missing_data": {
-                "count": data['value'].isna().sum(),
-                "percentage": (data['value'].isna().sum() / len(data)) * 100
-            }
-        }
-
-        return summary
-
-
-# Convenience function for easy access
-def get_usgs_data(
-    site_number: str,
-    parameter_code: str,
-    start_date: str,
-    end_date: str,
-    save_to_file: Optional[str] = None,
-    parameter_name: str = "value"
-) -> pd.DataFrame:
-    """
-    Convenience function to retrieve USGS data.
-
-    Args:
-        site_number (str): USGS site number
-        parameter_code (str): USGS parameter code
-        start_date (str): Start date (YYYY-MM-DD)
-        end_date (str): End date (YYYY-MM-DD)
-        save_to_file (str, optional): Filename to save data
-        parameter_name (str): Name for the parameter column
-
-    Returns:
-        pd.DataFrame: Retrieved data
-    """
-    retriever = USGSDataRetriever()
-    data = retriever.retrieve_data(site_number, parameter_code, start_date, end_date)
-
-    if save_to_file and len(data) > 0:
-        retriever.save_data(data, save_to_file, parameter_name)
-
-    return data
hydroanomaly-0.4.0.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
-hydroanomaly/__init__.py,sha256=5X78u2gwZFugrWfMtXv9bV4QNIj8yA36sJTSkk_qb4w,4884
-hydroanomaly/hello.py,sha256=AhK7UKF_3TyZcWL4IDlZq_BXdKQzUP-is-jv59fgqk4,566
-hydroanomaly/math_utils.py,sha256=CDOGWAiRlb2PK5SNFysumnzp7_LbZ9aleHLR_3lsGrs,856
-hydroanomaly/plotting.py,sha256=YZW6-Sb_IrhbHKFeoh1d86Ef4Ev5Gpq55lEv8XX0v20,13504
-hydroanomaly/sentinel_data.py,sha256=C5T1ycyTcAGvR6KEukDHJe2kEDbFXgh0yVXi8QrjFXs,17870
-hydroanomaly/usgs_data.py,sha256=zUvfu3go-7cQuFtD8Hbm7pABpw_RPWuJxE66NhxYmIU,11631
-hydroanomaly-0.4.0.dist-info/licenses/LICENSE,sha256=OphKV48tcMv6ep-7j-8T6nycykPT0g8ZlMJ9zbGvdPs,1066
-hydroanomaly-0.4.0.dist-info/METADATA,sha256=6JGNAS0GWcMflIOidfmZEz71IqZmc_IJxybtdDmGz3o,11841
-hydroanomaly-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-hydroanomaly-0.4.0.dist-info/top_level.txt,sha256=t-5Lc-eTLlkxIhR_N1Cpp6_YZafKS3xLLk9D2CtbE7o,13
-hydroanomaly-0.4.0.dist-info/RECORD,,
{hydroanomaly-0.4.0.dist-info → hydroanomaly-0.6.0.dist-info}/WHEEL
File without changes

{hydroanomaly-0.4.0.dist-info → hydroanomaly-0.6.0.dist-info}/licenses/LICENSE
File without changes

{hydroanomaly-0.4.0.dist-info → hydroanomaly-0.6.0.dist-info}/top_level.txt
File without changes