hydroanomaly 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hydroanomaly/usgs_data.py DELETED
@@ -1,311 +0,0 @@
1
- """
2
- USGS Data Retrieval Module
3
-
4
- This module provides functionality to retrieve water data from the USGS Water Services API.
5
- Supports various water quality parameters and time series data.
6
- """
7
-
8
- import pandas as pd
9
- import numpy as np
10
- import requests
11
- from io import StringIO
12
- from datetime import datetime
13
- from typing import Optional, Dict, Any
14
- import warnings
15
-
16
-
17
class USGSDataRetriever:
    """
    Retrieve and process USGS water time-series data.

    Wraps the USGS Water Services instantaneous-values API
    (https://waterservices.usgs.gov/nwis/iv/) and provides methods to
    download, parse, validate, summarize, and save the resulting data.
    Can optionally fall back to synthetic data when the API returns
    nothing or the request fails.
    """

    def __init__(self):
        """Initialize the retriever with the API base URL and empty request state."""
        self.base_url = "https://waterservices.usgs.gov/nwis/iv/"
        # Kept for debugging/inspection: the last URL requested and the raw response.
        self.last_request_url = None
        self.last_response = None

    def retrieve_data(
        self,
        site_number: str,
        parameter_code: str,
        start_date: str,
        end_date: str,
        create_synthetic: bool = True
    ) -> pd.DataFrame:
        """
        Retrieve USGS water data for specified parameters.

        Args:
            site_number (str): USGS site number (e.g., "294643095035200")
            parameter_code (str): USGS parameter code (e.g., "63680" for turbidity)
            start_date (str): Start date in YYYY-MM-DD format
            end_date (str): End date in YYYY-MM-DD format
            create_synthetic (bool): Whether to create synthetic data if no data found

        Returns:
            pd.DataFrame: DataFrame with 'datetime' and 'value' columns

        Raises:
            ValueError: If invalid dates or parameters are provided
            requests.RequestException: If the API request fails and
                create_synthetic is False
        """
        # Fail fast on malformed inputs before touching the network.
        self._validate_inputs(site_number, parameter_code, start_date, end_date)

        url = self._build_url(site_number, parameter_code, start_date, end_date)
        self.last_request_url = url

        print(f"🔄 Requesting data from USGS...")
        print(f"📍 Site: {site_number}")
        print(f"📊 Parameter: {parameter_code}")
        print(f"📅 Period: {start_date} to {end_date}")

        try:
            response = requests.get(url, timeout=30)
            self.last_response = response

            if response.status_code == 200:
                data = self._process_response(response.text, parameter_code)

                # Empty result: optionally synthesize data instead of returning nothing.
                if len(data) == 0 and create_synthetic:
                    print("\n⚠️ No USGS data available. Creating synthetic data...")
                    data = self._create_synthetic_data(start_date, end_date, parameter_code)

                if len(data) > 0:
                    print(f"✅ Successfully retrieved {len(data)} data points")
                    return data
                else:
                    print("❌ No data available for the specified parameters")
                    return pd.DataFrame(columns=["datetime", "value"])

            else:
                # Non-200 responses are surfaced as request exceptions so the
                # fallback path below handles them uniformly.
                raise requests.RequestException(f"HTTP {response.status_code}: {response.reason}")

        except requests.RequestException as e:
            print(f"❌ Error retrieving data: {e}")
            if create_synthetic:
                print("🔄 Creating synthetic data as fallback...")
                return self._create_synthetic_data(start_date, end_date, parameter_code)
            else:
                raise

    def _validate_inputs(self, site_number: str, parameter_code: str, start_date: str, end_date: str) -> None:
        """
        Validate input parameters, raising ValueError on bad input.

        The date-ordering check is deliberately performed OUTSIDE the
        format-parsing try block; previously it was inside, so its
        "Start date must be before end date" error was swallowed by the
        except clause and mis-reported as an invalid-format error.
        """
        if not site_number or not isinstance(site_number, str):
            raise ValueError("Site number must be a non-empty string")

        if not parameter_code or not isinstance(parameter_code, str):
            raise ValueError("Parameter code must be a non-empty string")

        try:
            start_dt = datetime.strptime(start_date, "%Y-%m-%d")
            end_dt = datetime.strptime(end_date, "%Y-%m-%d")
        except ValueError as e:
            raise ValueError(f"Invalid date format. Use YYYY-MM-DD: {e}") from e

        if start_dt >= end_dt:
            raise ValueError("Start date must be before end date")

    def _build_url(self, site_number: str, parameter_code: str, start_date: str, end_date: str) -> str:
        """Build the USGS instantaneous-values API URL (RDB/tab-separated format)."""
        return (
            f"{self.base_url}?sites={site_number}"
            f"&parameterCd={parameter_code}"
            f"&startDT={start_date}&endDT={end_date}"
            f"&format=rdb"
        )

    def _process_response(self, content: str, parameter_code: str) -> pd.DataFrame:
        """
        Parse a raw RDB response body into a two-column DataFrame.

        Args:
            content (str): Raw response text from the USGS API
            parameter_code (str): Parameter code used to locate the value column

        Returns:
            pd.DataFrame: Columns 'datetime' and 'value'; empty on parse failure
                or when the service reports no data.
        """
        if "No sites found matching" in content or "No data" in content:
            print("⚠️ No data available for this site/parameter combination")
            return pd.DataFrame(columns=["datetime", "value"])

        try:
            # RDB format is tab-separated with '#'-prefixed comment headers.
            data = pd.read_csv(StringIO(content), sep='\t', comment='#')

            # Drop empty columns
            data = data.dropna(axis=1, how='all')

            # Clean column names
            data.columns = data.columns.str.strip()

            # Locate the timestamp column and the column carrying this parameter
            # (USGS names value columns like "<ts_id>_<parameter_code>").
            datetime_cols = [col for col in data.columns if 'datetime' in col.lower()]
            parameter_cols = [col for col in data.columns if parameter_code in col]

            if not datetime_cols:
                raise ValueError("No datetime column found in response")
            if not parameter_cols:
                raise ValueError(f"No column found for parameter {parameter_code}")

            datetime_col = datetime_cols[0]
            parameter_col = parameter_cols[0]

            # Keep only relevant columns under normalized names.
            data = data[[datetime_col, parameter_col]].copy()
            data.columns = ['datetime', 'value']

            # Coerce types; unparseable entries (e.g. the RDB type row) become NaN/NaT.
            data['datetime'] = pd.to_datetime(data['datetime'], errors='coerce')
            data['value'] = pd.to_numeric(data['value'], errors='coerce')

            # Remove rows with missing data
            initial_count = len(data)
            data = data.dropna()
            final_count = len(data)

            if initial_count > final_count:
                print(f"⚠️ Removed {initial_count - final_count} rows with missing data")

            return data

        except Exception as e:
            # Best-effort parsing: report and return an empty frame rather than crash.
            print(f"❌ Error parsing USGS response: {e}")
            return pd.DataFrame(columns=["datetime", "value"])

    def _create_synthetic_data(self, start_date: str, end_date: str, parameter_code: str) -> pd.DataFrame:
        """
        Create daily synthetic data as a fallback when no real data is available.

        Values are drawn from a normal distribution whose baseline depends on
        the parameter type, with ~10% of points boosted to act as anomalies.
        """
        date_range = pd.date_range(start=start_date, end=end_date, freq='D')

        # Baseline/noise/anomaly magnitudes chosen per parameter type.
        if parameter_code == "63680":  # Turbidity
            base_value = 12
            noise_std = 3
            anomaly_range = (5, 15)
        elif parameter_code == "00060":  # Discharge
            base_value = 100
            noise_std = 20
            anomaly_range = (50, 200)
        elif parameter_code == "00065":  # Gage height
            base_value = 5
            noise_std = 1
            anomaly_range = (2, 8)
        else:  # Generic water quality parameter
            base_value = 10
            noise_std = 2
            anomaly_range = (3, 10)

        # Generate base synthetic data
        synthetic_values = np.random.normal(base_value, noise_std, len(date_range))

        # Add some anomalies (10% of data)
        anomaly_count = int(len(date_range) * 0.1)
        anomaly_indices = np.random.choice(len(date_range), size=anomaly_count, replace=False)
        anomaly_values = np.random.uniform(anomaly_range[0], anomaly_range[1], anomaly_count)
        synthetic_values[anomaly_indices] += anomaly_values

        # Physical quantities can't be negative; clamp to a small positive floor.
        synthetic_values = np.maximum(synthetic_values, 0.1)

        synthetic_data = pd.DataFrame({
            'datetime': date_range,
            'value': synthetic_values
        })

        print(f"📊 Created {len(synthetic_data)} synthetic data points")
        print("🔍 Sample synthetic data:")
        print(synthetic_data.head())

        return synthetic_data

    def save_data(self, data: pd.DataFrame, filename: str, parameter_name: str = "parameter") -> str:
        """
        Save data to CSV file.

        Args:
            data (pd.DataFrame): Data to save ('datetime' and 'value' columns)
            filename (str): Output filename
            parameter_name (str): Name of the parameter for column naming

        Returns:
            str: Path to saved file, or "" when there was nothing to save
        """
        if len(data) == 0:
            print("⚠️ No data to save")
            return ""

        # Rename value column to parameter name
        save_data = data.copy()
        save_data.columns = ['datetime', parameter_name]

        # Add date column for convenience
        save_data['date'] = save_data['datetime'].dt.date

        # Save to CSV
        save_data.to_csv(filename, index=False)
        # Bug fix: report the actual output path (the message previously
        # contained a corrupted literal instead of the filename).
        print(f"💾 Saved {len(save_data)} records to '{filename}'")

        return filename

    def get_data_summary(self, data: pd.DataFrame) -> Dict[str, Any]:
        """
        Get summary statistics of the data.

        Args:
            data (pd.DataFrame): Data to summarize ('datetime' and 'value' columns)

        Returns:
            dict: Record count, date range, value statistics, and missing-data
                counts; {"error": ...} when the frame is empty.
        """
        if len(data) == 0:
            return {"error": "No data available"}

        summary = {
            "record_count": len(data),
            "date_range": {
                "start": data['datetime'].min(),
                "end": data['datetime'].max()
            },
            "value_stats": {
                "min": data['value'].min(),
                "max": data['value'].max(),
                "mean": data['value'].mean(),
                "median": data['value'].median(),
                "std": data['value'].std()
            },
            "missing_data": {
                "count": data['value'].isna().sum(),
                "percentage": (data['value'].isna().sum() / len(data)) * 100
            }
        }

        return summary
280
-
281
-
282
# Convenience function for easy access
def get_usgs_data(
    site_number: str,
    parameter_code: str,
    start_date: str,
    end_date: str,
    save_to_file: Optional[str] = None,
    parameter_name: str = "value",
    create_synthetic: bool = True
) -> pd.DataFrame:
    """
    Convenience function to retrieve USGS data.

    Args:
        site_number (str): USGS site number
        parameter_code (str): USGS parameter code
        start_date (str): Start date (YYYY-MM-DD)
        end_date (str): End date (YYYY-MM-DD)
        save_to_file (str, optional): Filename to save data
        parameter_name (str): Name for the parameter column
        create_synthetic (bool): Forwarded to USGSDataRetriever.retrieve_data.
            Defaults to True, matching the previous behavior where the
            retriever's own default was always used.

    Returns:
        pd.DataFrame: Retrieved data
    """
    retriever = USGSDataRetriever()
    data = retriever.retrieve_data(
        site_number,
        parameter_code,
        start_date,
        end_date,
        create_synthetic=create_synthetic,
    )

    # Only write a file when there is actually something to write.
    if save_to_file and len(data) > 0:
        retriever.save_data(data, save_to_file, parameter_name)

    return data
@@ -1,11 +0,0 @@
1
- hydroanomaly/__init__.py,sha256=5X78u2gwZFugrWfMtXv9bV4QNIj8yA36sJTSkk_qb4w,4884
2
- hydroanomaly/hello.py,sha256=AhK7UKF_3TyZcWL4IDlZq_BXdKQzUP-is-jv59fgqk4,566
3
- hydroanomaly/math_utils.py,sha256=CDOGWAiRlb2PK5SNFysumnzp7_LbZ9aleHLR_3lsGrs,856
4
- hydroanomaly/plotting.py,sha256=YZW6-Sb_IrhbHKFeoh1d86Ef4Ev5Gpq55lEv8XX0v20,13504
5
- hydroanomaly/sentinel_data.py,sha256=C5T1ycyTcAGvR6KEukDHJe2kEDbFXgh0yVXi8QrjFXs,17870
6
- hydroanomaly/usgs_data.py,sha256=zUvfu3go-7cQuFtD8Hbm7pABpw_RPWuJxE66NhxYmIU,11631
7
- hydroanomaly-0.4.0.dist-info/licenses/LICENSE,sha256=OphKV48tcMv6ep-7j-8T6nycykPT0g8ZlMJ9zbGvdPs,1066
8
- hydroanomaly-0.4.0.dist-info/METADATA,sha256=6JGNAS0GWcMflIOidfmZEz71IqZmc_IJxybtdDmGz3o,11841
9
- hydroanomaly-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
10
- hydroanomaly-0.4.0.dist-info/top_level.txt,sha256=t-5Lc-eTLlkxIhR_N1Cpp6_YZafKS3xLLk9D2CtbE7o,13
11
- hydroanomaly-0.4.0.dist-info/RECORD,,