loone-data-prep 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,32 +1,110 @@
1
+ import csv
2
+ import traceback
1
3
  import sys
2
4
  import os
3
5
  import uuid
4
6
  from datetime import datetime, timedelta
5
7
  import pandas as pd
6
8
  from loone_data_prep.water_quality_data import wq
7
- from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest
9
+ from loone_data_prep.utils import find_last_date_in_csv, dbhydro_water_quality_data_is_latest
8
10
 
9
11
 
# Station IDs queried for every water quality parameter in D.
_LAKE_STATION_IDS = ("L001", "L004", "L005", "L006", "L007", "L008", "LZ40")

# Test number used for each water quality parameter name.
# Insertion order is the order in which main() iterates the parameters.
_TEST_NUMBERS = {
    "PHOSPHATE, TOTAL AS P": 25,
    "PHOSPHATE, ORTHO AS P": 23,
    "AMMONIA-N": 20,
    "NITRATE+NITRITE-N": 18,
    "TOTAL NITROGEN": 80,
    "MICROCYSTIN HILR": 1023,
    "MICROCYSTIN HTYR": 1022,
    "MICROCYSTIN LA": 1005,
    "MICROCYSTIN LF": 1006,
    "MICROCYSTIN LR": 1007,
    "MICROCYSTIN LW": 1008,
    "MICROCYSTIN LY": 1009,
    "MICROCYSTIN RR": 1010,
    "MICROCYSTIN WR": 1011,
    "MICROCYSTIN YR": 1012,
    "CHLOROPHYLL-A": 61,
    "CHLOROPHYLL-A(LC)": 179,
    "CHLOROPHYLL-A, CORRECTED": 112,
    "DISSOLVED OXYGEN": 8,
}

# Default download table: parameter name -> test number, station IDs and the
# type of each station. Every entry gets its own list and dict (not shared
# objects), so a caller mutating one entry cannot affect the others.
D = {
    name: {
        "test_number": test_number,
        "station_ids": list(_LAKE_STATION_IDS),
        "station_types": {station_id: "SITE" for station_id in _LAKE_STATION_IDS},
    }
    for name, test_number in _TEST_NUMBERS.items()
}
31
109
 
32
110
 
@@ -36,6 +114,9 @@ def main(workspace: str, d: dict = D) -> dict:
36
114
  for name, params in d.items():
37
115
  print(f"Getting {name} for the following station IDs: {params['station_ids']}.")
38
116
 
117
+ # Get the test_number for this parameter name
118
+ test_number = params['test_number']
119
+
39
120
  # Get the date of the latest data in the csv file for each station id
40
121
  station_date_latest = {}
41
122
  for station_id in params["station_ids"]:
@@ -43,16 +124,19 @@ def main(workspace: str, d: dict = D) -> dict:
43
124
 
44
125
  # Get the water quality data
45
126
  for station_id, date_latest in station_date_latest.items():
127
+ # Get the station type for this station ID
128
+ station_type = params["station_types"][station_id]
129
+
46
130
  # File with data for this station/name combination does NOT already exist (or possibly some other error occurred)
47
131
  if date_latest is None:
48
132
  # Get all the water quality data for the name/station combination
49
133
  print(f"Getting all {name} data for station ID: {station_id}.")
50
- wq.get(workspace, name, [station_id])
134
+ wq.get(workspace, name, test_number, [station_id])
51
135
  else:
52
136
  # Check whether we already have the latest data
53
- if dbhydro_data_is_latest(date_latest):
137
+ if dbhydro_water_quality_data_is_latest(date_latest, station_id, station_type, test_number):
54
138
  # Notify that the data is already up to date
55
- print(f'Downloading of new water quality data for test name: {name} station: {station} skipped. Data is already up to date.')
139
+ print(f'Downloading of new water quality data for test name: {name} station: {station_id} skipped. Data is already up to date.')
56
140
  continue
57
141
 
58
142
  # Temporarily rename current data file so it isn't over written
@@ -63,8 +147,8 @@ def main(workspace: str, d: dict = D) -> dict:
63
147
  try:
64
148
  # Get only the water quality data that is newer than the latest data in the csv file
65
149
  print(f"Downloading new water quality data for test name: {name} station ID: {station_id} starting from date: {date_latest}.")
66
- date_latest = (datetime.strptime(date_latest, "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
67
- wq.get(workspace, name, [station_id], date_min=date_latest)
150
+ date_latest = (datetime.strptime(date_latest, "%Y-%m-%d %H:%M:%S") + timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")
151
+ wq.get(workspace, name, test_number, [station_id], date_min=date_latest)
68
152
 
69
153
  # Data failed to download - It's possible the data's end date has been reached
70
154
  if not os.path.exists(os.path.join(workspace, original_file_name)):
@@ -73,25 +157,38 @@ def main(workspace: str, d: dict = D) -> dict:
73
157
  # Read in the original data
74
158
  df_original = pd.read_csv(os.path.join(workspace, original_file_name_temp), index_col=0)
75
159
 
76
- # Calculate the days column for the newly downloaded data
77
- df_original_date_min = df_original['date'].min()
78
- wq._calculate_days_column(workspace, original_file_name, df_original_date_min)
79
-
80
160
  # Read in the newly downloaded data
81
161
  df_new = pd.read_csv(os.path.join(workspace, original_file_name), index_col=0)
82
- df_new.reset_index(inplace=True)
83
162
 
84
- # Merge the new data with the original data
85
- df_merged = pd.concat([df_original, df_new], ignore_index=True)
163
+ # Calculate the days column for the newly downloaded data
164
+ df_original_date_min = df_original['date'].min()
165
+ df_new = wq._calculate_days_column(workspace, df_new, df_original_date_min)
86
166
 
167
+ # Merge the new data with the original data
168
+ df_merged = pd.concat([df_original, df_new], ignore_index=False)
169
+
170
+ # Re-number the index
171
+ df_merged.reset_index(inplace=True)
172
+ df_merged.drop(['index'], axis=1, inplace=True)
173
+
174
+ # Start index at 1 instead of 0 (for backwards compatibility)
175
+ df_merged.index = df_merged.index + 1
176
+
177
+ # Make sure the integer index values are quoted in the csv file (for backwards compatibility)
178
+ df_merged.index = df_merged.index.astype(str)
179
+
87
180
  # Write out the merged data
88
- df_merged.to_csv(os.path.join(workspace, original_file_name))
181
+ df_merged.to_csv(os.path.join(workspace, original_file_name), index=True, quoting=csv.QUOTE_NONNUMERIC)
182
+
183
+ # Rewrite the file so dates don't have double quotes around them (for backwards compatibility)
184
+ wq.rewrite_water_quality_file_without_date_quotes(workspace, original_file_name)
89
185
 
90
186
  # Remove the original renamed data file
91
187
  os.remove(os.path.join(workspace, original_file_name_temp))
92
188
  except Exception as e:
93
189
  # Notify of the error
94
190
  print(f"Error occurred while downloading new water quality data: {e}")
191
+ traceback.print_exc()
95
192
 
96
193
  # Remove the newly downloaded data file if it exists
97
194
  if os.path.exists(os.path.join(workspace, original_file_name)):
@@ -1,117 +1,143 @@
1
+ import csv
2
+ import os
1
3
  import sys
2
4
  from datetime import datetime
3
5
  from retry import retry
4
- from rpy2.robjects import r
5
- from rpy2.rinterface_lib.embedded import RRuntimeError
6
-
6
+ import pandas as pd
7
+ from loone_data_prep.utils import get_dbhydro_api
7
8
 
8
9
  DEFAULT_STATION_IDS = ["L001", "L004", "L005", "L006", "L007", "L008", "LZ40"]
9
10
  DATE_NOW = datetime.now().strftime("%Y-%m-%d")
10
11
 
11
12
 
12
@retry(Exception, tries=5, delay=15, max_delay=60, backoff=2)
def get(
    workspace: str,
    name: str,
    test_number: int,
    station_ids: list = DEFAULT_STATION_IDS,
    date_min: str = "1950-01-01",
    date_max: str = DATE_NOW,
    **kwargs: str | list
) -> None:
    """Fetch water quality data from the DBHydro API and save one CSV file per station.

    For every station that has data, the samples are averaged per calendar day and
    written to ``water_quality_{station}_{name}.csv`` inside ``workspace``. Stations
    without data are reported on stdout and no file is written for them.

    Any exception raised inside this function triggers the ``retry`` decorator
    (up to 5 tries) before it propagates to the caller.

    Args:
        workspace (str): The directory where the CSV files will be saved.
        name (str): The name of the water quality parameter. Example: 'PHOSPHATE, TOTAL AS P'
        test_number (int): The DBHydro test number for the water quality parameter.
        station_ids (list, optional): List of station IDs to fetch data for. Defaults to DEFAULT_STATION_IDS.
        date_min (str, optional): The start date for fetching data; passed to the API unchanged. Defaults to "1950-01-01".
        date_max (str, optional): The end date for fetching data; passed to the API unchanged. Defaults to DATE_NOW,
            which is computed once when this module is imported.
        **kwargs: Accepted for backwards compatibility; not used by this function.

    Returns:
        None
    """

    # Initialize the DBHydro API
    api = get_dbhydro_api()

    # Fetch water quality data for all requested stations in one request
    response = api.get_water_quality(stations=station_ids, test_numbers=[test_number], date_start=date_min, date_end=date_max, exclude_flagged_results=False)
    df = response.to_dataframe(include_metadata=True)

    # Nothing at all came back. Report it per station and stop here, so an empty
    # frame (which may lack the 'station' column) is not turned into an exception
    # that the retry decorator would needlessly repeat.
    if df.empty:
        for station in station_ids:
            print(f'No data found for station ID {station} and test number {test_number}.')
        return

    # Process and save data for each station
    for station in station_ids:
        # Get a copy of the data frame for this station
        df_station = df[df['station'] == station].copy()

        # Check if the data frame is empty
        if df_station.empty:
            print(f'No data found for station ID {station} and test number {test_number}.')
            continue

        # Get the units of the data (taken from the first row; used in the value column's name)
        units = df_station['units'].iloc[0] if 'units' in df_station.columns else ''

        # Keep only the collection date and the value
        df_station = df_station[['date_collected_str', 'sig_fig_value']].copy()

        # Convert string sig_fig_value to numeric (unparseable values become NaN)
        df_station['sig_fig_value'] = pd.to_numeric(df_station['sig_fig_value'], errors='coerce')

        # Calculate daily average values
        df_station['date_collected_str'] = pd.to_datetime(df_station['date_collected_str'])
        df_station["date_only"] = df_station["date_collected_str"].dt.date
        df_station = df_station.groupby("date_only")["sig_fig_value"].mean().reset_index()
        df_station.rename(columns={"date_only": "date_collected_str"}, inplace=True)

        # Format dataframe to expected layout
        df_station['date_collected_str'] = pd.to_datetime(df_station['date_collected_str'])
        df_station.sort_values('date_collected_str', inplace=True)
        df_station.rename(columns={'date_collected_str': 'date', 'sig_fig_value': f'{station}_{name}_{units}'}, inplace=True)

        # Calculate the days column: days since the first date plus that date's day of the month
        first_date = df_station['date'].min()
        df_station['days'] = (df_station['date'] - first_date).dt.days + first_date.day

        # Make sure the integer index is written out (for backwards compatibility)
        df_station.reset_index(inplace=True, drop=True)

        # Start index at 1 instead of 0 (for backwards compatibility)
        df_station.index = df_station.index + 1

        # Make sure the integer index values are quoted in the csv file (for backwards compatibility)
        df_station.index = df_station.index.astype(str)

        # Make sure the date column includes time information at midnight (for backwards compatibility)
        df_station['date'] = df_station['date'].dt.strftime('%Y-%m-%d 00:00:00')

        # Write out the data frame to a CSV file
        file_name = f'water_quality_{station}_{name}.csv'
        df_station.to_csv(os.path.join(workspace, file_name), index=True, quoting=csv.QUOTE_NONNUMERIC)

        # Rewrite the file so dates don't have double quotes around them (for backwards compatibility)
        rewrite_water_quality_file_without_date_quotes(workspace, file_name)
71
95
 
72
96
 
73
- def _calculate_days_column(workspace: str, file_name: str, date_min: str):
97
+ def _calculate_days_column(workspace: str, df: pd.DataFrame, date_min: str):
74
98
  """
75
99
  Calculates the values that should be in the "days" column of the water quality data CSV file
76
100
  based on the given date_min and writes the updated data frame back to the CSV file.
77
101
 
78
102
  Args:
79
103
  workspace (str): The path to the workspace directory.
80
- file_name (str): The name of the water quality data CSV file.
104
+ df (pd.DataFrame): The water quality data dataframe.
81
105
  date_min (str): The minimum date that the "days" column values should be calculated from. Should be in format "YYYY-MM-DD".
82
106
  """
83
- r(
84
- f"""
85
- # Import necessary libraries
86
- library(lubridate)
87
-
88
- # Read the CSV file
89
- df <- read.csv("{workspace}/{file_name}", check.names = FALSE)
90
-
91
- # Drop the "X" column that R adds when reading CSV files
92
- df <- df[,-1]
93
-
94
- # Get date_min as an object with the correct timezone
95
- date_min_object <- as.POSIXct("{date_min}", tz = "UTC")
96
- date_min_tz <- format(with_tz(date_min_object, tzone = "America/New_York"), "%Z")
97
- date_min_object <- as.POSIXct("{date_min}", tz = date_min_tz)
98
-
99
- # Calculate each value in the days column based on the date_min
100
- for(i in 1:nrow(df))
101
- {{
102
- # Get the current row's date as an object with the correct timezone
103
- date <- as.POSIXct(df$date[i], tz = "UTC")
104
- date_tz <- format(with_tz(date, tzone = "America/New_York"), "%Z")
105
- date <- as.POSIXct(df$date[i], tz = date_tz)
106
-
107
- # Calculate the number of days from the minimum date to the row's date plus the number of days in date_min
108
- df$days[i] <- as.integer(difftime(date, date_min_object, units = "days")) + as.integer(format(date_min_object, "%d"))
109
- }}
110
-
111
- # Write the updated data frame back to the CSV file
112
- write.csv(df, file = "{workspace}/{file_name}", row.names = FALSE)
113
- """ # noqa: E501
114
- )
107
+ # Ensure df['date'] is a pandas datetime Series
108
+ df['date'] = pd.to_datetime(df['date'])
109
+ date_min_object = pd.to_datetime(date_min)
110
+
111
+ # Calculate days column for all rows
112
+ df['days'] = (df['date'] - date_min_object).dt.days + date_min_object.day
113
+
114
+ return df
115
+
116
+
117
def rewrite_water_quality_file_without_date_quotes(workspace: str, file_name: str) -> None:
    """
    Rewrites the given water quality CSV file so that the dates don't have double quotes around them (for backwards compatibility).

    The first line (header) is left untouched. On every other line, double quotes are removed
    from the second comma-separated field. Lines with fewer than two fields (e.g. blank lines)
    are written back unchanged.

    Args:
        workspace (str): The path to the workspace directory.
        file_name (str): The name of the water quality CSV file.
    """
    file_path = os.path.join(workspace, file_name)

    # The file is read and written as UTF-8, which is the default encoding of pandas' to_csv.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Build the complete new content BEFORE opening the file for writing, so an
    # unexpected line can never leave the file truncated.
    rewritten_lines = []
    for line_number, line in enumerate(lines):
        if line_number != 0:
            # NOTE: a plain split is used on purpose; it assumes the first field
            # (the quoted row index) contains no comma.
            line_split = line.split(',')
            if len(line_split) > 1:
                line_split[1] = line_split[1].replace('"', '')  # Remove quotes around dates (2nd column)
                line = ','.join(line_split)
        rewritten_lines.append(line)

    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        file.writelines(rewritten_lines)
115
141
 
116
142
 
117
143
  if __name__ == "__main__":
@@ -88,7 +88,7 @@ def main(workspace: str, d: dict = D, dbkey_stations: dict = DBKEY_STATIONS) ->
88
88
  continue
89
89
 
90
90
  # Check whether the latest data is already up to date.
91
- if dbhydro_data_is_latest(date_latest):
91
+ if dbhydro_data_is_latest(date_latest, dbkey):
92
92
  # Notify that the data is already up to date
93
93
  print(f'Downloading of new {name} data skipped for dbkey {dbkey}. Data is already up to date.')
94
94
  continue
@@ -99,8 +99,10 @@ def main(workspace: str, d: dict = D, dbkey_stations: dict = DBKEY_STATIONS) ->
99
99
 
100
100
  try:
101
101
  # Download only the new data
102
- print(f'Downloading new {name} data for dbkey {dbkey} starting from date {date_latest}')
103
- weather.get(workspace, name, dbkeys=[dbkey], date_min=date_latest)
102
+ date_start = pd.to_datetime(date_latest) + pd.Timedelta(days=1)
103
+ date_start = date_start.strftime('%Y-%m-%d')
104
+ print(f'Downloading new {name} data for dbkey {dbkey} starting from date {date_start}')
105
+ weather.get(workspace, name, dbkeys=[dbkey], date_min=date_start)
104
106
 
105
107
  # Data failed to download - It's possible the data's end date has been reached
106
108
  if not os.path.exists(os.path.join(workspace, original_file_name)):