ecopipeline-0.5.1-py3-none-any.whl → ecopipeline-0.6.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ecopipeline/load/load.py CHANGED
@@ -8,7 +8,7 @@ import math
  pd.set_option('display.max_columns', None)
  import mysql.connector.errors as mysqlerrors
  from ecopipeline import ConfigManager
- import datetime
+ from datetime import datetime, timedelta
  import numpy as np

  data_map = {'int64':'float',
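
The import change above is what drives the datetime.datetime.now() → datetime.now() edits in the later hunks of this file. A minimal illustration of the two call styles (for orientation only; the variable name is made up and this is not part of the package source):

    # 0.5.1 style, with "import datetime"
    cutoff = datetime.datetime.now() - datetime.timedelta(days=3)

    # 0.6.2 style, with "from datetime import datetime, timedelta"
    cutoff = datetime.now() - timedelta(days=3)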
@@ -161,97 +161,123 @@ def create_new_columns(cursor : mysql.connector.cursor.MySQLCursor, table_name:

  return True

- def load_overwrite_database(cursor : mysql.connector.cursor.MySQLCursor, dataframe: pd.DataFrame, config_info: dict, data_type: str, primary_key: str = "time_pt", table_name: str = None):
+ def load_overwrite_database(config : ConfigManager, dataframe: pd.DataFrame, config_info: dict, data_type: str,
+ primary_key: str = "time_pt", table_name: str = None, auto_log_data_loss : bool = False):
  """
  Loads given pandas DataFrame into a MySQL table overwriting any conflicting data. Uses an UPSERT strategy to ensure any gaps in data are filled.
  Note: will not overwrite values with NULL. Must have a new value to overwrite existing values in database

  Parameters
  ----------
- cursor : mysql.connector.cursor.MySQLCursor
- A cursor object connected to the database where the data will land
+ config : ecopipeline.ConfigManager
+ The ConfigManager object that holds configuration data for the pipeline.
  dataframe: pd.DataFrame
  The pandas DataFrame to be written into the mySQL server.
  config_info: dict
  The dictionary containing the configuration information in the data upload. This can be aquired through the get_login_info() function in this package
  data_type: str
- The header name corresponding to the table you wish to write data to.
+ The header name corresponding to the table you wish to write data to.
+ primary_key : str
+ The name of the primary key in the database to upload to. Default as 'time_pt'
+ table_name : str
+ overwrites table name from config_info if needed
+ auto_log_data_loss : bool
+ if set to True, a data loss event will be reported if no data exits in the dataframe
+ for the last two days from the current date OR if an error occurs

  Returns
  -------
  bool:
  A boolean value indicating if the data was successfully written to the database.
  """
- # Drop empty columns
- dataframe = dataframe.dropna(axis=1, how='all')
-
- dbname = config_info['database']
- if table_name == None:
- table_name = config_info[data_type]["table_name"]
-
- if(len(dataframe.index) <= 0):
- print(f"Attempted to write to {table_name} but dataframe was empty.")
- return True
+ # Database Connection
+ db_connection, cursor = config.connect_db()
+ try:

- print(f"Attempting to write data for {dataframe.index[0]} to {dataframe.index[-1]} into {table_name}")
-
- # Get string of all column names for sql insert
- sensor_names = primary_key
- sensor_types = ["datetime"]
- for column in dataframe.columns:
- sensor_names += "," + column
- sensor_types.append(data_map[dataframe[column].dtype.name])
+ # Drop empty columns
+ dataframe = dataframe.dropna(axis=1, how='all')

- # create SQL statement
- insert_str = "INSERT INTO " + table_name + " (" + sensor_names + ") VALUES ("
- for column in dataframe.columns:
- insert_str += "%s, "
- insert_str += "%s)"
-
- # last_time = datetime.datetime.strptime('20/01/1990', "%d/%m/%Y") # arbitrary past date
- existing_rows_list = []
-
- # create db table if it does not exist, otherwise add missing columns to existing table
- if not check_table_exists(cursor, table_name, dbname):
- if not create_new_table(cursor, table_name, sensor_names.split(",")[1:], sensor_types[1:], primary_key=primary_key): #split on colums and remove first column aka time_pt
- print(f"Could not create new table {table_name} in database {dbname}")
- return False
- else:
- try:
- # find existing times in database for upsert statement
- cursor.execute(
- f"SELECT {primary_key} FROM {table_name} WHERE {primary_key} >= '{dataframe.index.min()}'")
- # Fetch the results into a DataFrame
- existing_rows = pd.DataFrame(cursor.fetchall(), columns=[primary_key])
+ dbname = config_info['database']
+ if table_name == None:
+ table_name = config_info[data_type]["table_name"]
+
+ if(len(dataframe.index) <= 0):
+ print(f"Attempted to write to {table_name} but dataframe was empty.")
+ ret_value = True
+ else:

- # Convert the primary_key column to a list
- existing_rows_list = existing_rows[primary_key].tolist()
+ print(f"Attempting to write data for {dataframe.index[0]} to {dataframe.index[-1]} into {table_name}")
+ if auto_log_data_loss and dataframe.index[-1] < datetime.now() - timedelta(days=3):
+ report_data_loss(config)
+
+ # Get string of all column names for sql insert
+ sensor_names = primary_key
+ sensor_types = ["datetime"]
+ for column in dataframe.columns:
+ sensor_names += "," + column
+ sensor_types.append(data_map[dataframe[column].dtype.name])
+
+ # create SQL statement
+ insert_str = "INSERT INTO " + table_name + " (" + sensor_names + ") VALUES ("
+ for column in dataframe.columns:
+ insert_str += "%s, "
+ insert_str += "%s)"
+
+ # last_time = datetime.strptime('20/01/1990', "%d/%m/%Y") # arbitrary past date
+ existing_rows_list = []
+
+ # create db table if it does not exist, otherwise add missing columns to existing table
+ if not check_table_exists(cursor, table_name, dbname):
+ if not create_new_table(cursor, table_name, sensor_names.split(",")[1:], sensor_types[1:], primary_key=primary_key): #split on colums and remove first column aka time_pt
+ ret_value = False
+ raise Exception(f"Could not create new table {table_name} in database {dbname}")
+ else:
+ try:
+ # find existing times in database for upsert statement
+ cursor.execute(
+ f"SELECT {primary_key} FROM {table_name} WHERE {primary_key} >= '{dataframe.index.min()}'")
+ # Fetch the results into a DataFrame
+ existing_rows = pd.DataFrame(cursor.fetchall(), columns=[primary_key])
+
+ # Convert the primary_key column to a list
+ existing_rows_list = existing_rows[primary_key].tolist()
+
+ except mysqlerrors.Error:
+ print(f"Table {table_name} has no data.")
+
+ missing_cols, missing_types = find_missing_columns(cursor, dataframe, config_info, table_name)
+ if len(missing_cols):
+ if not create_new_columns(cursor, table_name, missing_cols, missing_types):
+ print("Unable to add new columns due to database error.")
+
+ updatedRows = 0
+ for index, row in dataframe.iterrows():
+ time_data = row.values.tolist()
+ #remove nans and infinites
+ time_data = [None if (x is None or pd.isna(x)) else x for x in time_data]
+ time_data = [None if (x == float('inf') or x == float('-inf')) else x for x in time_data]
+
+ if index in existing_rows_list:
+ statement, values = _generate_mysql_update(row, index, table_name, primary_key)
+ if statement != "":
+ cursor.execute(statement, values)
+ updatedRows += 1
+ else:
+ cursor.execute(insert_str, (index, *time_data))

- except mysqlerrors.Error:
- print(f"Table {table_name} has no data.")
+ db_connection.commit()
+ print(f"Successfully wrote {len(dataframe.index)} rows to table {table_name} in database {dbname}. {updatedRows} existing rows were overwritten.")
+ ret_value = True
+ except Exception as e:
+ print(f"Unable to load data into database. Exception: {e}")
+ if auto_log_data_loss:
+ report_data_loss(config)
+ ret_value = False

- missing_cols, missing_types = find_missing_columns(cursor, dataframe, config_info, table_name)
- if len(missing_cols):
- if not create_new_columns(cursor, table_name, missing_cols, missing_types):
- print("Unable to add new columns due to database error.")
-
- updatedRows = 0
- for index, row in dataframe.iterrows():
- time_data = row.values.tolist()
- #remove nans and infinites
- time_data = [None if (x is None or pd.isna(x)) else x for x in time_data]
- time_data = [None if (x == float('inf') or x == float('-inf')) else x for x in time_data]
-
- if index in existing_rows_list:
- statement, values = _generate_mysql_update(row, index, table_name, primary_key)
- if statement != "":
- cursor.execute(statement, values)
- updatedRows += 1
- else:
- cursor.execute(insert_str, (index, *time_data))
+ db_connection.close()
+ cursor.close()
+ return ret_value

- print(f"Successfully wrote {len(dataframe.index)} rows to table {table_name} in database {dbname}. {updatedRows} existing rows were overwritten.")
- return True

  def load_event_table(config : ConfigManager, event_df: pd.DataFrame, site_name : str = None):
  """
@@ -299,7 +325,7 @@ def load_event_table(config : ConfigManager, event_df: pd.DataFrame, site_name :
  column_names += "," + column

  # create SQL statement
- insert_str = "INSERT INTO " + table_name + " (" + column_names + ", last_modified_date, last_modified_by) VALUES (%s,%s,%s,%s,%s,'"+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+"','automatic_upload')"
+ insert_str = "INSERT INTO " + table_name + " (" + column_names + ", last_modified_date, last_modified_by) VALUES (%s,%s,%s,%s,%s,'"+datetime.now().strftime('%Y-%m-%d %H:%M:%S')+"','automatic_upload')"

  # add aditional columns for db creation
  full_column_names = column_names.split(",")[1:]
@@ -390,13 +416,13 @@ def report_data_loss(config : ConfigManager, site_name : str = None):
  table_name = "site_events"
  if site_name is None:
  site_name = config.get_site_name()
- error_string = "Error proccessing data. Please check logs to resolve."
+ error_string = "Error processing data. Please check logs to resolve."

  print(f"logging DATA_LOSS into {table_name}")

  # create SQL statement
  insert_str = "INSERT INTO " + table_name + " (start_time_pt, site_name, event_detail, event_type, last_modified_date, last_modified_by) VALUES "
- insert_str += f"('{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}','{site_name}','{error_string}','DATA_LOSS','{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}','automatic_upload')"
+ insert_str += f"('{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}','{site_name}','{error_string}','DATA_LOSS','{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}','automatic_upload')"

  existing_rows = pd.DataFrame({
  'id' : []
@@ -412,7 +438,7 @@ def report_data_loss(config : ConfigManager, site_name : str = None):
  try:
  # find existing times in database for upsert statement
  cursor.execute(
- f"SELECT id FROM {table_name} WHERE end_time_pt IS NULL AND site_name = '{site_name}' AND event_type = 'DATA_LOSS' and event_detail = '{error_string}'")
+ f"SELECT id FROM {table_name} WHERE end_time_pt IS NULL AND site_name = '{site_name}' AND event_type = 'DATA_LOSS'")
  # Fetch the results into a DataFrame
  existing_rows = pd.DataFrame(cursor.fetchall(), columns=['id'])

@@ -422,8 +448,10 @@

  if existing_rows.empty:
  cursor.execute(insert_str)
- connection.commit()
- print("Successfully logged data loss.")
+ connection.commit()
+ print("Successfully logged data loss.")
+ else:
+ print("Data loss already logged.")
  except Exception as e:
  # Print the exception message
  print(f"Caught an exception when uploading to site_events table: {e}")
@@ -442,7 +470,7 @@ def _generate_mysql_update_event_table(row, id):

  if values:
  statement += ", ".join(statment_elems)
- statement += f", last_modified_by = 'automatic_upload', last_modified_date = '{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}'"
+ statement += f", last_modified_by = 'automatic_upload', last_modified_date = '{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}'"
  statement += f" WHERE id = {id};"
  # statement += f" WHERE start_time_pt = '{start_time_pt}' AND end_time_pt = '{end_time_pt}' AND event_type = '{event_type}' AND site_name = '{site_name}';"
  else:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: ecopipeline
- Version: 0.5.1
+ Version: 0.6.2
  Summary: Contains functions for use in Ecotope Datapipelines
  Classifier: Programming Language :: Python :: 3
  Classifier: License :: OSI Approved :: GNU General Public License (GPL)
@@ -2,7 +2,7 @@ ecopipeline/__init__.py,sha256=vCRzwd781ciCSXMP1ycM_BXAqxj3KVaNKIjsLOPcbwc,171
  ecopipeline/extract/__init__.py,sha256=3u_CUMdCguVewU3kN8x6xhVNyo1-p-gwTrhjOh7Psqg,645
  ecopipeline/extract/extract.py,sha256=ryorqnu1RgyNK7joZRcbMmTajlTlB5hwaYzzpo8Z8Ho,43369
  ecopipeline/load/__init__.py,sha256=oDAVF8AhK_qugqegjW7jK16p-nb9QzKhiNQOkEBniKM,235
- ecopipeline/load/load.py,sha256=H9OKjE-EoqhZJ-5Xixqn5vvhvgUNFDdsVsX4fqht0hE,19975
+ ecopipeline/load/load.py,sha256=YfCuzsJYFNZqwR58GeF55-gRI7LpOeaK_DXYHg_0frU,21415
  ecopipeline/transform/__init__.py,sha256=DcIJfkRs4OmZzDeEfW_OiOIXNqN6CUl1_lW0SS7-eN8,2280
  ecopipeline/transform/bayview.py,sha256=TP24dnTsUD95X-f6732egPZKjepFLJgDm9ImGr-fppY,17899
  ecopipeline/transform/lbnl.py,sha256=EQ54G4rJXaZ7pwVusKcdK2KBehSdCsNo2ybphtMGs7o,33400
@@ -10,8 +10,8 @@ ecopipeline/transform/transform.py,sha256=uyBIXKCXUCT6zVnZyQohripGAzmY1yV9T1GxsX
  ecopipeline/utils/ConfigManager.py,sha256=t4sfTjGO0g5P50XBQqGVFWaXfAlW1GMDh1DLoBuFGks,9826
  ecopipeline/utils/__init__.py,sha256=ccWUR0m7gD9DfcgsxBCLOfi4lho6RdYuB2Ugy_g6ZdQ,28
  ecopipeline/utils/unit_convert.py,sha256=VFh1we2Y8KV3u21BeWb-U3TlZJXo83q5vdxxkpgcuME,3064
- ecopipeline-0.5.1.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ecopipeline-0.5.1.dist-info/METADATA,sha256=BMxe3qd_Ym_Ym5G3QN1zKwPJaZme5g4aEl6LUnaNPjc,2307
- ecopipeline-0.5.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- ecopipeline-0.5.1.dist-info/top_level.txt,sha256=WOPFJH2LIgKqm4lk2OnFF5cgVkYibkaBxIxgvLgO7y0,12
- ecopipeline-0.5.1.dist-info/RECORD,,
+ ecopipeline-0.6.2.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ecopipeline-0.6.2.dist-info/METADATA,sha256=bjYX_gnfOZFDTS6mY71uJrBzCMkdtks6o1ZZHgVniys,2307
+ ecopipeline-0.6.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ ecopipeline-0.6.2.dist-info/top_level.txt,sha256=WOPFJH2LIgKqm4lk2OnFF5cgVkYibkaBxIxgvLgO7y0,12
+ ecopipeline-0.6.2.dist-info/RECORD,,