ecopipeline 0.5.0-py3-none-any.whl → 0.6.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- ecopipeline/load/__init__.py +2 -2
- ecopipeline/load/load.py +162 -71
- {ecopipeline-0.5.0.dist-info → ecopipeline-0.6.0.dist-info}/METADATA +2 -2
- {ecopipeline-0.5.0.dist-info → ecopipeline-0.6.0.dist-info}/RECORD +7 -7
- {ecopipeline-0.5.0.dist-info → ecopipeline-0.6.0.dist-info}/WHEEL +1 -1
- {ecopipeline-0.5.0.dist-info → ecopipeline-0.6.0.dist-info}/LICENSE +0 -0
- {ecopipeline-0.5.0.dist-info → ecopipeline-0.6.0.dist-info}/top_level.txt +0 -0
ecopipeline/load/__init__.py
CHANGED
@@ -1,2 +1,2 @@
-from .load import check_table_exists, create_new_table, load_overwrite_database, load_event_table
-__all__ = ["check_table_exists", "create_new_table", "load_overwrite_database", "load_event_table"]
+from .load import check_table_exists, create_new_table, load_overwrite_database, load_event_table, report_data_loss
+__all__ = ["check_table_exists", "create_new_table", "load_overwrite_database", "load_event_table", "report_data_loss"]
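The practical effect of this change is that the new report_data_loss helper is importable from ecopipeline.load alongside the existing loaders. A minimal sketch; the ConfigManager constructor argument is an assumed example and is not taken from this diff:

    from ecopipeline import ConfigManager
    from ecopipeline.load import load_overwrite_database, report_data_loss

    # Hypothetical configuration path; ConfigManager's constructor arguments
    # are not shown in this diff.
    config = ConfigManager("config.ini")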
ecopipeline/load/load.py
CHANGED
@@ -8,7 +8,7 @@ import math
 pd.set_option('display.max_columns', None)
 import mysql.connector.errors as mysqlerrors
 from ecopipeline import ConfigManager
-import datetime
+from datetime import datetime, timedelta
 import numpy as np
 
 data_map = {'int64':'float',
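The import swap is behavioral, not cosmetic: the rest of the module can now call datetime.now() and timedelta(...) directly instead of going through the datetime module. A small equivalence sketch using only the standard library:

    import datetime as dt                      # 0.5.0 spelling: module import
    from datetime import datetime, timedelta   # 0.6.0 spelling: class imports

    # Both expressions build the same "two days ago" cutoff that
    # load_overwrite_database uses for its auto_log_data_loss check.
    cutoff_old = dt.datetime.now() - dt.timedelta(days=2)
    cutoff_new = datetime.now() - timedelta(days=2)
    assert abs((cutoff_old - cutoff_new).total_seconds()) < 1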
@@ -161,97 +161,123 @@ def create_new_columns(cursor : mysql.connector.cursor.MySQLCursor, table_name:
 
     return True
 
-def load_overwrite_database(
+def load_overwrite_database(config : ConfigManager, dataframe: pd.DataFrame, config_info: dict, data_type: str,
+                            primary_key: str = "time_pt", table_name: str = None, auto_log_data_loss : bool = False):
     """
     Loads given pandas DataFrame into a MySQL table overwriting any conflicting data. Uses an UPSERT strategy to ensure any gaps in data are filled.
     Note: will not overwrite values with NULL. Must have a new value to overwrite existing values in database
 
     Parameters
     ----------
-
-
+    config : ecopipeline.ConfigManager
+        The ConfigManager object that holds configuration data for the pipeline.
     dataframe: pd.DataFrame
         The pandas DataFrame to be written into the mySQL server.
     config_info: dict
         The dictionary containing the configuration information in the data upload. This can be aquired through the get_login_info() function in this package
     data_type: str
-        The header name corresponding to the table you wish to write data to.
+        The header name corresponding to the table you wish to write data to.
+    primary_key : str
+        The name of the primary key in the database to upload to. Default as 'time_pt'
+    table_name : str
+        overwrites table name from config_info if needed
+    auto_log_data_loss : bool
+        if set to True, a data loss event will be reported if no data exits in the dataframe
+        for the last two days from the current date OR if an error occurs
 
     Returns
     -------
     bool:
         A boolean value indicating if the data was successfully written to the database.
     """
-    #
-
-
-    dbname = config_info['database']
-    if table_name == None:
-        table_name = config_info[data_type]["table_name"]
-
-    if(len(dataframe.index) <= 0):
-        print(f"Attempted to write to {table_name} but dataframe was empty.")
-        return True
-
-    print(f"Attempting to write data for {dataframe.index[0]} to {dataframe.index[-1]} into {table_name}")
-
-    # Get string of all column names for sql insert
-    sensor_names = primary_key
-    sensor_types = ["datetime"]
-    for column in dataframe.columns:
-        sensor_names += "," + column
-        sensor_types.append(data_map[dataframe[column].dtype.name])
+    # Database Connection
+    db_connection, cursor = config.connect_db()
+    try:
 
-
-
-    for column in dataframe.columns:
-        insert_str += "%s, "
-    insert_str += "%s)"
-
-    # last_time = datetime.datetime.strptime('20/01/1990', "%d/%m/%Y") # arbitrary past date
-    existing_rows_list = []
+        # Drop empty columns
+        dataframe = dataframe.dropna(axis=1, how='all')
 
-
-
-
-
-
-
-
-
-    cursor.execute(
-        f"SELECT {primary_key} FROM {table_name} WHERE {primary_key} >= '{dataframe.index.min()}'")
-    # Fetch the results into a DataFrame
-    existing_rows = pd.DataFrame(cursor.fetchall(), columns=[primary_key])
+        dbname = config_info['database']
+        if table_name == None:
+            table_name = config_info[data_type]["table_name"]
+
+        if(len(dataframe.index) <= 0):
+            print(f"Attempted to write to {table_name} but dataframe was empty.")
+            ret_value = True
+        else:
 
-
-
+            print(f"Attempting to write data for {dataframe.index[0]} to {dataframe.index[-1]} into {table_name}")
+            if auto_log_data_loss and dataframe.index[-1] < datetime.now() - timedelta(days=2):
+                report_data_loss(config)
+
+            # Get string of all column names for sql insert
+            sensor_names = primary_key
+            sensor_types = ["datetime"]
+            for column in dataframe.columns:
+                sensor_names += "," + column
+                sensor_types.append(data_map[dataframe[column].dtype.name])
+
+            # create SQL statement
+            insert_str = "INSERT INTO " + table_name + " (" + sensor_names + ") VALUES ("
+            for column in dataframe.columns:
+                insert_str += "%s, "
+            insert_str += "%s)"
+
+            # last_time = datetime.strptime('20/01/1990', "%d/%m/%Y") # arbitrary past date
+            existing_rows_list = []
+
+            # create db table if it does not exist, otherwise add missing columns to existing table
+            if not check_table_exists(cursor, table_name, dbname):
+                if not create_new_table(cursor, table_name, sensor_names.split(",")[1:], sensor_types[1:], primary_key=primary_key): #split on colums and remove first column aka time_pt
+                    ret_value = False
+                    raise Exception(f"Could not create new table {table_name} in database {dbname}")
+            else:
+                try:
+                    # find existing times in database for upsert statement
+                    cursor.execute(
+                        f"SELECT {primary_key} FROM {table_name} WHERE {primary_key} >= '{dataframe.index.min()}'")
+                    # Fetch the results into a DataFrame
+                    existing_rows = pd.DataFrame(cursor.fetchall(), columns=[primary_key])
+
+                    # Convert the primary_key column to a list
+                    existing_rows_list = existing_rows[primary_key].tolist()
+
+                except mysqlerrors.Error:
+                    print(f"Table {table_name} has no data.")
+
+            missing_cols, missing_types = find_missing_columns(cursor, dataframe, config_info, table_name)
+            if len(missing_cols):
+                if not create_new_columns(cursor, table_name, missing_cols, missing_types):
+                    print("Unable to add new columns due to database error.")
+
+            updatedRows = 0
+            for index, row in dataframe.iterrows():
+                time_data = row.values.tolist()
+                #remove nans and infinites
+                time_data = [None if (x is None or pd.isna(x)) else x for x in time_data]
+                time_data = [None if (x == float('inf') or x == float('-inf')) else x for x in time_data]
+
+                if index in existing_rows_list:
+                    statement, values = _generate_mysql_update(row, index, table_name, primary_key)
+                    if statement != "":
+                        cursor.execute(statement, values)
+                        updatedRows += 1
+                else:
+                    cursor.execute(insert_str, (index, *time_data))
 
-
-    print(f"
+            db_connection.commit()
+            print(f"Successfully wrote {len(dataframe.index)} rows to table {table_name} in database {dbname}. {updatedRows} existing rows were overwritten.")
+            ret_value = True
+    except Exception as e:
+        print(f"Unable to load data into database. Exception: {e}")
+        if auto_log_data_loss:
+            report_data_loss(config)
+        ret_value = False
 
-
-
-
-            print("Unable to add new columns due to database error.")
-
-    updatedRows = 0
-    for index, row in dataframe.iterrows():
-        time_data = row.values.tolist()
-        #remove nans and infinites
-        time_data = [None if (x is None or pd.isna(x)) else x for x in time_data]
-        time_data = [None if (x == float('inf') or x == float('-inf')) else x for x in time_data]
-
-        if index in existing_rows_list:
-            statement, values = _generate_mysql_update(row, index, table_name, primary_key)
-            if statement != "":
-                cursor.execute(statement, values)
-                updatedRows += 1
-        else:
-            cursor.execute(insert_str, (index, *time_data))
+    db_connection.close()
+    cursor.close()
+    return ret_value
 
-    print(f"Successfully wrote {len(dataframe.index)} rows to table {table_name} in database {dbname}. {updatedRows} existing rows were overwritten.")
-    return True
 
 def load_event_table(config : ConfigManager, event_df: pd.DataFrame, site_name : str = None):
     """
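A hedged usage sketch of the reworked loader. The call signature, the time_pt primary-key default, and the auto_log_data_loss behaviour (a DATA_LOSS event is reported when the newest row is more than two days old, or when an exception occurs) come from the diff above; the ConfigManager argument, the config_info contents, and the DataFrame below are illustrative assumptions.

    import pandas as pd
    from datetime import datetime, timedelta
    from ecopipeline import ConfigManager
    from ecopipeline.load import load_overwrite_database

    config = ConfigManager("config.ini")          # hypothetical config path
    config_info = {                               # hypothetical shape; see get_login_info()
        "database": "ecotope_db",
        "minute": {"table_name": "site_minute_data"},
    }

    # Time-indexed frame; the index becomes the time_pt primary key column.
    df = pd.DataFrame(
        {"power_kw": [1.2, 1.3]},
        index=pd.to_datetime([datetime.now() - timedelta(minutes=1), datetime.now()]),
    )

    # Existing rows at matching timestamps are updated, new timestamps are inserted,
    # and the database connection is opened and closed inside the function.
    ok = load_overwrite_database(config, df, config_info, "minute", auto_log_data_loss=True)
    print("upload succeeded:", ok)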
@@ -299,7 +325,7 @@ def load_event_table(config : ConfigManager, event_df: pd.DataFrame, site_name :
         column_names += "," + column
 
     # create SQL statement
-    insert_str = "INSERT INTO " + table_name + " (" + column_names + ", last_modified_date, last_modified_by) VALUES (%s,%s,%s,%s,%s,'"+datetime.
+    insert_str = "INSERT INTO " + table_name + " (" + column_names + ", last_modified_date, last_modified_by) VALUES (%s,%s,%s,%s,%s,'"+datetime.now().strftime('%Y-%m-%d %H:%M:%S')+"','automatic_upload')"
 
     # add aditional columns for db creation
     full_column_names = column_names.split(",")[1:]
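For reference, the format string that the updated line splices into the INSERT renders timestamps in MySQL DATETIME form; a one-line check:

    from datetime import datetime

    # Prints something like '2024-05-01 13:45:09', the literal written to the
    # last_modified_date column by the statement above.
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))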
@@ -368,6 +394,71 @@ def load_event_table(config : ConfigManager, event_df: pd.DataFrame, site_name :
     cursor.close()
     return True
 
+def report_data_loss(config : ConfigManager, site_name : str = None):
+    """
+    Logs data loss event in event database (assumes one exists)
+
+    Parameters
+    ----------
+    config : ecopipeline.ConfigManager
+        The ConfigManager object that holds configuration data for the pipeline.
+    site_name : str
+        the name of the site to correspond the events with. If left blank will default to minute table name
+
+    Returns
+    -------
+    bool:
+        A boolean value indicating if the data was successfully written to the database.
+    """
+    # Drop empty columns
+
+    dbname = config.get_db_name()
+    table_name = "site_events"
+    if site_name is None:
+        site_name = config.get_site_name()
+    error_string = "Error proccessing data. Please check logs to resolve."
+
+    print(f"logging DATA_LOSS into {table_name}")
+
+    # create SQL statement
+    insert_str = "INSERT INTO " + table_name + " (start_time_pt, site_name, event_detail, event_type, last_modified_date, last_modified_by) VALUES "
+    insert_str += f"('{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}','{site_name}','{error_string}','DATA_LOSS','{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}','automatic_upload')"
+
+    existing_rows = pd.DataFrame({
+        'id' : []
+    })
+
+    connection, cursor = config.connect_db()
+
+    # create db table if it does not exist, otherwise add missing columns to existing table
+    if not check_table_exists(cursor, table_name, dbname):
+        print(f"Cannot log data loss. {table_name} does not exist in database {dbname}")
+        return False
+    else:
+        try:
+            # find existing times in database for upsert statement
+            cursor.execute(
+                f"SELECT id FROM {table_name} WHERE end_time_pt IS NULL AND site_name = '{site_name}' AND event_type = 'DATA_LOSS'")
+            # Fetch the results into a DataFrame
+            existing_rows = pd.DataFrame(cursor.fetchall(), columns=['id'])
+
+        except mysqlerrors.Error as e:
+            print(f"Retrieving data from {table_name} caused exception: {e}")
+    try:
+
+        if existing_rows.empty:
+            cursor.execute(insert_str)
+            connection.commit()
+            print("Successfully logged data loss.")
+        else:
+            print("Data loss already logged.")
+    except Exception as e:
+        # Print the exception message
+        print(f"Caught an exception when uploading to site_events table: {e}")
+    connection.close()
+    cursor.close()
+    return True
+
 def _generate_mysql_update_event_table(row, id):
     statement = f"UPDATE site_events SET "
     statment_elems = []
@@ -379,7 +470,7 @@ def _generate_mysql_update_event_table(row, id):
 
     if values:
         statement += ", ".join(statment_elems)
-        statement += f", last_modified_by = 'automatic_upload', last_modified_date = '{datetime.
+        statement += f", last_modified_by = 'automatic_upload', last_modified_date = '{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}'"
         statement += f" WHERE id = {id};"
         # statement += f" WHERE start_time_pt = '{start_time_pt}' AND end_time_pt = '{end_time_pt}' AND event_type = '{event_type}' AND site_name = '{site_name}';"
     else:
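report_data_loss can also be called directly, for example from a wrapper around a pipeline run; load_overwrite_database calls it automatically when auto_log_data_loss is set. The sketch below is illustrative: the pipeline step is a made-up stand-in, while the deduplication behaviour (no new row while an open DATA_LOSS event with end_time_pt IS NULL exists for the site) and the requirement that site_events already exist come from the code above.

    from ecopipeline import ConfigManager
    from ecopipeline.load import report_data_loss

    config = ConfigManager("config.ini")  # hypothetical config path

    def run_minute_pipeline(cfg: ConfigManager) -> None:
        """Hypothetical pipeline step standing in for extract/transform/load calls."""
        raise RuntimeError("sensor endpoint unreachable")

    try:
        run_minute_pipeline(config)
    except Exception:
        # Inserts a DATA_LOSS row into site_events (the table must already exist);
        # an already-open DATA_LOSS event for this site is not duplicated.
        report_data_loss(config)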
{ecopipeline-0.5.0.dist-info → ecopipeline-0.6.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.2
 Name: ecopipeline
-Version: 0.5.0
+Version: 0.6.0
 Summary: Contains functions for use in Ecotope Datapipelines
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: GNU General Public License (GPL)
{ecopipeline-0.5.0.dist-info → ecopipeline-0.6.0.dist-info}/RECORD
CHANGED

@@ -1,8 +1,8 @@
 ecopipeline/__init__.py,sha256=vCRzwd781ciCSXMP1ycM_BXAqxj3KVaNKIjsLOPcbwc,171
 ecopipeline/extract/__init__.py,sha256=3u_CUMdCguVewU3kN8x6xhVNyo1-p-gwTrhjOh7Psqg,645
 ecopipeline/extract/extract.py,sha256=ryorqnu1RgyNK7joZRcbMmTajlTlB5hwaYzzpo8Z8Ho,43369
-ecopipeline/load/__init__.py,sha256=
-ecopipeline/load/load.py,sha256=
+ecopipeline/load/__init__.py,sha256=oDAVF8AhK_qugqegjW7jK16p-nb9QzKhiNQOkEBniKM,235
+ecopipeline/load/load.py,sha256=SzbUSq7M2f2Ifj_bbHzDRP63-x6UBJbEZ1C4blCpC6U,21416
 ecopipeline/transform/__init__.py,sha256=DcIJfkRs4OmZzDeEfW_OiOIXNqN6CUl1_lW0SS7-eN8,2280
 ecopipeline/transform/bayview.py,sha256=TP24dnTsUD95X-f6732egPZKjepFLJgDm9ImGr-fppY,17899
 ecopipeline/transform/lbnl.py,sha256=EQ54G4rJXaZ7pwVusKcdK2KBehSdCsNo2ybphtMGs7o,33400
@@ -10,8 +10,8 @@ ecopipeline/transform/transform.py,sha256=uyBIXKCXUCT6zVnZyQohripGAzmY1yV9T1GxsX
 ecopipeline/utils/ConfigManager.py,sha256=t4sfTjGO0g5P50XBQqGVFWaXfAlW1GMDh1DLoBuFGks,9826
 ecopipeline/utils/__init__.py,sha256=ccWUR0m7gD9DfcgsxBCLOfi4lho6RdYuB2Ugy_g6ZdQ,28
 ecopipeline/utils/unit_convert.py,sha256=VFh1we2Y8KV3u21BeWb-U3TlZJXo83q5vdxxkpgcuME,3064
-ecopipeline-0.
-ecopipeline-0.
-ecopipeline-0.
-ecopipeline-0.
-ecopipeline-0.
+ecopipeline-0.6.0.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ecopipeline-0.6.0.dist-info/METADATA,sha256=HgOhlhoLCXiA1vSP42F7RlHURYBuCTiml5VR48BF7Jo,2307
+ecopipeline-0.6.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ecopipeline-0.6.0.dist-info/top_level.txt,sha256=WOPFJH2LIgKqm4lk2OnFF5cgVkYibkaBxIxgvLgO7y0,12
+ecopipeline-0.6.0.dist-info/RECORD,,
{ecopipeline-0.5.0.dist-info → ecopipeline-0.6.0.dist-info}/LICENSE
File without changes

{ecopipeline-0.5.0.dist-info → ecopipeline-0.6.0.dist-info}/top_level.txt
File without changes