ecopipeline 0.5.1.tar.gz → 0.6.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ecopipeline-0.5.1/src/ecopipeline.egg-info → ecopipeline-0.6.2}/PKG-INFO +1 -1
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/setup.cfg +1 -1
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline/load/load.py +104 -76
- {ecopipeline-0.5.1 → ecopipeline-0.6.2/src/ecopipeline.egg-info}/PKG-INFO +1 -1
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/LICENSE +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/README.md +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/pyproject.toml +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/setup.py +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline/__init__.py +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline/extract/__init__.py +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline/extract/extract.py +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline/load/__init__.py +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline/transform/__init__.py +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline/transform/bayview.py +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline/transform/lbnl.py +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline/transform/transform.py +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline/utils/ConfigManager.py +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline/utils/__init__.py +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline/utils/unit_convert.py +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline.egg-info/SOURCES.txt +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline.egg-info/dependency_links.txt +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline.egg-info/requires.txt +0 -0
- {ecopipeline-0.5.1 → ecopipeline-0.6.2}/src/ecopipeline.egg-info/top_level.txt +0 -0
setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = ecopipeline
-version = 0.5.1
+version = 0.6.2
 authors = ["Carlos Bello, <bellocarlos@seattleu.edu>, Emil Fahrig <fahrigemil@seattleu.edu>, Casey Mang <cmang@seattleu.edu>, Julian Harris <harrisjulian@seattleu.edu>, Roger Tram <rtram@seattleu.edu>, Nolan Price <nolan@ecotope.com>"]
 description = Contains functions for use in Ecotope Datapipelines
 long_description = file: README.md
src/ecopipeline/load/load.py
@@ -8,7 +8,7 @@ import math
 pd.set_option('display.max_columns', None)
 import mysql.connector.errors as mysqlerrors
 from ecopipeline import ConfigManager
-import datetime
+from datetime import datetime, timedelta
 import numpy as np

 data_map = {'int64':'float',
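Context sketch, not part of the diff: the switch from a plain module import to "from datetime import datetime, timedelta" is what lets the 0.6.2 hunks below call datetime.now() and timedelta(days=3) directly. A minimal illustration of the equivalent calls under both import styles:

import datetime as _dt                      # plain module import, as in 0.5.1
from datetime import datetime, timedelta    # style introduced in 0.6.2

module_style_now = _dt.datetime.now()       # how now() is reached under a module import
direct_now = datetime.now()                 # how the new hunks below can call it
stale_cutoff = datetime.now() - timedelta(days=3)  # pattern used by the new auto_log_data_loss check
print(module_style_now, direct_now, stale_cutoff)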
@@ -161,97 +161,123 @@ def create_new_columns(cursor : mysql.connector.cursor.MySQLCursor, table_name:

     return True

-def load_overwrite_database(
+def load_overwrite_database(config : ConfigManager, dataframe: pd.DataFrame, config_info: dict, data_type: str,
+                            primary_key: str = "time_pt", table_name: str = None, auto_log_data_loss : bool = False):
     """
     Loads given pandas DataFrame into a MySQL table overwriting any conflicting data. Uses an UPSERT strategy to ensure any gaps in data are filled.
     Note: will not overwrite values with NULL. Must have a new value to overwrite existing values in database

     Parameters
     ----------
-
-
+    config : ecopipeline.ConfigManager
+        The ConfigManager object that holds configuration data for the pipeline.
     dataframe: pd.DataFrame
         The pandas DataFrame to be written into the mySQL server.
     config_info: dict
         The dictionary containing the configuration information in the data upload. This can be aquired through the get_login_info() function in this package
     data_type: str
-        The header name corresponding to the table you wish to write data to.
+        The header name corresponding to the table you wish to write data to.
+    primary_key : str
+        The name of the primary key in the database to upload to. Default as 'time_pt'
+    table_name : str
+        overwrites table name from config_info if needed
+    auto_log_data_loss : bool
+        if set to True, a data loss event will be reported if no data exits in the dataframe
+        for the last two days from the current date OR if an error occurs

     Returns
     -------
     bool:
         A boolean value indicating if the data was successfully written to the database.
     """
-    #
-
-
-    dbname = config_info['database']
-    if table_name == None:
-        table_name = config_info[data_type]["table_name"]
-
-    if(len(dataframe.index) <= 0):
-        print(f"Attempted to write to {table_name} but dataframe was empty.")
-        return True
+    # Database Connection
+    db_connection, cursor = config.connect_db()
+    try:

-
-
-    # Get string of all column names for sql insert
-    sensor_names = primary_key
-    sensor_types = ["datetime"]
-    for column in dataframe.columns:
-        sensor_names += "," + column
-        sensor_types.append(data_map[dataframe[column].dtype.name])
+        # Drop empty columns
+        dataframe = dataframe.dropna(axis=1, how='all')

-
-
-
-
-
-
-
-
-
-    # create db table if it does not exist, otherwise add missing columns to existing table
-    if not check_table_exists(cursor, table_name, dbname):
-        if not create_new_table(cursor, table_name, sensor_names.split(",")[1:], sensor_types[1:], primary_key=primary_key): #split on colums and remove first column aka time_pt
-            print(f"Could not create new table {table_name} in database {dbname}")
-            return False
-    else:
-        try:
-            # find existing times in database for upsert statement
-            cursor.execute(
-                f"SELECT {primary_key} FROM {table_name} WHERE {primary_key} >= '{dataframe.index.min()}'")
-            # Fetch the results into a DataFrame
-            existing_rows = pd.DataFrame(cursor.fetchall(), columns=[primary_key])
+        dbname = config_info['database']
+        if table_name == None:
+            table_name = config_info[data_type]["table_name"]
+
+        if(len(dataframe.index) <= 0):
+            print(f"Attempted to write to {table_name} but dataframe was empty.")
+            ret_value = True
+        else:

-
-
+            print(f"Attempting to write data for {dataframe.index[0]} to {dataframe.index[-1]} into {table_name}")
+            if auto_log_data_loss and dataframe.index[-1] < datetime.now() - timedelta(days=3):
+                report_data_loss(config)
+
+            # Get string of all column names for sql insert
+            sensor_names = primary_key
+            sensor_types = ["datetime"]
+            for column in dataframe.columns:
+                sensor_names += "," + column
+                sensor_types.append(data_map[dataframe[column].dtype.name])
+
+            # create SQL statement
+            insert_str = "INSERT INTO " + table_name + " (" + sensor_names + ") VALUES ("
+            for column in dataframe.columns:
+                insert_str += "%s, "
+            insert_str += "%s)"
+
+            # last_time = datetime.strptime('20/01/1990', "%d/%m/%Y") # arbitrary past date
+            existing_rows_list = []
+
+            # create db table if it does not exist, otherwise add missing columns to existing table
+            if not check_table_exists(cursor, table_name, dbname):
+                if not create_new_table(cursor, table_name, sensor_names.split(",")[1:], sensor_types[1:], primary_key=primary_key): #split on colums and remove first column aka time_pt
+                    ret_value = False
+                    raise Exception(f"Could not create new table {table_name} in database {dbname}")
+            else:
+                try:
+                    # find existing times in database for upsert statement
+                    cursor.execute(
+                        f"SELECT {primary_key} FROM {table_name} WHERE {primary_key} >= '{dataframe.index.min()}'")
+                    # Fetch the results into a DataFrame
+                    existing_rows = pd.DataFrame(cursor.fetchall(), columns=[primary_key])
+
+                    # Convert the primary_key column to a list
+                    existing_rows_list = existing_rows[primary_key].tolist()
+
+                except mysqlerrors.Error:
+                    print(f"Table {table_name} has no data.")
+
+            missing_cols, missing_types = find_missing_columns(cursor, dataframe, config_info, table_name)
+            if len(missing_cols):
+                if not create_new_columns(cursor, table_name, missing_cols, missing_types):
+                    print("Unable to add new columns due to database error.")
+
+            updatedRows = 0
+            for index, row in dataframe.iterrows():
+                time_data = row.values.tolist()
+                #remove nans and infinites
+                time_data = [None if (x is None or pd.isna(x)) else x for x in time_data]
+                time_data = [None if (x == float('inf') or x == float('-inf')) else x for x in time_data]
+
+                if index in existing_rows_list:
+                    statement, values = _generate_mysql_update(row, index, table_name, primary_key)
+                    if statement != "":
+                        cursor.execute(statement, values)
+                        updatedRows += 1
+                else:
+                    cursor.execute(insert_str, (index, *time_data))

-
-        print(f"
+            db_connection.commit()
+            print(f"Successfully wrote {len(dataframe.index)} rows to table {table_name} in database {dbname}. {updatedRows} existing rows were overwritten.")
+            ret_value = True
+    except Exception as e:
+        print(f"Unable to load data into database. Exception: {e}")
+        if auto_log_data_loss:
+            report_data_loss(config)
+        ret_value = False

-
-
-
-            print("Unable to add new columns due to database error.")
-
-    updatedRows = 0
-    for index, row in dataframe.iterrows():
-        time_data = row.values.tolist()
-        #remove nans and infinites
-        time_data = [None if (x is None or pd.isna(x)) else x for x in time_data]
-        time_data = [None if (x == float('inf') or x == float('-inf')) else x for x in time_data]
-
-        if index in existing_rows_list:
-            statement, values = _generate_mysql_update(row, index, table_name, primary_key)
-            if statement != "":
-                cursor.execute(statement, values)
-                updatedRows += 1
-        else:
-            cursor.execute(insert_str, (index, *time_data))
+    db_connection.close()
+    cursor.close()
+    return ret_value

-    print(f"Successfully wrote {len(dataframe.index)} rows to table {table_name} in database {dbname}. {updatedRows} existing rows were overwritten.")
-    return True

 def load_event_table(config : ConfigManager, event_df: pd.DataFrame, site_name : str = None):
     """
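Illustrative usage, not taken from the package: a minimal sketch of calling the reworked load_overwrite_database with its new config-first signature. The ConfigManager constructor arguments, database name, data_type key, table mapping, and sensor column below are all assumptions; the import path simply mirrors the file layout listed above.

import pandas as pd
from ecopipeline import ConfigManager
from ecopipeline.load.load import load_overwrite_database

config = ConfigManager("ini/")  # constructor arguments are an assumption, not from the package
config_info = {
    "database": "ecotope_data",                 # hypothetical database name
    "minute": {"table_name": "site_a_minute"},  # hypothetical data_type -> table mapping
}
df = pd.DataFrame(
    {"PowerIn_Total": [1.2, 1.4]},              # hypothetical sensor column
    index=pd.date_range("2024-01-01", periods=2, freq="min", name="time_pt"),
)
# UPSERTs the rows: existing time_pt rows are updated, new ones inserted.
ok = load_overwrite_database(config, df, config_info, data_type="minute",
                             auto_log_data_loss=True)
print(ok)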
@@ -299,7 +325,7 @@ def load_event_table(config : ConfigManager, event_df: pd.DataFrame, site_name :
         column_names += "," + column

     # create SQL statement
-    insert_str = "INSERT INTO " + table_name + " (" + column_names + ", last_modified_date, last_modified_by) VALUES (%s,%s,%s,%s,%s,'"+datetime.
+    insert_str = "INSERT INTO " + table_name + " (" + column_names + ", last_modified_date, last_modified_by) VALUES (%s,%s,%s,%s,%s,'"+datetime.now().strftime('%Y-%m-%d %H:%M:%S')+"','automatic_upload')"

     # add aditional columns for db creation
     full_column_names = column_names.split(",")[1:]
@@ -390,13 +416,13 @@ def report_data_loss(config : ConfigManager, site_name : str = None):
     table_name = "site_events"
     if site_name is None:
         site_name = config.get_site_name()
-    error_string = "Error
+    error_string = "Error processing data. Please check logs to resolve."

     print(f"logging DATA_LOSS into {table_name}")

     # create SQL statement
     insert_str = "INSERT INTO " + table_name + " (start_time_pt, site_name, event_detail, event_type, last_modified_date, last_modified_by) VALUES "
-    insert_str += f"('{datetime.
+    insert_str += f"('{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}','{site_name}','{error_string}','DATA_LOSS','{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}','automatic_upload')"

     existing_rows = pd.DataFrame({
         'id' : []
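For illustration only, not part of the diff: rendering the completed site_events INSERT built above with hypothetical values shows the final SQL that report_data_loss now executes. The site name and timestamp below are assumptions.

from datetime import datetime

table_name = "site_events"
site_name = "example_site"                                   # hypothetical site name
error_string = "Error processing data. Please check logs to resolve."
now = datetime(2024, 1, 15, 8, 0, 0).strftime('%Y-%m-%d %H:%M:%S')  # fixed timestamp for the example

insert_str = "INSERT INTO " + table_name + " (start_time_pt, site_name, event_detail, event_type, last_modified_date, last_modified_by) VALUES "
insert_str += f"('{now}','{site_name}','{error_string}','DATA_LOSS','{now}','automatic_upload')"
print(insert_str)
# -> INSERT INTO site_events (start_time_pt, site_name, event_detail, event_type, last_modified_date, last_modified_by) VALUES ('2024-01-15 08:00:00','example_site','Error processing data. Please check logs to resolve.','DATA_LOSS','2024-01-15 08:00:00','automatic_upload')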
@@ -412,7 +438,7 @@ def report_data_loss(config : ConfigManager, site_name : str = None):
     try:
         # find existing times in database for upsert statement
         cursor.execute(
-            f"SELECT id FROM {table_name} WHERE end_time_pt IS NULL AND site_name = '{site_name}' AND event_type = 'DATA_LOSS'
+            f"SELECT id FROM {table_name} WHERE end_time_pt IS NULL AND site_name = '{site_name}' AND event_type = 'DATA_LOSS'")
         # Fetch the results into a DataFrame
         existing_rows = pd.DataFrame(cursor.fetchall(), columns=['id'])

@@ -422,8 +448,10 @@ def report_data_loss(config : ConfigManager, site_name : str = None):

         if existing_rows.empty:
             cursor.execute(insert_str)
-
-
+            connection.commit()
+            print("Successfully logged data loss.")
+        else:
+            print("Data loss already logged.")
     except Exception as e:
         # Print the exception message
         print(f"Caught an exception when uploading to site_events table: {e}")
@@ -442,7 +470,7 @@ def _generate_mysql_update_event_table(row, id):

     if values:
         statement += ", ".join(statment_elems)
-        statement += f", last_modified_by = 'automatic_upload', last_modified_date = '{datetime.
+        statement += f", last_modified_by = 'automatic_upload', last_modified_date = '{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}'"
         statement += f" WHERE id = {id};"
         # statement += f" WHERE start_time_pt = '{start_time_pt}' AND end_time_pt = '{end_time_pt}' AND event_type = '{event_type}' AND site_name = '{site_name}';"
     else:
|