berryworld 1.0.0.196834__py3-none-any.whl → 1.0.0.197207__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- berryworld/__init__.py +2 -3
- berryworld/credentials.py +0 -26
- berryworld/sql_connenction.py +1211 -37
- {berryworld-1.0.0.196834.dist-info → berryworld-1.0.0.197207.dist-info}/METADATA +1 -1
- {berryworld-1.0.0.196834.dist-info → berryworld-1.0.0.197207.dist-info}/RECORD +8 -8
- {berryworld-1.0.0.196834.dist-info → berryworld-1.0.0.197207.dist-info}/WHEEL +0 -0
- {berryworld-1.0.0.196834.dist-info → berryworld-1.0.0.197207.dist-info}/licenses/LICENSE +0 -0
- {berryworld-1.0.0.196834.dist-info → berryworld-1.0.0.197207.dist-info}/top_level.txt +0 -0
berryworld/sql_connenction.py
CHANGED
@@ -2,6 +2,7 @@ import os
 import re
 import ast
 import math
+import time
 import pyodbc
 import traceback
 import numpy as np
@@ -9,6 +10,8 @@ import pandas as pd
 import sqlalchemy as sa
 from urllib import parse
 from numbers import Number
+from threading import Thread
+from sqlalchemy.pool import QueuePool
 
 
 class SQLConnection:
@@ -137,9 +140,9 @@ class SQLConnection:
         if self.multi_db & (self.server.lower() == 'prod'):
             database = str(self.db_name) + 'Primary'
 
-
-
-        self.engine = sa.create_engine(
+        self.con_string = ('mssql+pyodbc://' + self.user_name + ':%s@' + self.server_name + '/' + database +
+                           '?driver=' + self.driver + self.trusted_certificate + self.encrypt)
+        self.engine = sa.create_engine(self.con_string % parse.quote_plus(self.password))
         if not commit_as_transaction:
             self.engine = self.engine.execution_options(isolation_level="AUTOCOMMIT")
 
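Note: the rewritten block above stores the connection string with a `%s` placeholder and substitutes the URL-encoded password only at engine-creation time, so characters such as '@' or '/' in the password cannot break the URL. A minimal sketch of the same pattern, using placeholder credentials (all values here are hypothetical):

    import sqlalchemy as sa
    from urllib import parse

    # Hypothetical values, for illustration only
    user_name, password = 'svc_user', 'p@ss:word/123'
    server_name, database = 'sql-prod-01', 'FruitFlowPrimary'
    driver = 'ODBC Driver 18 for SQL Server'

    con_string = ('mssql+pyodbc://' + user_name + ':%s@' + server_name + '/' + database +
                  '?driver=' + driver)
    # quote_plus escapes the reserved URL characters in the password
    engine = sa.create_engine(con_string % parse.quote_plus(password))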
@@ -170,7 +173,7 @@ class SQLConnection:
         -----------------------------
         :param sql_query: Query to be sent to SQL
         :param coerce_float: Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal)
-
+                             to floating point.
         :return: DataFrame gathering the requested data
         """
         self.open_read_connection()
@@ -184,21 +187,6 @@ class SQLConnection:
         self.close_connection()
         return data
 
-    @staticmethod
-    def _parse_df(parse_, data, col_names):
-        """ Auxiliar function to convert list to DataFrame
-        :param parse_: Parameter to indicate whether the data has to be transformed into a DataFrame or not
-        :param data: List gathering the data retrieved from SQL
-        :param col_names: List of columns to create the DataFrame
-        :return: Formatted data
-        """
-        if parse_ is True:
-            col_names = list(zip(*list(col_names)))[0]
-            res = pd.DataFrame(list(zip(*data)), index=col_names).T
-        else:
-            res = [col_names, data]
-        return res
-
     def sp_results(self, sql_query, resp_number=None, parse_=True, commit_as_transaction=True, no_count=True):
         """ Execute a stored procedure and retrieves all its output data
         -----------------------------
@@ -288,7 +276,7 @@ class SQLConnection:
         :param table: Table in which the data will be uploaded
         :param truncate: Indicate whether the table has to be truncated before the data is sent or not
         :param delete: Delete the rows from a table (Suitable for tables that cannot be truncated because of
-
+                       external constraints)
         :param identity: Indicate whether the identity columns will be inserted or not
         :param chunk: Indicate how many rows will be uploaded at once
         :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
@@ -296,11 +284,11 @@ class SQLConnection:
             false, it commits data by chunks.
         :param output: Outputs the columns indicated in this list
         :param bools2bits: Indicate whether the Boolean columns should be converted to BIT to be inserted into SQL
-        :return: A DataFrame with the output columns requested if output is not None, else None
         :param nullable: Used within bools2bits function to indicate which boolean column values to convert
         :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
         :param infer_datetime_format: Indicate whether the datetime columns should be converted to string and if so,
-
+                                      then the format to be used
+        :return: A DataFrame with the output columns requested if output is not None, else None
         """
         if output is None:
             output = []
@@ -391,7 +379,7 @@ class SQLConnection:
         :param table: Table in which the data will be uploaded
         :param truncate: Indicate whether the table has to be truncated before the data is sent or not
         :param delete: Delete the rows from a table (Suitable for tables that cannot be truncated because of
-
+                       external constraints)
         :param identity: Indicate whether the identity columns will be inserted or not
         :param chunk: Indicate how many rows will be uploaded at once
         :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
@@ -495,8 +483,8 @@ class SQLConnection:
         :param bool_cols: columns to include as booleans
         :param batch_size: Number of records to update in each iteration
         :param output: Outputs the columns indicated in this list as a DataFrame. It should indicate if the column to
-
-
+                       retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one
+                       will be retrieved)
         :param nullable: Indicate whether to update the table column with null or exclude the reference from the update
         :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
         :return: None
@@ -631,8 +619,8 @@ class SQLConnection:
         :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
         :param bool_cols: columns to include as booleans
         :param output: Outputs the columns indicated in this list as a DataFrame. It should indicate if the column to
-
-
+                       retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one
+                       will be retrieved)
         :param chunk: Indicate how many rows will be uploaded at once
         :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
         :return: None
@@ -759,7 +747,7 @@ class SQLConnection:
         con_.merge(df, staging_schema, staging_table, sp_schema, sp_name, truncate=True)
         -----------------------------
         :param data: DataFrame to insert in the staging table
-        :param staging_schema:
+        :param staging_schema: Schema to staging table
         :param staging_table: Staging table name
         :param sp_schema: Stored Procedure schema
         :param sp_name: Stored Procedure name
@@ -821,11 +809,6 @@ class SQLConnection:
         [True, False], instead of [0,1]. The method need data from type boolean to be inserted as [0, 1].
         3.- When dealing with datetime columns a similar problem arises. time_format is a dict that contains as keys
         the name of a date column and as values the format that the columns has to have.
-        Versions comments...
-        + Difference between version 1.0 and 1.01 is that the last one is a bit simpler, it waits for names of columns
-        which types are booleans or datetime (and format for this one) instead of trying to figure out this columns
-        as in version 1.0 what is sometimes problematic. So, version 1.01 is more reliable but requires more time
-        to write the call to the method.
         -------------------------
         MERGE INTO [SCHEMA].[TABLE] AS TARGET
         USING (
@@ -864,10 +847,10 @@ class SQLConnection:
         :param update_set: list of columns to update
         :param bool_cols: list of columns gathering boolean types
         :param identity: Indicate whether the identity columns will be inserted or not, only make sense when the table
-
+                         in its definition has it. Its a boolean.
         :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
-        :return: None
         :param nullable: Used for the boolean_mapping_data_types to indicate which boolean column values to convert
+        :return: None
         """
         if data is None:
             # no data to upload
@@ -940,6 +923,21 @@ class SQLConnection:
         except Exception:
             raise Exception(traceback.format_exc())
 
+    @staticmethod
+    def _parse_df(parse_, data, col_names):
+        """ Auxiliar function to convert list to DataFrame
+        :param parse_: Parameter to indicate whether the data has to be transformed into a DataFrame or not
+        :param data: List gathering the data retrieved from SQL
+        :param col_names: List of columns to create the DataFrame
+        :return: Formatted data
+        """
+        if parse_ is True:
+            col_names = list(zip(*list(col_names)))[0]
+            res = pd.DataFrame(list(zip(*data)), index=col_names).T
+        else:
+            res = [col_names, data]
+        return res
+
     @staticmethod
     def date_mapping_data_types(data):
         """
@@ -967,8 +965,1184 @@ class SQLConnection:
         """
         Map datetime and boolean variables so they can be inserted in SQL
        :param data: DataFrame containing the variables to map
-        :return: The mapped DataFrame
         :param nullable: Determine if you want to convert null values within boolean columns to boolean format or not
+        :return: The mapped DataFrame
+        """
+        first_index = data.index[0]
+        bool_col = data.columns[
+            [('bool' in str(type(data.loc[first_index, col]))) | ('object' in str(type(data.loc[first_index, col]))) for
+             col in data.columns]]
+        if len(bool_col) > 0:
+            for col in bool_col:
+                if nullable:
+                    bool_not_null = data[data[col].notna()]
+                    if bool_not_null.shape[0] > 0:
+                        for iindex in bool_not_null.index:
+                            data.at[iindex, col] = int(data.loc[iindex, col])
+                else:
+                    data[col] = data[col].apply(lambda x: 1 if x is True else 0)
+
+        return data
+
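Note: the completed boolean_mapping_data_types casts boolean (and object-typed) columns to 0/1 so they fit SQL BIT columns. A small sketch of the two `nullable` modes on hypothetical data, assuming the method is reachable as a static method like the other helpers in this class:

    import pandas as pd

    df = pd.DataFrame({'IsActive': [True, False, None]})
    # nullable=True: only non-null entries are cast to int, the None survives;
    # nullable=False: every entry goes through `1 if x is True else 0`,
    # so the None row is coerced to 0.
    mapped = SQLConnection.boolean_mapping_data_types(df, nullable=True)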
+    @staticmethod
+    def id_next(con_db, table, schema, id_col, print_sql=False):
+        """
+        This static method returns the next id to be inserted into a table for sql_server
+        :param con_db: class to connect to a sql server database
+        :param table: name of the table
+        :param schema: name of the schema
+        :param id_col: name of the id column
+        :param print_sql: bool to indicate if you want sql statement to be print on Python Console
+        :return: Max ID + 1 for id_col
+        """
+        sql_statement = ("SELECT CASE WHEN MAX(%s) IS NULL THEN 1 ELSE MAX(%s) + 1 END AS [Id] FROM [%s].[%s]" % (
+            id_col, id_col, schema, table))
+        if print_sql:
+            print(sql_statement)
+        df = con_db.query(sql_statement)
+        id_ = df.loc[0, 'Id']
+        return id_
+
+    @staticmethod
+    def convert_decimal_str(string):
+        """ Method to parse the Decimal type in python
+        :param string: String variable to parse
+        :return: Parsed string
+        """
+        string = re.sub("'\)(?!(,[ ]+\())(?=([^$]))", "", string)
+        return re.sub("Decimal\('", "", string)
+
+    @staticmethod
+    def infer_datetime(data, infer_datetime_format):
+        """ Method to infer datetime columns and format them as string
+        :param data: DataFrame to parse
+        :param infer_datetime_format: format to be used for the datetime columns
+        :return: Parsed DataFrame
+        """
+        for col in data.select_dtypes(include=['datetime64']).columns:
+            data[col] = pd.to_datetime(data[col]).dt.strftime(infer_datetime_format)
+
+        return data
+
+
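Note: infer_datetime renders every datetime64 column to strings in the supplied format, which sidesteps driver-specific datetime binding. A quick sketch on hypothetical data:

    import pandas as pd

    df = pd.DataFrame({'LoadDate': pd.to_datetime(['2024-01-31', '2024-02-01'])})
    df = SQLConnection.infer_datetime(df, '%Y-%m-%d %H:%M:%S')
    # df['LoadDate'] now holds strings such as '2024-01-31 00:00:00'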
+class SQLPoolEngine:
+    """ Connect a Pool Engine to a Microsoft SQL """
+
+    def __init__(self, db_reference, server, master=False, trusted_certificate=True, encrypt=True, multi_db=False,
+                 commit_as_transaction=True, pool_size=10, max_overflow=10, pool_timeout=30, timeout=300):
+        """ Initialize the class
+        It requires the
+        SQL-DBREFERENCE-PROD = 'server_name db_name user password'
+        -----------------------------
+        db_reference = 'FruitFlow'
+        server = 'prod'
+
+        pool_ = SQLPoolEngine(db_reference, server)
+        -----------------------------
+        :param db_reference: Database reference to connect to
+        :param server: Server to connect to
+        :param master: Indicate whether the connection will be done to master or to a specific database
+        :param trusted_certificate: Indicate whether the connection will be done using the TrustServerCertificate
+        :param encrypt: Indicate whether the connection will use SSL/TLS encryption
+        :param multi_db: Indicate whether the connection will be done to a specific database or to multiple databases
+        :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+        :param pool_size: Number of connections to keep in the pool
+        :param max_overflow: Extra connections beyond pool_size
+        :param pool_timeout: Timeout for getting a connection
+        :param timeout: Connection timeout in seconds
+        """
+        self.con_string_read = None
+        self.con_string_write = None
+        self.engine_read = None
+        self.engine_write = None
+        self.con = None
+        self.commit_as_transaction = commit_as_transaction
+        self.pool_size = pool_size
+        self.max_overflow = max_overflow
+        self.pool_timeout = pool_timeout
+        self.timeout = timeout
+
+        self.db_reference = db_reference.replace("_", "") if "_" in db_reference else db_reference
+        self.server = server
+        if self.server is None:
+            raise ValueError("Please provide a value for server type")
+
+        self.multi_db = multi_db
+        self.master = master
+        if trusted_certificate:
+            self.trusted_certificate = '&TrustServerCertificate=yes'
+        else:
+            self.trusted_certificate = ''
+
+        if encrypt:
+            self.encrypt = '&Encrypt=yes'
+        else:
+            self.encrypt = ''
+
+        drivers = [driver for driver in pyodbc.drivers() if (bool(re.search(r'\d', driver)))]
+        try:
+            self.server_name, self.db_name, self.user_name, self.password = self.credentials()
+        except Exception as e:
+            raise ValueError(
+                f"Cannot find a reference to {self.db_reference} and {self.server.upper()} server: {str(e)}")
+
+        driver_attempt = ''
+        for driver in drivers:
+            try:
+                self.driver = driver
+                self.open_read_connection(commit_as_transaction=self.commit_as_transaction)
+                self.query('''SELECT TOP 1 * FROM information_schema.tables;''')
+                break
+            except Exception as e:
+                print(e)
+                driver_attempt = str(e)
+
+        if driver_attempt != '':
+            raise ValueError(
+                f"Cannot connect to db: {self.db_name} - Error: {str(driver_attempt)}")
+
+        self.create_write_engine(commit_as_transaction=self.commit_as_transaction)
+
+        # Dispose the engine after a certain timeout
+        Thread(target=self.close_connection, args=(True, self.timeout)).start()
+
+    def credentials(self):
+        """ Return the credentials used to connect to the SQL Server
+        :return: Dictionary with the credentials used to connect to the SQL Server
+        """
+        try:
+            server_creds = os.environ.get(f"SQL-{self.db_reference.upper()}")
+            server_creds = ast.literal_eval(server_creds)
+        except Exception as e:
+            raise ValueError(f'DB reference: {self.db_reference} not found. ERROR: {e}')
+
+        try:
+            server_creds = server_creds[self.server.lower()]
+        except Exception as e:
+            raise ValueError(f'Server: {self.server} not found for DB reference: {self.db_reference}. ERROR: {e}')
+
+        if 'server_name' not in server_creds.keys():
+            raise ValueError(f"Server name not provided for {self.db_reference} on {self.server.upper()} server")
+        else:
+            server_name = server_creds['server_name']
+
+        if 'db_name' not in server_creds.keys():
+            raise ValueError(f"Database name not provided for {self.db_reference} on {self.server.upper()} server")
+        else:
+            db_name = server_creds['db_name']
+
+        if 'user_name' not in server_creds.keys():
+            raise ValueError(f"User name not provided for {self.db_reference} on {self.server.upper()} server")
+        else:
+            user_name = server_creds['user_name']
+
+        if 'pwd' not in server_creds.keys():
+            raise ValueError(f"Password not provided for {self.db_reference} on {self.server.upper()} server")
+        else:
+            password = server_creds['pwd']
+
+        return re.sub(r'(\\)\1*', r'\1', server_name), db_name, user_name, password
+
+    def create_read_engine(self, commit_as_transaction=True):
+        """ Create a reading engine
+        :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+        :return: The opened connection
+        """
+        if self.master:
+            self.con_string_read = ('mssql+pyodbc://' + self.user_name + ':%s@' + self.server_name + '/master' +
+                                    '?driver=' + self.driver + '&trusted_connection=yes' + self.trusted_certificate +
+                                    self.encrypt)
+        else:
+            self.con_string_read = ('mssql+pyodbc://' + self.user_name + ':%s@' + self.server_name + '/' +
+                                    self.db_name + '?driver=' + self.driver + self.trusted_certificate + self.encrypt)
+
+        self.engine_read = sa.create_engine(self.con_string_read % parse.quote_plus(self.password),
+                                            poolclass=QueuePool,
+                                            pool_size=self.pool_size,  # Number of connections to keep in the pool
+                                            max_overflow=self.max_overflow,  # Extra connections beyond pool_size
+                                            pool_timeout=self.pool_timeout,  # Timeout for getting a connection
+                                            pool_recycle=self.timeout  # Recycle connections after X minutes
+                                            )
+
+        if not commit_as_transaction:
+            self.engine_read = self.engine_read.execution_options(isolation_level="AUTOCOMMIT")
+
+    def create_write_engine(self, commit_as_transaction=True):
+        """ Create a writing engine
+        :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+        :return: The opened connection
+        """
+        database = self.db_name
+        if self.multi_db & (self.server.lower() == 'prod'):
+            database = str(self.db_name) + 'Primary'
+
+        self.con_string_write = ('mssql+pyodbc://' + self.user_name + ':%s@' + self.server_name + '/' + database +
+                                 '?driver=' + self.driver + self.trusted_certificate + self.encrypt)
+        self.engine_write = sa.create_engine(self.con_string_write % parse.quote_plus(self.password),
+                                             poolclass=QueuePool,
+                                             pool_size=10,  # Number of connections to keep in the pool
+                                             max_overflow=10,  # Extra connections beyond pool_size
+                                             pool_timeout=30,  # Timeout for getting a connection
+                                             pool_recycle=self.timeout  # Recycle connections after X minutes
+                                             )
+
+        if not commit_as_transaction:
+            self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+
+    def open_read_connection(self, commit_as_transaction=True):
+        """ Open a reading connection with the Server
+        :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+        :return: The opened connection
+        """
+        self.create_read_engine(commit_as_transaction=commit_as_transaction)
+        self.con = self.engine_read.connect().connection
+
+    def close_connection(self, timeout=0):
+        """ Dispose any opened engines with the Server
+        :return: None
+        """
+        if timeout > 0:
+            time.sleep(timeout)
+
+        if self.engine_read:
+            self.engine_read.dispose()
+
+        if self.engine_write:
+            self.engine_write.dispose()
+
+    def query(self, sql_query, coerce_float=False):
+        """ Read data from SQL according to the sql_query
+        -----------------------------
+        query_str = "SELECT * FROM %s" & table
+        con_.query(query_str)
+        -----------------------------
+        :param sql_query: Query to be sent to SQL
+        :param coerce_float: Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal)
+                             to floating point.
+        :return: DataFrame gathering the requested data
+        """
+        if self.con is None:
+            self.con = self.engine_read.connect().connection
+
+        data = None
+        try:
+            with self.engine_read.begin() as conn:
+                data = pd.read_sql_query(sa.text(sql_query), conn, coerce_float=coerce_float)
+        except ValueError:
+            print(traceback.format_exc())
+        finally:
+            self.con.close()
+        return data
+
+
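Note: per its docstring and the credentials() method above, SQLPoolEngine expects an environment variable named after the DB reference, holding a dict literal keyed by server. A usage sketch with hypothetical values:

    # Hypothetical environment setup, mirroring what credentials() parses:
    # SQL-FRUITFLOW = "{'prod': {'server_name': '...', 'db_name': '...',
    #                            'user_name': '...', 'pwd': '...'}}"
    pool_ = SQLPoolEngine('FruitFlow', 'prod', pool_size=10, timeout=300)
    df = pool_.query('SELECT TOP 1 * FROM information_schema.tables')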
+class SQLConnectionPool:
+    """ Connect to a Microsoft SQL Server using connection pooling """
+
+    def __init__(self, pool_class):
+        """ Initialize the class
+        It requires an instance of the SQLPoolEngine to work properly
+        -----------------------------
+        con_ = SQLConnectionPool(SQLPoolEngine)
+        -----------------------------
+        :param pool_class: SQLAlchemy Pool class to use for the connections
+        """
+        self.con_read = None
+        self.con_write = None
+        self.engine_read = pool_class.engine_read
+        self.engine_write = pool_class.engine_write
+        self.con_string_read = pool_class.con_string_read
+        self.con_string_write = pool_class.con_string_write
+        self.commit_as_transaction = pool_class.commit_as_transaction
+        self.db_name = pool_class.db_name
+        self.server = pool_class.server
+        self.timeout = pool_class.timeout
+
+        Thread(target=self.close_connection, args=(self.timeout,)).start()
+
+    def close_connection(self, timeout=0):
+        """ Close any opened connections with the Server
+        :return: None
+        """
+        if timeout > 0:
+            time.sleep(timeout)
+
+        if self.con_read is not None:
+            self.con_read.close()
+        if self.engine_read:
+            self.engine_read.dispose()
+
+        if self.con_write is not None:
+            self.con_write.close()
+        if self.engine_write:
+            self.engine_write.dispose()
+
+    def query(self, sql_query, coerce_float=False):
+        """ Read data from SQL according to the sql_query
+        -----------------------------
+        query_str = "SELECT * FROM %s" & table
+        con_.query(query_str)
+        -----------------------------
+        :param sql_query: Query to be sent to SQL
+        :param coerce_float: Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal)
+                             to floating point.
+        :return: DataFrame gathering the requested data
+        """
+        if self.con_read is None:
+            self.con_read = self.engine_read.connect().connection
+
+        data = None
+        try:
+            with self.engine_read.begin() as conn:
+                data = pd.read_sql_query(sa.text(sql_query), conn, coerce_float=coerce_float)
+        except ValueError:
+            print(traceback.format_exc())
+        return data
+
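Note: SQLConnectionPool does not open engines of its own; it reuses the read/write engines of an existing SQLPoolEngine, so several workers can share one pool. A sketch following the docstring example:

    pool_ = SQLPoolEngine('FruitFlow', 'prod')
    con_ = SQLConnectionPool(pool_)
    df = con_.query('SELECT TOP 10 * FROM information_schema.tables')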
+    def sp_results(self, sql_query, resp_number=None, parse_=True, no_count=True, commit_as_transaction=True):
+        """ Execute a stored procedure and retrieves all its output data
+        -----------------------------
+        query_str = "EXECUTE %s" & stored_procedure
+        con_.sp_results(query_str, resp_number=1)
+        -----------------------------
+        :param sql_query: Query to be sent to SQL
+        :param resp_number: Indicate which of the stored procedures responses will be retrieved
+        :param parse_: Indicate whether the output needs to be converted to a DataFrame or not
+        :param no_count: Indicate whether SET NOCOUNT option is ON (True) or OFF (False)
+        :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+        :return: DataFrame list gathering the requested data
+        """
+        if self.commit_as_transaction != commit_as_transaction:
+            self.commit_as_transaction = commit_as_transaction
+            if not commit_as_transaction:
+                self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+            else:
+                self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
+
+        if self.con_write is None:
+            self.con_write = self.engine_write.connect().connection
+
+        data_list = list()
+        cursor = None
+        try:
+            cursor = self.con_write.cursor()
+            if no_count:
+                cursor.execute("SET NOCOUNT ON;" + sql_query)
+            else:
+                cursor.execute(sql_query)
+            if resp_number is not None:
+                for cursor_number in range(resp_number - 1):
+                    cursor.nextset()
+                try:
+                    data_list.append(self._parse_df(parse_, cursor.fetchall(), cursor.description))
+                except ValueError:
+                    raise ValueError('Please indicate a valid resp_number')
+            else:
+                aux_cursor = True
+                count = 0
+                while aux_cursor is not False and count < 100:
+                    try:
+                        data_list.append(self._parse_df(parse_, cursor.fetchall(), cursor.description))
+                        aux_cursor = cursor.nextset()
+                    except Exception as e:
+                        print(e)
+                        cursor.nextset()
+                    finally:
+                        count += 1
+                if count >= 100:
+                    raise RuntimeError("Method sp_results has loop over 100 times for database '%s' on server '%s'"
+                                       % (self.db_name, self.server))
+            self.con_write.commit()
+        except ValueError:
+            print(traceback.format_exc())
+        finally:
+            if cursor:
+                cursor.close()
+
+        return data_list
+
+    def run_statement(self, sql_statement, commit_as_transaction=True):
+        """ Execute SQL statement
+        -----------------------------
+        query_str = "DELETE FROM %s WHERE Id > 100" & table
+        con_.run_statement(query_str)
+        -----------------------------
+        :param sql_statement: Statement as string to be run in SQL
+        :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+        :return: Statement result
+        """
+        if self.commit_as_transaction != commit_as_transaction:
+            self.commit_as_transaction = commit_as_transaction
+            if not commit_as_transaction:
+                self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+            else:
+                self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
+
+        if self.con_write is None:
+            self.con_write = self.engine_write.connect().connection
+
+        cursor = self.con_write.cursor()
+        # Execute SQL statement
+        try:
+            cursor.execute(sql_statement)
+            self.con_write.commit()
+        except Exception:
+            raise Exception(traceback.format_exc())
+        finally:
+            if cursor:
+                cursor.close()
+
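Note: a sketch of the two execution paths added above, following the docstring examples (procedure and table names here are hypothetical):

    # Fetch only the first result set of a stored procedure
    results = con_.sp_results('EXECUTE [dbo].[usp_GetOrders]', resp_number=1)

    # Run a plain statement, optionally with autocommit instead of a transaction
    con_.run_statement('DELETE FROM [dbo].[Orders] WHERE Id > 100',
                       commit_as_transaction=False)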
+    def insert(self, data, schema, table, truncate=False, delete=False, identity=False, chunk=1000, print_sql=False,
+               commit_all_together=False, output=None, bools2bits=True, nullable=False, infer_datetime_format=None,
+               commit_as_transaction=True):
+        """ Insert data in a table in SQL truncating the table if needed
+        -----------------------------
+        df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1, 2]})
+        con_.insert(df, table_schema, table_name)
+        -----------------------------
+        :param data: DataFrame containing the data to upload
+        :param schema: Schema of the table in which the data will be uploaded
+        :param table: Table in which the data will be uploaded
+        :param truncate: Indicate whether the table has to be truncated before the data is sent or not
+        :param delete: Delete the rows from a table (Suitable for tables that cannot be truncated because of
+                       external constraints)
+        :param identity: Indicate whether the identity columns will be inserted or not
+        :param chunk: Indicate how many rows will be uploaded at once
+        :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
+        :param commit_all_together: when it is true, it only commits data if all data has been inserted. When it is
+                                    false, it commits data by chunks.
+        :param output: Outputs the columns indicated in this list
+        :param bools2bits: Indicate whether the Boolean columns should be converted to BIT to be inserted into SQL
+        :param nullable: Used within bools2bits function to indicate which boolean column values to convert
+        :param infer_datetime_format: Indicate whether the datetime columns should be converted to string and if so,
+                                      then the format to be used
+        :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+        :return: A DataFrame with the output columns requested if output is not None, else None
+        """
+        if output is None:
+            output = []
+        if data is None:
+            # no data to upload
+            return ValueError("The data provided is invalid!")
+        cursor = None
+        results = pd.DataFrame(columns=output)
+
+        if self.commit_as_transaction != commit_as_transaction:
+            self.commit_as_transaction = commit_as_transaction
+            if not commit_as_transaction:
+                self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+            else:
+                self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
+
+        if self.con_write is None:
+            self.con_write = self.engine_write.connect().connection
+
+        # Mapping the date datatype columns for SQL
+        data = self.date_mapping_data_types(data)
+
+        # Infer datetime format if provided
+        if infer_datetime_format is not None:
+            data = self.infer_datetime(data, infer_datetime_format)
+
+        # Mapping the boolean columns to bit
+        if bools2bits:
+            data = self.boolean_mapping_data_types(data, nullable)
+
+        try:
+            cursor = self.con_write.cursor()
+            # Truncate table if needed
+            if truncate:
+                cursor.execute("TRUNCATE TABLE [%s].[%s]" % (schema, table))
+            # Delete all records from the table if needed
+            if delete:
+                cursor.execute("DELETE FROM [%s].[%s]" % (schema, table))
+            # Allow to insert to an Identity column
+            if identity:
+                cursor.execute("SET IDENTITY_INSERT [%s].[%s] ON" % (schema, table))
+            # Convert category columns to string
+            cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
+            data[cat_cols] = data[cat_cols].astype(str)
+            # Deal with bull values and apostrophes (')
+            data = data.replace("'NULL'", "NULL")
+            data = data.replace("'", "~~", regex=True).infer_objects(copy=False)
+            data = data.fillna("null")
+            # Insert data into the table destination
+            records = [tuple(x) for x in data.values]
+            insert_ = """INSERT INTO [%s].[%s] """ % (schema, table)
+            insert_ += str(tuple(data.columns.values)).replace(
+                "(\'", "([").replace('\', \'', '], [').replace('\')', '])')
+            if len(output) > 0:
+                insert_ += " OUTPUT Inserted.[" + "], Inserted.[".join(output) + "] "
+            insert_ += """ VALUES """
+
+            for batch in self._chunker(records, chunk):
+                rows = str(batch).strip('[]').replace("~~", "''")
+                rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
+                string = insert_ + rows
+                string = self.convert_decimal_str(string)
+                if print_sql:
+                    print(string)
+                cursor.execute(string)
+                if len(output) > 0:
+                    results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
+                if ~commit_all_together:
+                    self.con_write.commit()
+            if commit_all_together:
+                self.con_write.commit()
+
+            # Restrict to insert to an Identity column
+            if identity:
+                cursor.execute("SET IDENTITY_INSERT [%s].[%s] OFF" % (schema, table))
+
+            if len(output) > 0:
+                return results.reset_index(drop=True)
+
+        except Exception:
+            raise Exception(traceback.format_exc())
+
+        finally:
+            if cursor:
+                cursor.close()
+
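Note: following the docstring example, a sketch of insert with an OUTPUT clause (schema, table and column names are hypothetical):

    df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1, 2]})
    # Returns a DataFrame with the Id generated for each inserted row
    ids = con_.insert(df, 'dbo', 'MyTable', truncate=True, chunk=1000, output=['Id'])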
+    def insert_at_once(self, data, schema, table, truncate=False, delete=False, identity=False, chunk=1,
+                       print_sql=False, output=None, bools2bits=True, nullable=False, commit_as_transaction=True):
+        """ Build all the insert statements and commit them all at once
+        -----------------------------
+        df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1, 2]})
+        con_.insert(df, table_schema, table_name)
+        -----------------------------
+        :param data: DataFrame containing the data to upload
+        :param schema: Schema of the table in which the data will be uploaded
+        :param table: Table in which the data will be uploaded
+        :param truncate: Indicate whether the table has to be truncated before the data is sent or not
+        :param delete: Delete the rows from a table (Suitable for tables that cannot be truncated because of
+                       external constraints)
+        :param identity: Indicate whether the identity columns will be inserted or not
+        :param chunk: Indicate how many rows will be uploaded at once
+        :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
+        :param output: Outputs the columns indicated in this list
+        :param bools2bits: Indicate whether the Boolean columns should be converted to BIT to be inserted into SQL
+        :param nullable: Used within bools2bits function to indicate which boolean column values to convert
+        :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+        :return: A DataFrame with the output columns requested if output is not None, else None
+        """
+        if output is None:
+            output = []
+        if data is None:
+            # no data to upload
+            return ValueError("The data provided is invalid!")
+        cursor = None
+        results = pd.DataFrame(columns=output)
+
+        if self.commit_as_transaction != commit_as_transaction:
+            self.commit_as_transaction = commit_as_transaction
+            if not commit_as_transaction:
+                self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+            else:
+                self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
+
+        if self.con_write is None:
+            self.con_write = self.engine_write.connect().connection
+
+        # Mapping the date datatype columns for SQL
+        data = self.date_mapping_data_types(data)
+
+        # Mapping the boolean columns to bit
+        if bools2bits:
+            data = self.boolean_mapping_data_types(data, nullable)
+
+        try:
+            cursor = self.con_write.cursor()
+            # Truncate table if needed
+            if truncate:
+                cursor.execute("TRUNCATE TABLE [%s].[%s]" % (schema, table))
+            # Delete all records from the table if needed
+            if delete:
+                cursor.execute("DELETE FROM [%s].[%s]" % (schema, table))
+            # Allow to insert to an Identity column
+            if identity:
+                cursor.execute("SET IDENTITY_INSERT [%s].[%s] ON" % (schema, table))
+            # Convert category columns to string
+            cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
+            data[cat_cols] = data[cat_cols].astype(str)
+            # Deal with bull values and apostrophes (')
+            data = data.replace("'NULL'", "NULL")
+            data = data.replace("'", "~~", regex=True).infer_objects(copy=False)
+            data = data.fillna("null")
+            # Insert data into the table destination
+            records = [tuple(x) for x in data.values]
+            insert_ = """INSERT INTO [%s].[%s] """ % (schema, table)
+            insert_ += str(tuple(data.columns.values)).replace(
+                "(\'", "([").replace('\', \'', '], [').replace('\')', '])')
+            if len(output) > 0:
+                insert_ += " OUTPUT Inserted.[" + "], Inserted.[".join(output) + "] "
+            insert_ += """ VALUES """
+
+            insert_statements = list()
+            for batch in self._chunker(records, chunk):
+                rows = str(batch).strip('[]').replace("~~", "''")
+                rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
+                string = insert_ + rows
+                string = self.convert_decimal_str(string)
+                insert_statements.append(string)
+
+            if print_sql:
+                print(';'.join(insert_statements))
+            cursor.execute(';'.join(insert_statements))
+            if len(output) > 0:
+                results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
+            self.con_write.commit()
+
+            # Restrict to insert to an Identity column
+            if identity:
+                cursor.execute("SET IDENTITY_INSERT [%s].[%s] OFF" % (schema, table))
+
+            if len(output) > 0:
+                return results.reset_index(drop=True)
+
+        except Exception:
+            raise Exception(traceback.format_exc())
+
+        finally:
+            if cursor:
+                cursor.close()
+
+    def update(self, data, update_list, on_list, schema, table, bool_cols=None, print_sql=False, batch_size=100,
+               output=None, nullable=True, commit_as_transaction=True):
+        """ This method updates a table in batches in sql server.
+        -----------------------------
+        UPDATE [SCHEMA].[TABLE]
+        SET update_list[0] = data[index, update_list{0}],
+            update_list[1] = data[index, update_list[1]]
+        OUTPUT output[0], output[1]
+        WHERE on_list[0] = data[index, on_list[0]]
+            AND on_list[1] = data[index, on_list[1]]
+        -----------------------------
+        :param data: DataFrame containing the data to update
+        :param update_list: list of columns to update
+        :param on_list: list of columns to apply the on clause
+        :param schema: Schema of the table in which the data will be uploaded
+        :param table: Table in which the data will be uploaded
+        :param bool_cols: list of columns gathering boolean types
+        :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
+        :param bool_cols: columns to include as booleans
+        :param batch_size: Number of records to update in each iteration
+        :param output: Outputs the columns indicated in this list as a DataFrame. It should indicate if the column to
+                       retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one
+                       will be retrieved)
+        :param nullable: Indicate whether to update the table column with null or exclude the reference from the update
+        :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+        :return: None
+        """
+        cursor = None
+        if data is None:
+            # no data to update
+            return ValueError("The data provided is invalid!")
+
+        if output is None:
+            output = []
+        else:
+            output = [out if 'inserted' in out.lower() or 'deleted' in out.lower() else 'Deleted.[' + out + ']' for out
+                      in output]
+        results = pd.DataFrame(columns=output)
+
+        # re-starting indexes
+        data.reset_index(drop=True, inplace=True)
+
+        # Mapping boolean columns
+        if bool_cols is not None:
+            for col in bool_cols:
+                data[col] = data[col].astype(bool)
+
+        # Mapping date type for SQL
+        data = self.date_mapping_data_types(data)
+
+        # create connection
+        if self.commit_as_transaction != commit_as_transaction:
+            self.commit_as_transaction = commit_as_transaction
+            if not commit_as_transaction:
+                self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+            else:
+                self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
+
+        if self.con_write is None:
+            self.con_write = self.engine_write.connect().connection
+
+        try:
+            # initialise cursor
+            cursor = self.con_write.cursor()
+
+            # extraction of the useful columns
+            data_update = data[list(set(update_list + on_list))]
+
+            # initialisation of the sql statement
+            sql_start = ''' UPDATE [%s].[%s] SET ''' % (schema, table)
+            iter_batch = math.ceil(data_update.shape[0] / batch_size)
+            for batch in range(iter_batch):
+                batch_update = data_update.iloc[batch * batch_size: (batch + 1) * batch_size]
+
+                sql_statement = ''
+                for iindex in batch_update.index:
+                    # UPDATE [SCHEMA].[TABLE]
+                    sql_statement += sql_start
+
+                    # VALUES
+                    for col in update_list:
+                        if nullable:
+                            if pd.isna(batch_update.loc[iindex, col]):
+                                sql_statement += " [%s] = NULL ," % col
+                            elif isinstance(batch_update.loc[iindex, col], bool):
+                                sql_statement += " [%s] = %s ," % (col, int(batch_update.loc[iindex, col]))
+                            elif isinstance(batch_update.loc[iindex, col], Number):
+                                sql_statement += " [%s] = %s ," % (col, batch_update.loc[iindex, col])
+                            else:
+                                sql_statement += " [%s] = '%s' ," % (col, batch_update.loc[iindex, col])
+                        else:
+                            if pd.notna(batch_update.loc[iindex, col]):
+                                if str(batch_update.loc[iindex, col]).upper() == 'NULL':
+                                    continue
+                                elif isinstance(batch_update.loc[iindex, col], bool):
+                                    sql_statement += " [%s] = %s ," % (col, int(batch_update.loc[iindex, col]))
+                                elif isinstance(batch_update.loc[iindex, col], Number):
+                                    sql_statement += " [%s] = %s ," % (col, batch_update.loc[iindex, col])
+                                else:
+                                    sql_statement += " [%s] = '%s' ," % (col, batch_update.loc[iindex, col])
+
+                    # OUTPUT
+                    if len(output) > 0:
+                        sql_statement = sql_statement[:-1] + " OUTPUT " + ",".join(output) + ' '
+
+                    # WHERE
+                    sql_statement = sql_statement[:-1] + ' WHERE '
+                    for col in on_list:
+                        if pd.isna(batch_update.loc[iindex, col]):
+                            sql_statement += " [%s] = NULL AND" % col
+                        elif isinstance(batch_update.loc[iindex, col], bool):
+                            sql_statement += " [%s] = %s ," % (col, int(batch_update.loc[iindex, col]))
+                        elif isinstance(batch_update.loc[iindex, col], Number):
+                            sql_statement += " [%s] = %s AND" % (col, batch_update.loc[iindex, col])
+                        else:
+                            sql_statement += " [%s] = '%s' AND" % (col, batch_update.loc[iindex, col])
+
+                    # Addition of semicolon
+                    sql_statement = sql_statement[:-3] + ';'
+
+                if print_sql:
+                    print(sql_statement)
+
+                # executing statement
+                if len(sql_statement) > 0:
+                    if len(output) > 0:
+                        cursor.execute(sql_statement)
+                        for cursor_number in range(len(sql_statement.split(';')) - 1):
+                            results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
+                            cursor.nextset()
+                    else:
+                        cursor.execute(sql_statement)
+                self.con_write.commit()
+
+            if len(output) > 0:
+                return results.reset_index(drop=True)
+
+        except Exception:
+            raise Exception(traceback.format_exc())
+
+        finally:
+            if cursor:
+                cursor.close()
+
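Note: update emits one UPDATE statement per row and executes them batch_size at a time. A sketch with hypothetical names:

    df = pd.DataFrame({'Id': [1, 2], 'Status': ['Open', 'Closed']})
    con_.update(df, update_list=['Status'], on_list=['Id'],
                schema='dbo', table='Orders', batch_size=100)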
+    def bulk_update(self, data, update_list, on_list, schema, table, bool_cols=None, print_sql=False, output=None,
+                    chunk=1000, commit_as_transaction=True):
+        """ This method updates a table in batches in sql server.
+        -----------------------------
+        UPDATE [SCHEMA].[TABLE]
+        SET update_list[0] = data[index, update_list{0}],
+            update_list[1] = data[index, update_list[1]]
+        OUTPUT output[0], output[1]
+        WHERE on_list[0] = data[index, on_list[0]]
+            AND on_list[1] = data[index, on_list[1]]
+        -----------------------------
+        :param data: DataFrame containing the data to update
+        :param update_list: list of columns to update
+        :param on_list: list of columns to apply the on clause
+        :param schema: Schema of the table in which the data will be uploaded
+        :param table: Table in which the data will be uploaded
+        :param bool_cols: list of columns gathering boolean types
+        :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
+        :param bool_cols: columns to include as booleans
+        :param output: Outputs the columns indicated in this list as a DataFrame. It should indicate if the column to
+                       retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one
+                       will be retrieved)
+        :param chunk: Indicate how many rows will be uploaded at once
+        :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+        :return: None
+        """
+        cursor = None
+        if data is None:
+            # no data to update
+            return ValueError("The data provided is invalid!")
+
+        if output is None:
+            output = []
+            sql_output = []
+        else:
+            sql_output = [out if 'inserted' in out.lower() or 'deleted' in out.lower() else 'Deleted.[' + out + ']' for
+                          out
+                          in output]
+        results = pd.DataFrame(columns=output)
+
+        # re-starting indexes
+        data.reset_index(drop=True, inplace=True)
+
+        # Mapping boolean columns
+        if bool_cols is not None:
+            for col in bool_cols:
+                data[col] = data[col].astype(bool)
+
+        # Mapping date type for SQL
+        data = data[on_list + update_list]
+        data = self.date_mapping_data_types(data)
+
+        # create connection
+        if self.commit_as_transaction != commit_as_transaction:
+            self.commit_as_transaction = commit_as_transaction
+            if not commit_as_transaction:
+                self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+            else:
+                self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
+
+        if self.con_write is None:
+            self.con_write = self.engine_write.connect().connection
+
+        try:
+            # initialise cursor
+            cursor = self.con_write.cursor()
+
+            # Convert category columns to string
+            cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
+            data[cat_cols] = data[cat_cols].astype(str)
+            # Deal with bull values and apostrophes (')
+            data = data.replace("'NULL'", "NULL")
+            data = data.replace("'", "~~", regex=True).infer_objects(copy=False)
+            data = data.fillna("null")
+
+            records = [tuple(x) for x in data.values]
+            temp_table = f'#Temp{schema}{table}'
+
+            for batch in self._chunker(records, chunk):
+                batch_records = [tuple(x) for x in batch]
+                # initialisation of the sql statement
+                insert_ = f'DROP TABLE IF EXISTS {temp_table} '
+                insert_ += f"SELECT * INTO {temp_table} FROM ( VALUES "
+                temp_columns = str(tuple(data.columns.values)).replace("(\'", "([").replace(
+                    '\', \'', '], [').replace('\')', '])')
+                rows = str(batch_records).strip('[]').replace("~~", "''")
+                rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
+                sql_statement = insert_ + rows
+                sql_statement = self.convert_decimal_str(sql_statement)
+                sql_statement += f') AS TempTable {temp_columns}'
+
+                col_update_set = ''
+                for col in update_list:
+                    col_update_set += f' target.{col} = source.{col},'
+                col_update_set = col_update_set[:-1]
+
+                col_difference_check = ''
+                for col in update_list:
+                    col_difference_check += f' target.{col} <> source.{col} OR'
+                col_difference_check = col_difference_check[:-2]
+
+                col_join_on = ''
+                for col in on_list:
+                    col_join_on += f' source.{col} = target.{col} AND'
+                col_join_on = col_join_on[:-3]
+
+                sql_statement += f'UPDATE target SET {col_update_set} '
+
+                if len(output) > 0:
+                    sql_statement += f" OUTPUT {','.join(sql_output)} "
+
+                sql_statement += f'''FROM {schema}.{table} target
+                                     JOIN {temp_table} as source
+                                     ON {col_join_on}
+                                     WHERE {col_difference_check}
+                                  '''
+
+                sql_statement += f' DROP TABLE IF EXISTS {temp_table} '
+
+                if print_sql:
+                    print(sql_statement)
+
+                # executing statement
+                if len(sql_statement) > 0:
+                    if len(output) > 0:
+                        cursor.execute(sql_statement)
+                        cursor.nextset()
+                        results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
+                    else:
+                        cursor.execute(sql_statement)
+
+                self.con_write.commit()
+
+            if len(output) > 0:
+                return results.reset_index(drop=True)
+
+        except Exception:
+            raise Exception(traceback.format_exc())
+
+        finally:
+            if cursor:
+                cursor.close()
+
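Note: bulk_update takes a set-based route instead of row-by-row statements: each chunk is bulk-inserted into a session temp table and a single UPDATE ... JOIN is run against it, skipping rows where no updated column differs. A sketch with hypothetical names; with output set, the Deleted (pre-update) values are returned by default:

    df = pd.DataFrame({'Id': [1, 2], 'Status': ['Open', 'Closed']})
    changed = con_.bulk_update(df, update_list=['Status'], on_list=['Id'],
                               schema='dbo', table='Orders', chunk=1000,
                               output=['Status'])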
|
|
1896
|
+
def merge(self, data, staging_schema, staging_table, sp_schema, sp_name, truncate=False, chunk=1000,
|
|
1897
|
+
commit_as_transaction=True):
|
|
1898
|
+
""" Merge data from Staging table using a Stored Procedure. It requires a table in SQL which will store the
|
|
1899
|
+
Staging data. The method will work as follows:
|
|
1900
|
+
1.- Truncate the staging table according to the truncate parameter
|
|
1901
|
+
2.- Insert the data into the staging table
|
|
1902
|
+
3.- Execute a stored procedure to merge the staging table with the destination table
|
|
1903
|
+
-----------------------------
|
|
1904
|
+
df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1, 2]})
|
|
1905
|
+
con_.merge(df, staging_schema, staging_table, sp_schema, sp_name, truncate=True)
|
|
1906
|
+
-----------------------------
|
|
1907
|
+
:param data: DataFrame to insert in the staging table
|
|
1908
|
+
:param staging_schema: Staging table schema
|
|
1909
|
+
:param staging_table: Staging table name
|
|
1910
|
+
:param sp_schema: Stored Procedure schema
|
|
1911
|
+
:param sp_name: Stored Procedure name
|
|
1912
|
+
:param truncate: Indicate whether the staging table has to be truncated or not
|
|
1913
|
+
:param chunk: Indicate how many rows will be uploaded at once
|
|
1914
|
+
:param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
|
|
1915
|
+
:return: None
|
|
1916
|
+
"""
|
|
1917
|
+
if data is None:
|
|
1918
|
+
# no data to upload
|
|
1919
|
+
return ValueError("The data provided is invalid!")
|
|
1920
|
+
cursor = None
|
|
1921
|
+
|
|
1922
|
+
if self.commit_as_transaction != commit_as_transaction:
|
|
1923
|
+
self.commit_as_transaction = commit_as_transaction
|
|
1924
|
+
if not commit_as_transaction:
|
|
1925
|
+
self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
|
|
1926
|
+
else:
|
|
1927
|
+
self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
|
|
1928
|
+
|
|
1929
|
+
if self.con_write is None:
|
|
1930
|
+
self.con_write = self.engine_write.connect().connection
|
|
1931
|
+
|
|
1932
|
+
try:
|
|
1933
|
+
cursor = self.con_write.cursor()
|
|
1934
|
+
# Truncate Staging table if needed
|
|
1935
|
+
if truncate:
|
|
1936
|
+
trunc_insert = """TRUNCATE TABLE [%s].[%s]""" % (staging_schema, staging_table)
|
|
1937
|
+
cursor.execute(trunc_insert)
|
|
1938
|
+
self.con_write.commit()
|
|
1939
|
+
# Convert category columns to string
|
|
1940
|
+
cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
|
|
1941
|
+
data[cat_cols] = data[cat_cols].astype(str)
|
|
1942
|
+
# Deal with null values and apostrophes (')
|
|
1943
|
+
data = data.replace("'NULL'", "NULL")
|
|
1944
|
+
data = data.replace("'", "~~", regex=True).infer_objects(copy=False)
|
|
1945
|
+
data = data.fillna("null")
|
|
1946
|
+
# Insert in Staging Table
|
|
1947
|
+
records = [tuple(x) for x in data.values]
|
|
1948
|
+
insert_ = """INSERT INTO [%s].[%s] """ % (staging_schema, staging_table)
|
|
1949
|
+
insert_ = insert_ + str(tuple(data.columns.values)).replace("\'", "") + """ VALUES """
|
|
1950
|
+
for batch in self._chunker(records, chunk):
|
|
1951
|
+
rows = str(batch).strip('[]').replace("~~", "''")
|
|
1952
|
+
rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
|
|
1953
|
+
string = insert_ + rows
|
|
1954
|
+
string = self.convert_decimal_str(string)
|
|
1955
|
+
cursor.execute(string)
|
|
1956
|
+
self.con_write.commit()
|
|
1957
|
+
# Execute Stored Procedure
|
|
1958
|
+
exec_sp = """EXECUTE [%s].[%s]""" % (sp_schema, sp_name)
|
|
1959
|
+
cursor.execute(exec_sp)
|
|
1960
|
+
self.con_write.commit()
|
|
1961
|
+
except Exception:
|
|
1962
|
+
raise Exception(traceback.format_exc())
|
|
1963
|
+
finally:
|
|
1964
|
+
if cursor:
|
|
1965
|
+
cursor.close()
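The stored procedure itself is not part of this package: it lives in the target database and holds the actual MERGE logic. A minimal sketch of the contract merge() assumes, with hypothetical names ([stg].[Sales] as the staging table, [dbo].[Sales] as the destination, [etl].[usp_MergeSales] as the procedure):

    # Hypothetical one-off definition created on the server beforehand;
    # merge() only fills [stg].[Sales] and then runs EXECUTE [etl].[usp_MergeSales].
    sp_definition = """
    CREATE PROCEDURE [etl].[usp_MergeSales] AS
    BEGIN
        MERGE INTO [dbo].[Sales] AS target
        USING [stg].[Sales] AS source
            ON target.col1 = source.col1
        WHEN MATCHED THEN
            UPDATE SET target.col2 = source.col2
        WHEN NOT MATCHED BY TARGET THEN
            INSERT (col1, col2) VALUES (source.col1, source.col2);
    END
    """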
+
+    def merge_into(self, data, schema, table, on_list, update_check=False, update_set=None, bool_cols=None,
+                   identity=False, print_sql=False, nullable=False):
+        """
+        This method is the equivalent of a T-SQL 'MERGE INTO'. Schema and table define the Target, while data is the
+        Source. Please refer to the schema below for clarification on how the arguments are used.
+        Aspects to take into consideration:
+        1.- This method will not work properly if data contains duplicates. Duplicates in the target are not
+        relevant because DISTINCT is used when reading the table.
+        2.- Pay attention when the dataset contains booleans, because pandas reads SQL Server bools as
+        [True, False] instead of [0, 1], and the method needs boolean data to be inserted as [0, 1].
+        3.- A similar problem arises with datetime columns. time_format is a dict whose keys are the names of the
+        date columns and whose values are the formats those columns have to follow.
+        -------------------------
+        MERGE INTO [SCHEMA].[TABLE] AS TARGET
+        USING (
+            data
+        ) AS SOURCE
+        ON TARGET.on_list[0] = SOURCE.on_list[0]
+        AND TARGET.on_list[1] = SOURCE.on_list[1]
+        ...
+        AND TARGET.on_list[n] = SOURCE.on_list[n]
+        WHEN MATCHED AND (
+            TARGET.update_check[0] <> SOURCE.update_check[0]
+            OR TARGET.update_check[1] <> SOURCE.update_check[1]
+            ...
+            OR TARGET.update_check[n] <> SOURCE.update_check[n]
+        )
+        UPDATE SET TARGET.update_check[0] = SOURCE.update_check[0],
+                   ...
+                   TARGET.update_check[n] = SOURCE.update_check[n],
+                   TARGET.update_set[0] = SOURCE.update_set[0],
+                   TARGET.update_set[1] = SOURCE.update_set[1],
+                   ...
+                   TARGET.update_set[n] = SOURCE.update_set[n]
+        WHEN NOT MATCHED BY TARGET THEN
+            INSERT
+            (
+                all columns from [SCHEMA].[TABLE]
+            )
+            VALUES
+            (all columns from data)
+        -------------------------------
+        :param data: DataFrame containing the data to upload/update
+        :param schema: Schema of the table in which the data will be uploaded
+        :param table: Table in which the data will be uploaded
+        :param on_list: list of columns to apply the ON clause
+        :param update_check: list of columns to check for differences
+        :param update_set: list of columns to update
+        :param bool_cols: list of columns gathering boolean types
+        :param identity: Indicate whether the identity columns will be inserted or not; only makes sense when the
+        table defines one. It's a boolean.
+        :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
+        :param nullable: Used by boolean_mapping_data_types to indicate which boolean column values to convert
+        :return: None
+        """
+        if data is None:
+            # no data to upload
+            raise ValueError("The data provided is invalid!")
+
+        if data.shape[0] != data.drop_duplicates().shape[0]:
+            raise TypeError("There are duplicate values in your dataframe; the merge will not work properly "
+                            "because it relies on pd.concat().drop_duplicates()")
+
+        # if update_set has values assigned, update_check has to have values assigned
+        if update_set is not None:
+            if update_check is None:
+                raise ValueError("Please assign values to update_check in order to use update_set")
+        else:
+            update_set = update_check
+
+        # Mapping boolean columns
+        if bool_cols is not None:
+            for col in bool_cols:
+                data[col] = data[col].astype(bool)
+
+        # Mapping date and boolean types for SQL
+        data = self.date_mapping_data_types(data)
+        data = self.boolean_mapping_data_types(data, nullable)
+
+        try:
+            # call the table from the server
+            data_table = self.query("""SELECT DISTINCT * FROM [%s].[%s]""" % (schema, table))
+
+            if data_table.shape[0] == 0:
+                print("The destination table is empty so all the data will be inserted")
+                self.insert(data, schema, table)
+
+            else:
+                for data_col in data.columns:
+                    if ("int" in str(type(data_table[data_col].iloc[0]))) & (
+                            data_table[data_col].isnull().sum() > 0):
+                        data_table[data_col] = data_table[data_col].astype(float)
+                    else:
+                        data_table[data_col] = data_table[data_col].astype(type(data[data_col].iloc[0]))
+
+                coincidence = pd.DataFrame()
+                if data_table.shape[0] > 0:
+                    for col in data_table.columns.values.tolist():
+                        if isinstance(data_table.loc[0, col], bool):
+                            data_table[col] = data_table[col].apply(
+                                lambda x: 1 if x is True else 0 if x is False else np.nan)
+                    if bool_cols is not None:
+                        for col in bool_cols:
+                            data_table[col] = data_table[col].astype(bool)
+                    # join the input table with the one in the database
+                    coincidence = data.merge(data_table[on_list], how='inner', on=on_list)
+                    # WHEN MATCHED AND ... UPDATE SET
+                    if update_check:
+                        coincidence2 = coincidence.merge(data_table[list(set(on_list + update_check))],
+                                                         how='inner',
+                                                         on=list(set(on_list + update_check)))
+                        data_update = pd.concat([coincidence, coincidence2], ignore_index=True)
+                        data_update.drop_duplicates(keep=False, inplace=True)
+                        if data_update.shape[0] > 0:
+                            self.update(data_update, list(set(update_set + update_check)), on_list, schema, table,
+                                        print_sql=print_sql)
+
+                # WHEN NOT MATCHED BY TARGET THEN... INSERT
+                data_insert = pd.concat([data, coincidence], ignore_index=True)
+                data_insert.drop_duplicates(keep=False, inplace=True)
+                if data_insert.shape[0] > 0:
+                    self.insert(data_insert, schema, table, identity=identity, print_sql=print_sql)
+
+        except Exception:
+            raise Exception(traceback.format_exc())
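A minimal usage sketch, assuming a SQLConnection instance `con_` and a hypothetical table [dbo].[Prices] keyed on ProductId and PriceDate:

    import pandas as pd

    prices = pd.DataFrame({'ProductId': [1, 2],
                           'PriceDate': ['2024-01-01', '2024-01-01'],
                           'Price': [9.99, 4.50]})
    # Keys that already exist in the target with a different Price are routed
    # to update(); keys missing from the target are routed to insert().
    con_.merge_into(prices, 'dbo', 'Prices',
                    on_list=['ProductId', 'PriceDate'],
                    update_check=['Price'])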
+
+    @staticmethod
+    def _parse_df(parse_, data, col_names):
+        """ Auxiliary function to convert a list to a DataFrame
+        :param parse_: Parameter to indicate whether the data has to be transformed into a DataFrame or not
+        :param data: List gathering the data retrieved from SQL
+        :param col_names: List of columns to create the DataFrame
+        :return: Formatted data
+        """
+        if parse_ is True:
+            col_names = list(zip(*list(col_names)))[0]
+            res = pd.DataFrame(list(zip(*data)), index=col_names).T
+        else:
+            res = [col_names, data]
+        return res
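For illustration (col_names mimics a pyodbc cursor.description, a sequence of (name, type, ...) tuples; the values here are made up):

    rows = [('a', 1), ('b', 2)]            # two records fetched from SQL
    desc = [('col1', str), ('col2', int)]  # simplified cursor.description
    print(SQLConnection._parse_df(True, rows, desc))
    #   col1 col2
    # 0    a    1
    # 1    b    2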
+
+    @staticmethod
+    def _chunker(seq, size):
+        """ Split the data set in chunks to be sent to SQL
+        :param seq: Sequence of records to be split
+        :param size: Size of the chunks to split the data
+        :return: A generator yielding the records in chunks of the given size
+        """
+        return (seq[pos:pos + size] for pos in range(0, len(seq), size))
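For instance, a quick sketch of the chunking behaviour (the generator keeps the remainder as a final, shorter chunk):

    rows = list(range(5))
    print(list(SQLConnection._chunker(rows, 2)))
    # -> [[0, 1], [2, 3], [4]]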
+
+    @staticmethod
+    def date_mapping_data_types(data):
+        """
+        Map datetime variables so they can be inserted in SQL
+        :param data: DataFrame containing the variables to map
+        :return: The mapped DataFrame
+        """
+        first_index = data.index[0]
+        date_col = data.columns[
+            [('date' in str(type(data.loc[first_index, col]))) | ('timestamp' in str(type(data.loc[first_index, col])))
+             for col in data.columns]]
+        if len(date_col) > 0:
+            for col in date_col:
+                data[col] = pd.to_datetime(data[col])
+                if data[col].dtypes == 'O':
+                    data[col] = data[col].dt.strftime('%Y-%m-%d')
+                else:
+                    data[col] = data[col].dt.strftime('%Y-%m-%d %H:%M:%S')
+                data.loc[data[col] == 'NaT', col] = np.nan
+
+        return data
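A short sketch of the effect, assuming pandas and numpy are imported as in this module and the column name is hypothetical:

    df = pd.DataFrame({'LoadDate': pd.to_datetime(['2024-01-01 08:30:00', None])})
    df = SQLConnection.date_mapping_data_types(df)
    # df['LoadDate'] now holds the string '2024-01-01 08:30:00' in the first
    # row, so it renders as a valid literal inside the generated VALUES
    # clause, while the missing date ends up as NaN rather than the string 'NaT'.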
+
+    @staticmethod
+    def boolean_mapping_data_types(data, nullable=False):
+        """
+        Map boolean variables so they can be inserted in SQL
+        :param data: DataFrame containing the variables to map
+        :param nullable: Determine whether null values within boolean columns are converted to boolean format or not
+        :return: The mapped DataFrame
         """
         first_index = data.index[0]
         bool_col = data.columns[
@@ -990,7 +2164,7 @@ class SQLConnection:
     def id_next(con_db, table, schema, id_col, print_sql=False):
         """
         This static method returns the next id to be inserted into a table for sql_server
-        :param con_db: class to connect to a sql server
+        :param con_db: class to connect to a sql server database
         :param table: name of the table
         :param schema: name of the schema
         :param id_col: name of the id column