berryworld 1.0.0.196751__py3-none-any.whl → 1.0.0.197185__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ import os
  import re
  import ast
  import math
+ import time
  import pyodbc
  import traceback
  import numpy as np
@@ -9,6 +10,8 @@ import pandas as pd
  import sqlalchemy as sa
  from urllib import parse
  from numbers import Number
+ from threading import Thread
+ from sqlalchemy.pool import QueuePool
 
 
  class SQLConnection:
@@ -137,9 +140,9 @@ class SQLConnection:
  if self.multi_db & (self.server.lower() == 'prod'):
  database = str(self.db_name) + 'Primary'
 
- constring = ('mssql+pyodbc://' + self.user_name + ':%s@' + self.server_name + '/' + database +
- '?driver=' + self.driver + self.trusted_certificate + self.encrypt)
- self.engine = sa.create_engine(constring % parse.quote_plus(self.password))
+ self.con_string = ('mssql+pyodbc://' + self.user_name + ':%s@' + self.server_name + '/' + database +
+ '?driver=' + self.driver + self.trusted_certificate + self.encrypt)
+ self.engine = sa.create_engine(self.con_string % parse.quote_plus(self.password))
  if not commit_as_transaction:
  self.engine = self.engine.execution_options(isolation_level="AUTOCOMMIT")
 
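The change above stores the connection string on the instance (self.con_string) while still deferring password interpolation to engine creation, so the raw password never sits inside the stored URL. A minimal sketch of the same pattern; the credentials and server names here are invented, not from the package:

import sqlalchemy as sa
from urllib import parse

user_name, password = 'svc_user', 'p@ss/word'                # hypothetical credentials
server_name, database = 'sqlhost', 'FruitFlow'
driver = 'ODBC Driver 17 for SQL Server'.replace(' ', '+')   # URL-safe driver name

con_string = ('mssql+pyodbc://' + user_name + ':%s@' + server_name + '/' + database +
              '?driver=' + driver + '&TrustServerCertificate=yes&Encrypt=yes')
# quote_plus at the last moment so special characters in the password survive the URL
engine = sa.create_engine(con_string % parse.quote_plus(password))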
@@ -170,7 +173,7 @@ class SQLConnection:
  -----------------------------
  :param sql_query: Query to be sent to SQL
  :param coerce_float: Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal)
- to floating point.
+ to floating point.
  :return: DataFrame gathering the requested data
  """
  self.open_read_connection()
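In use, coerce_float is simply forwarded to pandas.read_sql_query. A hedged usage sketch; the connection object and table name are assumed:

# con_ is assumed to be an already-constructed SQLConnection
table = '[dbo].[Sales]'                        # hypothetical table
query_str = "SELECT TOP 10 * FROM %s" % table
df = con_.query(query_str, coerce_float=True)  # DECIMAL columns come back as float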
@@ -184,21 +187,6 @@ class SQLConnection:
  self.close_connection()
  return data
 
- @staticmethod
- def _parse_df(parse_, data, col_names):
- """ Auxiliary function to convert list to DataFrame
- :param parse_: Parameter to indicate whether the data has to be transformed into a DataFrame or not
- :param data: List gathering the data retrieved from SQL
- :param col_names: List of columns to create the DataFrame
- :return: Formatted data
- """
- if parse_ is True:
- col_names = list(zip(*list(col_names)))[0]
- res = pd.DataFrame(list(zip(*data)), index=col_names).T
- else:
- res = [col_names, data]
- return res
-
  def sp_results(self, sql_query, resp_number=None, parse_=True, commit_as_transaction=True, no_count=True):
  """ Execute a stored procedure and retrieve all its output data
  -----------------------------
@@ -288,7 +276,7 @@ class SQLConnection:
  :param table: Table in which the data will be uploaded
  :param truncate: Indicate whether the table has to be truncated before the data is sent or not
  :param delete: Delete the rows from a table (Suitable for tables that cannot be truncated because of
- external constraints)
+ external constraints)
  :param identity: Indicate whether the identity columns will be inserted or not
  :param chunk: Indicate how many rows will be uploaded at once
  :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
@@ -296,11 +284,11 @@ class SQLConnection:
  false, it commits data by chunks.
  :param output: Outputs the columns indicated in this list
  :param bools2bits: Indicate whether the Boolean columns should be converted to BIT to be inserted into SQL
- :return: A DataFrame with the output columns requested if output is not None, else None
  :param nullable: Used within bools2bits function to indicate which boolean column values to convert
  :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
  :param infer_datetime_format: Indicate whether the datetime columns should be converted to string and if so,
- then the format to be used
+ then the format to be used
+ :return: A DataFrame with the output columns requested if output is not None, else None
  """
  if output is None:
  output = []
@@ -391,7 +379,7 @@ class SQLConnection:
  :param table: Table in which the data will be uploaded
  :param truncate: Indicate whether the table has to be truncated before the data is sent or not
  :param delete: Delete the rows from a table (Suitable for tables that cannot be truncated because of
- external constraints)
+ external constraints)
  :param identity: Indicate whether the identity columns will be inserted or not
  :param chunk: Indicate how many rows will be uploaded at once
  :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
@@ -495,8 +483,8 @@ class SQLConnection:
  :param bool_cols: columns to include as booleans
  :param batch_size: Number of records to update in each iteration
  :param output: Outputs the columns indicated in this list as a DataFrame. It should indicate if the column to
- retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one will be
- retrieved)
+ retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one
+ will be retrieved)
  :param nullable: Indicate whether to update the table column with null or exclude the reference from the update
  :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
  :return: None
@@ -631,8 +619,8 @@ class SQLConnection:
  :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
  :param bool_cols: columns to include as booleans
  :param output: Outputs the columns indicated in this list as a DataFrame. It should indicate if the column to
- retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one will be
- retrieved)
+ retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one
+ will be retrieved)
  :param chunk: Indicate how many rows will be uploaded at once
  :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
  :return: None
@@ -759,7 +747,7 @@ class SQLConnection:
  con_.merge(df, staging_schema, staging_table, sp_schema, sp_name, truncate=True)
  -----------------------------
  :param data: DataFrame to insert in the staging table
- :param staging_schema: Staging table schema
+ :param staging_schema: Schema of the staging table
  :param staging_table: Staging table name
  :param sp_schema: Stored Procedure schema
  :param sp_name: Stored Procedure name
@@ -821,11 +809,6 @@ class SQLConnection:
  [True, False], instead of [0,1]. The method needs data of type boolean to be inserted as [0, 1].
  3.- When dealing with datetime columns a similar problem arises. time_format is a dict that contains as keys
  the name of a date column and as values the format that the column has to have.
- Versions comments...
- + Difference between version 1.0 and 1.01 is that the last one is a bit simpler, it waits for names of columns
- which types are booleans or datetime (and format for this one) instead of trying to figure out this columns
- as in version 1.0 what is sometimes problematic. So, version 1.01 is more reliable but requires more time
- to write the call to the method.
  -------------------------
  MERGE INTO [SCHEMA].[TABLE] AS TARGET
  USING (
@@ -864,10 +847,10 @@ class SQLConnection:
  :param update_set: list of columns to update
  :param bool_cols: list of columns gathering boolean types
  :param identity: Indicate whether the identity columns will be inserted or not, only makes sense when the table
- in its definition has it. It's a boolean.
+ in its definition has it. It's a boolean.
  :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
- :return: None
  :param nullable: Used for the boolean_mapping_data_types to indicate which boolean column values to convert
+ :return: None
  """
  if data is None:
  # no data to upload
@@ -940,6 +923,21 @@ class SQLConnection:
  except Exception:
  raise Exception(traceback.format_exc())
 
+ @staticmethod
+ def _parse_df(parse_, data, col_names):
+ """ Auxiliary function to convert list to DataFrame
+ :param parse_: Parameter to indicate whether the data has to be transformed into a DataFrame or not
+ :param data: List gathering the data retrieved from SQL
+ :param col_names: List of columns to create the DataFrame
+ :return: Formatted data
+ """
+ if parse_ is True:
+ col_names = list(zip(*list(col_names)))[0]
+ res = pd.DataFrame(list(zip(*data)), index=col_names).T
+ else:
+ res = [col_names, data]
+ return res
+
  @staticmethod
  def date_mapping_data_types(data):
  """
@@ -967,8 +965,1168 @@ class SQLConnection:
  """
  Map datetime and boolean variables so they can be inserted in SQL
  :param data: DataFrame containing the variables to map
- :return: The mapped DataFrame
  :param nullable: Determine if you want to convert null values within boolean columns to boolean format or not
+ :return: The mapped DataFrame
+ """
+ first_index = data.index[0]
+ bool_col = data.columns[
+ [('bool' in str(type(data.loc[first_index, col]))) | ('object' in str(type(data.loc[first_index, col]))) for
+ col in data.columns]]
+ if len(bool_col) > 0:
+ for col in bool_col:
+ if nullable:
+ bool_not_null = data[data[col].notna()]
+ if bool_not_null.shape[0] > 0:
+ for iindex in bool_not_null.index:
+ data.at[iindex, col] = int(data.loc[iindex, col])
+ else:
+ data[col] = data[col].apply(lambda x: 1 if x is True else 0)
+
+ return data
+
+ @staticmethod
+ def id_next(con_db, table, schema, id_col, print_sql=False):
+ """
+ This static method returns the next id to be inserted into a table for sql_server
+ :param con_db: class to connect to a sql server database
+ :param table: name of the table
+ :param schema: name of the schema
+ :param id_col: name of the id column
+ :param print_sql: bool to indicate if you want the sql statement to be printed on the Python Console
+ :return: Max ID + 1 for id_col
+ """
+ sql_statement = ("SELECT CASE WHEN MAX(%s) IS NULL THEN 1 ELSE MAX(%s) + 1 END AS [Id] FROM [%s].[%s]" % (
+ id_col, id_col, schema, table))
+ if print_sql:
+ print(sql_statement)
+ df = con_db.query(sql_statement)
+ id_ = df.loc[0, 'Id']
+ return id_
+
+ @staticmethod
+ def convert_decimal_str(string):
+ """ Method to parse the Decimal type in python
+ :param string: String variable to parse
+ :return: Parsed string
+ """
+ string = re.sub("'\)(?!(,[ ]+\())(?=([^$]))", "", string)
+ return re.sub("Decimal\('", "", string)
+
+ @staticmethod
+ def infer_datetime(data, infer_datetime_format):
+ """ Method to infer datetime columns and format them as string
+ :param data: DataFrame to parse
+ :param infer_datetime_format: format to be used for the datetime columns
+ :return: Parsed DataFrame
+ """
+ for col in data.select_dtypes(include=['datetime64']).columns:
+ data[col] = pd.to_datetime(data[col]).dt.strftime(infer_datetime_format)
+
+ return data
+
+
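A short sketch of the new helpers, assuming con_db is any connection object exposing the query interface above; the table and column names are invented:

next_id = SQLConnection.id_next(con_db, table='Orders', schema='dbo', id_col='OrderId')

# convert_decimal_str strips the Decimal('...') wrappers left behind by str(tuple(...))
raw = "INSERT INTO [dbo].[Orders] ([Amount]) VALUES (Decimal('12.50'))"
clean = SQLConnection.convert_decimal_str(raw)  # roughly: "... VALUES (12.50)"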
+ class SQLPoolEngine:
+ """ Connect a Pool Engine to a Microsoft SQL Server """
+
+ def __init__(self, db_reference, server, master=False, trusted_certificate=True, encrypt=True, multi_db=False,
+ commit_as_transaction=True, pool_size=10, max_overflow=10, pool_timeout=30, timeout=300):
+ """ Initialize the class
+ It requires the environment variable
+ SQL-DBREFERENCE-PROD = 'server_name db_name user password'
+ -----------------------------
+ db_reference = 'FruitFlow'
+ server = 'prod'
+
+ pool_ = SQLPoolEngine(db_reference, server)
+ -----------------------------
+ :param db_reference: Database reference to connect to
+ :param server: Server to connect to
+ :param master: Indicate whether the connection will be done to master or to a specific database
+ :param trusted_certificate: Indicate whether the connection will be done using the TrustServerCertificate
+ :param encrypt: Indicate whether the connection will use SSL/TLS encryption
+ :param multi_db: Indicate whether the connection will be done to a specific database or to multiple databases
+ :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+ :param pool_size: Number of connections to keep in the pool
+ :param max_overflow: Extra connections beyond pool_size
+ :param pool_timeout: Timeout for getting a connection
+ :param timeout: Connection timeout in seconds
+ """
+ self.con_string_read = None
+ self.con_string_write = None
+ self.engine_read = None
+ self.engine_write = None
+ self.con = None
+ self.commit_as_transaction = commit_as_transaction
+ self.pool_size = pool_size
+ self.max_overflow = max_overflow
+ self.pool_timeout = pool_timeout
+ self.timeout = timeout
+
+ self.db_reference = db_reference.replace("_", "") if "_" in db_reference else db_reference
+ self.server = server
+ if self.server is None:
+ raise ValueError("Please provide a value for server type")
+
+ self.multi_db = multi_db
+ self.master = master
+ if trusted_certificate:
+ self.trusted_certificate = '&TrustServerCertificate=yes'
+ else:
+ self.trusted_certificate = ''
+
+ if encrypt:
+ self.encrypt = '&Encrypt=yes'
+ else:
+ self.encrypt = ''
+
+ drivers = [driver for driver in pyodbc.drivers() if (bool(re.search(r'\d', driver)))]
+ try:
+ self.server_name, self.db_name, self.user_name, self.password = self.credentials()
+ except Exception as e:
+ raise ValueError(
+ f"Cannot find a reference to {self.db_reference} and {self.server.upper()} server: {str(e)}")
+
+ driver_attempt = ''
+ for driver in drivers:
+ try:
+ self.driver = driver
+ self.open_read_connection(commit_as_transaction=self.commit_as_transaction)
+ self.query('''SELECT TOP 1 * FROM information_schema.tables;''')
+ break
+ except Exception as e:
+ print(e)
+ driver_attempt = str(e)
+
+ if driver_attempt != '':
+ raise ValueError(
+ f"Cannot connect to db: {self.db_name} - Error: {str(driver_attempt)}")
+
+ self.create_write_engine(commit_as_transaction=self.commit_as_transaction)
+
+ def credentials(self):
+ """ Return the credentials used to connect to the SQL Server
+ :return: Tuple with the credentials used to connect to the SQL Server
+ """
+ try:
+ server_creds = os.environ.get(f"SQL-{self.db_reference.upper()}")
+ server_creds = ast.literal_eval(server_creds)
+ except Exception as e:
+ raise ValueError(f'DB reference: {self.db_reference} not found. ERROR: {e}')
+
+ try:
+ server_creds = server_creds[self.server.lower()]
+ except Exception as e:
+ raise ValueError(f'Server: {self.server} not found for DB reference: {self.db_reference}. ERROR: {e}')
+
+ if 'server_name' not in server_creds.keys():
+ raise ValueError(f"Server name not provided for {self.db_reference} on {self.server.upper()} server")
+ else:
+ server_name = server_creds['server_name']
+
+ if 'db_name' not in server_creds.keys():
+ raise ValueError(f"Database name not provided for {self.db_reference} on {self.server.upper()} server")
+ else:
+ db_name = server_creds['db_name']
+
+ if 'user_name' not in server_creds.keys():
+ raise ValueError(f"User name not provided for {self.db_reference} on {self.server.upper()} server")
+ else:
+ user_name = server_creds['user_name']
+
+ if 'pwd' not in server_creds.keys():
+ raise ValueError(f"Password not provided for {self.db_reference} on {self.server.upper()} server")
+ else:
+ password = server_creds['pwd']
+
+ return re.sub(r'(\\)\1*', r'\1', server_name), db_name, user_name, password
+
+ def create_read_engine(self, commit_as_transaction=True):
+ """ Create a reading engine
+ :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+ :return: The opened connection
+ """
+ if self.master:
+ self.con_string_read = ('mssql+pyodbc://' + self.user_name + ':%s@' + self.server_name + '/master' +
+ '?driver=' + self.driver + '&trusted_connection=yes' + self.trusted_certificate +
+ self.encrypt)
+ else:
+ self.con_string_read = ('mssql+pyodbc://' + self.user_name + ':%s@' + self.server_name + '/' +
+ self.db_name + '?driver=' + self.driver + self.trusted_certificate + self.encrypt)
+
+ self.engine_read = sa.create_engine(self.con_string_read % parse.quote_plus(self.password),
+ poolclass=QueuePool,
+ pool_size=self.pool_size, # Number of connections to keep in the pool
+ max_overflow=self.max_overflow, # Extra connections beyond pool_size
+ pool_timeout=self.pool_timeout, # Timeout for getting a connection
+ pool_recycle=self.timeout # Recycle connections after X seconds
+ )
+
+ if not commit_as_transaction:
+ self.engine_read = self.engine_read.execution_options(isolation_level="AUTOCOMMIT")
+
+ def create_write_engine(self, commit_as_transaction=True):
+ """ Create a writing engine
+ :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+ :return: The opened connection
+ """
+ database = self.db_name
+ if self.multi_db & (self.server.lower() == 'prod'):
+ database = str(self.db_name) + 'Primary'
+
+ self.con_string_write = ('mssql+pyodbc://' + self.user_name + ':%s@' + self.server_name + '/' + database +
+ '?driver=' + self.driver + self.trusted_certificate + self.encrypt)
+ self.engine_write = sa.create_engine(self.con_string_write % parse.quote_plus(self.password),
+ poolclass=QueuePool,
+ pool_size=10, # Number of connections to keep in the pool
+ max_overflow=10, # Extra connections beyond pool_size
+ pool_timeout=30, # Timeout for getting a connection
+ pool_recycle=self.timeout # Recycle connections after X seconds
+ )
+
+ if not commit_as_transaction:
+ self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+
+ def open_read_connection(self, commit_as_transaction=True):
+ """ Open a reading connection with the Server
+ :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+ :return: The opened connection
+ """
+ self.create_read_engine(commit_as_transaction=commit_as_transaction)
+ self.con = self.engine_read.connect().connection
+
+ def query(self, sql_query, coerce_float=False):
+ """ Read data from SQL according to the sql_query
+ -----------------------------
+ query_str = "SELECT * FROM %s" % table
+ con_.query(query_str)
+ -----------------------------
+ :param sql_query: Query to be sent to SQL
+ :param coerce_float: Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal)
+ to floating point.
+ :return: DataFrame gathering the requested data
+ """
+ if self.con is None:
+ self.con = self.engine_read.connect().connection
+
+ data = None
+ try:
+ with self.engine_read.begin() as conn:
+ data = pd.read_sql_query(sa.text(sql_query), conn, coerce_float=coerce_float)
+ except ValueError:
+ print(traceback.format_exc())
+ finally:
+ self.con.close()
+ return data
+
+
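Construction follows the docstring above: credentials come from an SQL-<DB_REFERENCE> environment variable holding a literal dict keyed by server. A sketch with made-up values; note the constructor also probes installed ODBC drivers and runs a test query, so it needs a reachable server:

import os

os.environ['SQL-FRUITFLOW'] = str({
    'prod': {'server_name': 'sqlhost', 'db_name': 'FruitFlow',
             'user_name': 'svc_user', 'pwd': 'secret'}    # hypothetical values
})

pool_ = SQLPoolEngine('FruitFlow', 'prod', pool_size=10, max_overflow=10,
                      pool_timeout=30, timeout=300)  # timeout also feeds pool_recycle
df = pool_.query("SELECT TOP 1 * FROM information_schema.tables;")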
+ class SQLConnectionPool:
+ """ Connect to a Microsoft SQL Server using connection pooling """
+
+ def __init__(self, pool_class):
+ """ Initialize the class
+ It requires an instance of the SQLPoolEngine to work properly
+ -----------------------------
+ con_ = SQLConnectionPool(pool_)
+ -----------------------------
+ :param pool_class: SQLPoolEngine instance providing the engines used for the connections
+ """
+ self.con_read = None
+ self.con_write = None
+ self.engine_read = pool_class.engine_read
+ self.engine_write = pool_class.engine_write
+ self.con_string_read = pool_class.con_string_read
+ self.con_string_write = pool_class.con_string_write
+ self.commit_as_transaction = pool_class.commit_as_transaction
+ self.db_name = pool_class.db_name
+ self.server = pool_class.server
+ self.timeout = pool_class.timeout
+
+ Thread(target=self.close_connection, args=(self.timeout,)).start()
+
+ def close_connection(self, timeout=0):
+ """ Close any opened connections with the Server
+ :param timeout: Seconds to wait before closing the connections
+ :return: None
+ """
+ if timeout > 0:
+ time.sleep(timeout)
+
+ if self.con_read is not None:
+ self.con_read.close()
+ if self.engine_read:
+ self.engine_read.dispose()
+
+ if self.con_write is not None:
+ self.con_write.close()
+ if self.engine_write:
+ self.engine_write.dispose()
+
+ def query(self, sql_query, coerce_float=False):
+ """ Read data from SQL according to the sql_query
+ -----------------------------
+ query_str = "SELECT * FROM %s" % table
+ con_.query(query_str)
+ -----------------------------
+ :param sql_query: Query to be sent to SQL
+ :param coerce_float: Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal)
+ to floating point.
+ :return: DataFrame gathering the requested data
+ """
+ if self.con_read is None:
+ self.con_read = self.engine_read.connect().connection
+
+ data = None
+ try:
+ with self.engine_read.begin() as conn:
+ data = pd.read_sql_query(sa.text(sql_query), conn, coerce_float=coerce_float)
+ except ValueError:
+ print(traceback.format_exc())
+ return data
+
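SQLConnectionPool borrows the engines from an existing SQLPoolEngine and starts a watchdog thread that closes them after timeout seconds. A wiring sketch, continuing the hypothetical pool_ from above:

con_ = SQLConnectionPool(pool_)        # shares pool_.engine_read / pool_.engine_write
df = con_.query("SELECT TOP 5 name FROM sys.tables")

# The Thread(target=close_connection, args=(timeout,)) started in __init__ disposes
# both engines after pool_.timeout seconds; call close_connection(0) to clean up early.
con_.close_connection(0)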
+ def sp_results(self, sql_query, resp_number=None, parse_=True, no_count=True, commit_as_transaction=True):
+ """ Execute a stored procedure and retrieve all its output data
+ -----------------------------
+ query_str = "EXECUTE %s" % stored_procedure
+ con_.sp_results(query_str, resp_number=1)
+ -----------------------------
+ :param sql_query: Query to be sent to SQL
+ :param resp_number: Indicate which of the stored procedure's responses will be retrieved
+ :param parse_: Indicate whether the output needs to be converted to a DataFrame or not
+ :param no_count: Indicate whether SET NOCOUNT option is ON (True) or OFF (False)
+ :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+ :return: DataFrame list gathering the requested data
+ """
+ if self.commit_as_transaction != commit_as_transaction:
+ self.commit_as_transaction = commit_as_transaction
+ if not commit_as_transaction:
+ self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+ else:
+ self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
+
+ if self.con_write is None:
+ self.con_write = self.engine_write.connect().connection
+
+ data_list = list()
+ cursor = None
+ try:
+ cursor = self.con_write.cursor()
+ if no_count:
+ cursor.execute("SET NOCOUNT ON;" + sql_query)
+ else:
+ cursor.execute(sql_query)
+ if resp_number is not None:
+ for cursor_number in range(resp_number - 1):
+ cursor.nextset()
+ try:
+ data_list.append(self._parse_df(parse_, cursor.fetchall(), cursor.description))
+ except ValueError:
+ raise ValueError('Please indicate a valid resp_number')
+ else:
+ aux_cursor = True
+ count = 0
+ while aux_cursor is not False and count < 100:
+ try:
+ data_list.append(self._parse_df(parse_, cursor.fetchall(), cursor.description))
+ aux_cursor = cursor.nextset()
+ except Exception as e:
+ print(e)
+ cursor.nextset()
+ finally:
+ count += 1
+ if count >= 100:
+ raise RuntimeError("Method sp_results has looped over 100 times for database '%s' on server '%s'"
+ % (self.db_name, self.server))
+ self.con_write.commit()
+ except ValueError:
+ print(traceback.format_exc())
+ finally:
+ if cursor:
+ cursor.close()
+
+ return data_list
+
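Usage sketch for the pooled sp_results; the stored procedure name is invented:

dfs = con_.sp_results("EXECUTE [dbo].[usp_DailyReport]", resp_number=2)  # second result set only

all_sets = con_.sp_results("EXECUTE [dbo].[usp_DailyReport]")  # every result set (capped at 100)
for df in all_sets:
    print(df.shape)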
+ def run_statement(self, sql_statement, commit_as_transaction=True):
+ """ Execute SQL statement
+ -----------------------------
+ query_str = "DELETE FROM %s WHERE Id > 100" % table
+ con_.run_statement(query_str)
+ -----------------------------
+ :param sql_statement: Statement as string to be run in SQL
+ :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+ :return: Statement result
+ """
+ if self.commit_as_transaction != commit_as_transaction:
+ self.commit_as_transaction = commit_as_transaction
+ if not commit_as_transaction:
+ self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+ else:
+ self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
+
+ if self.con_write is None:
+ self.con_write = self.engine_write.connect().connection
+
+ cursor = self.con_write.cursor()
+ # Execute SQL statement
+ try:
+ cursor.execute(sql_statement)
+ self.con_write.commit()
+ except Exception:
+ raise Exception(traceback.format_exc())
+ finally:
+ if cursor:
+ cursor.close()
+
+ def insert(self, data, schema, table, truncate=False, delete=False, identity=False, chunk=1000, print_sql=False,
+ commit_all_together=False, output=None, bools2bits=True, nullable=False, infer_datetime_format=None,
+ commit_as_transaction=True):
+ """ Insert data in a table in SQL truncating the table if needed
+ -----------------------------
+ df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1, 2]})
+ con_.insert(df, table_schema, table_name)
+ -----------------------------
+ :param data: DataFrame containing the data to upload
+ :param schema: Schema of the table in which the data will be uploaded
+ :param table: Table in which the data will be uploaded
+ :param truncate: Indicate whether the table has to be truncated before the data is sent or not
+ :param delete: Delete the rows from a table (Suitable for tables that cannot be truncated because of
+ external constraints)
+ :param identity: Indicate whether the identity columns will be inserted or not
+ :param chunk: Indicate how many rows will be uploaded at once
+ :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
+ :param commit_all_together: when it is true, it only commits data if all data has been inserted. When it is
+ false, it commits data by chunks.
+ :param output: Outputs the columns indicated in this list
+ :param bools2bits: Indicate whether the Boolean columns should be converted to BIT to be inserted into SQL
+ :param nullable: Used within bools2bits function to indicate which boolean column values to convert
+ :param infer_datetime_format: Indicate whether the datetime columns should be converted to string and if so,
+ then the format to be used
+ :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+ :return: A DataFrame with the output columns requested if output is not None, else None
+ """
+ if output is None:
+ output = []
+ if data is None:
+ # no data to upload
+ return ValueError("The data provided is invalid!")
+ cursor = None
+ results = pd.DataFrame(columns=output)
+
+ if self.commit_as_transaction != commit_as_transaction:
+ self.commit_as_transaction = commit_as_transaction
+ if not commit_as_transaction:
+ self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+ else:
+ self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
+
+ if self.con_write is None:
+ self.con_write = self.engine_write.connect().connection
+
+ # Mapping the date datatype columns for SQL
+ data = self.date_mapping_data_types(data)
+
+ # Infer datetime format if provided
+ if infer_datetime_format is not None:
+ data = self.infer_datetime(data, infer_datetime_format)
+
+ # Mapping the boolean columns to bit
+ if bools2bits:
+ data = self.boolean_mapping_data_types(data, nullable)
+
+ try:
+ cursor = self.con_write.cursor()
+ # Truncate table if needed
+ if truncate:
+ cursor.execute("TRUNCATE TABLE [%s].[%s]" % (schema, table))
+ # Delete all records from the table if needed
+ if delete:
+ cursor.execute("DELETE FROM [%s].[%s]" % (schema, table))
+ # Allow inserts into an Identity column
+ if identity:
+ cursor.execute("SET IDENTITY_INSERT [%s].[%s] ON" % (schema, table))
+ # Convert category columns to string
+ cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
+ data[cat_cols] = data[cat_cols].astype(str)
+ # Deal with null values and apostrophes (')
+ data = data.replace("'NULL'", "NULL")
+ data = data.replace("'", "~~", regex=True).infer_objects(copy=False)
+ data = data.fillna("null")
+ # Insert data into the destination table
+ records = [tuple(x) for x in data.values]
+ insert_ = """INSERT INTO [%s].[%s] """ % (schema, table)
+ insert_ += str(tuple(data.columns.values)).replace(
+ "(\'", "([").replace('\', \'', '], [').replace('\')', '])')
+ if len(output) > 0:
+ insert_ += " OUTPUT Inserted.[" + "], Inserted.[".join(output) + "] "
+ insert_ += """ VALUES """
+
+ for batch in self._chunker(records, chunk):
+ rows = str(batch).strip('[]').replace("~~", "''")
+ rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
+ string = insert_ + rows
+ string = self.convert_decimal_str(string)
+ if print_sql:
+ print(string)
+ cursor.execute(string)
+ if len(output) > 0:
+ results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
+ if ~commit_all_together:
+ self.con_write.commit()
+ if commit_all_together:
+ self.con_write.commit()
+
+ # Restrict inserts into an Identity column
+ if identity:
+ cursor.execute("SET IDENTITY_INSERT [%s].[%s] OFF" % (schema, table))
+
+ if len(output) > 0:
+ return results.reset_index(drop=True)
+
+ except Exception:
+ raise Exception(traceback.format_exc())
+
+ finally:
+ if cursor:
+ cursor.close()
+
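A sketch of insert with an OUTPUT clause, against a hypothetical table whose identity column is Id:

df = pd.DataFrame({'Name': ["O'Brien", 'Smith'], 'Qty': [3, None]})
ids = con_.insert(df, 'dbo', 'Customers', chunk=1000, output=['Id'])
# Apostrophes survive via the ~~ round-trip, NaN becomes SQL null, and the returned
# DataFrame carries the Inserted.[Id] values, one row per inserted record.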
+ def insert_at_once(self, data, schema, table, truncate=False, delete=False, identity=False, chunk=1,
+ print_sql=False, output=None, bools2bits=True, nullable=False, commit_as_transaction=True):
+ """ Build all the insert statements and commit them all at once
+ -----------------------------
+ df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1, 2]})
+ con_.insert_at_once(df, table_schema, table_name)
+ -----------------------------
+ :param data: DataFrame containing the data to upload
+ :param schema: Schema of the table in which the data will be uploaded
+ :param table: Table in which the data will be uploaded
+ :param truncate: Indicate whether the table has to be truncated before the data is sent or not
+ :param delete: Delete the rows from a table (Suitable for tables that cannot be truncated because of
+ external constraints)
+ :param identity: Indicate whether the identity columns will be inserted or not
+ :param chunk: Indicate how many rows will be uploaded at once
+ :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
+ :param output: Outputs the columns indicated in this list
+ :param bools2bits: Indicate whether the Boolean columns should be converted to BIT to be inserted into SQL
+ :param nullable: Used within bools2bits function to indicate which boolean column values to convert
+ :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+ :return: A DataFrame with the output columns requested if output is not None, else None
+ """
+ if output is None:
+ output = []
+ if data is None:
+ # no data to upload
+ return ValueError("The data provided is invalid!")
+ cursor = None
+ results = pd.DataFrame(columns=output)
+
+ if self.commit_as_transaction != commit_as_transaction:
+ self.commit_as_transaction = commit_as_transaction
+ if not commit_as_transaction:
+ self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+ else:
+ self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
+
+ if self.con_write is None:
+ self.con_write = self.engine_write.connect().connection
+
+ # Mapping the date datatype columns for SQL
+ data = self.date_mapping_data_types(data)
+
+ # Mapping the boolean columns to bit
+ if bools2bits:
+ data = self.boolean_mapping_data_types(data, nullable)
+
+ try:
+ cursor = self.con_write.cursor()
+ # Truncate table if needed
+ if truncate:
+ cursor.execute("TRUNCATE TABLE [%s].[%s]" % (schema, table))
+ # Delete all records from the table if needed
+ if delete:
+ cursor.execute("DELETE FROM [%s].[%s]" % (schema, table))
+ # Allow inserts into an Identity column
+ if identity:
+ cursor.execute("SET IDENTITY_INSERT [%s].[%s] ON" % (schema, table))
+ # Convert category columns to string
+ cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
+ data[cat_cols] = data[cat_cols].astype(str)
+ # Deal with null values and apostrophes (')
+ data = data.replace("'NULL'", "NULL")
+ data = data.replace("'", "~~", regex=True).infer_objects(copy=False)
+ data = data.fillna("null")
+ # Insert data into the destination table
+ records = [tuple(x) for x in data.values]
+ insert_ = """INSERT INTO [%s].[%s] """ % (schema, table)
+ insert_ += str(tuple(data.columns.values)).replace(
+ "(\'", "([").replace('\', \'', '], [').replace('\')', '])')
+ if len(output) > 0:
+ insert_ += " OUTPUT Inserted.[" + "], Inserted.[".join(output) + "] "
+ insert_ += """ VALUES """
+
+ insert_statements = list()
+ for batch in self._chunker(records, chunk):
+ rows = str(batch).strip('[]').replace("~~", "''")
+ rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
+ string = insert_ + rows
+ string = self.convert_decimal_str(string)
+ insert_statements.append(string)
+
+ if print_sql:
+ print(';'.join(insert_statements))
+ cursor.execute(';'.join(insert_statements))
+ if len(output) > 0:
+ results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
+ self.con_write.commit()
+
+ # Restrict inserts into an Identity column
+ if identity:
+ cursor.execute("SET IDENTITY_INSERT [%s].[%s] OFF" % (schema, table))
+
+ if len(output) > 0:
+ return results.reset_index(drop=True)
+
+ except Exception:
+ raise Exception(traceback.format_exc())
+
+ finally:
+ if cursor:
+ cursor.close()
+
+ def update(self, data, update_list, on_list, schema, table, bool_cols=None, print_sql=False, batch_size=100,
+ output=None, nullable=True, commit_as_transaction=True):
+ """ This method updates a table in batches in sql server.
+ -----------------------------
+ UPDATE [SCHEMA].[TABLE]
+ SET update_list[0] = data[index, update_list[0]],
+ update_list[1] = data[index, update_list[1]]
+ OUTPUT output[0], output[1]
+ WHERE on_list[0] = data[index, on_list[0]]
+ AND on_list[1] = data[index, on_list[1]]
+ -----------------------------
+ :param data: DataFrame containing the data to update
+ :param update_list: list of columns to update
+ :param on_list: list of columns to apply the on clause
+ :param schema: Schema of the table in which the data will be uploaded
+ :param table: Table in which the data will be uploaded
+ :param bool_cols: list of columns gathering boolean types
+ :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
+ :param batch_size: Number of records to update in each iteration
+ :param output: Outputs the columns indicated in this list as a DataFrame. It should indicate if the column to
+ retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one
+ will be retrieved)
+ :param nullable: Indicate whether to update the table column with null or exclude the reference from the update
+ :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+ :return: None
+ """
+ cursor = None
+ if data is None:
+ # no data to update
+ return ValueError("The data provided is invalid!")
+
+ if output is None:
+ output = []
+ else:
+ output = [out if 'inserted' in out.lower() or 'deleted' in out.lower() else 'Deleted.[' + out + ']' for out
+ in output]
+ results = pd.DataFrame(columns=output)
+
+ # re-starting indexes
+ data.reset_index(drop=True, inplace=True)
+
+ # Mapping boolean columns
+ if bool_cols is not None:
+ for col in bool_cols:
+ data[col] = data[col].astype(bool)
+
+ # Mapping date type for SQL
+ data = self.date_mapping_data_types(data)
+
+ # create connection
+ if self.commit_as_transaction != commit_as_transaction:
+ self.commit_as_transaction = commit_as_transaction
+ if not commit_as_transaction:
+ self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+ else:
+ self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
+
+ if self.con_write is None:
+ self.con_write = self.engine_write.connect().connection
+
+ try:
+ # initialise cursor
+ cursor = self.con_write.cursor()
+
+ # extraction of the useful columns
+ data_update = data[list(set(update_list + on_list))]
+
+ # initialisation of the sql statement
+ sql_start = ''' UPDATE [%s].[%s] SET ''' % (schema, table)
+ iter_batch = math.ceil(data_update.shape[0] / batch_size)
+ for batch in range(iter_batch):
+ batch_update = data_update.iloc[batch * batch_size: (batch + 1) * batch_size]
+
+ sql_statement = ''
+ for iindex in batch_update.index:
+ # UPDATE [SCHEMA].[TABLE]
+ sql_statement += sql_start
+
+ # VALUES
+ for col in update_list:
+ if nullable:
+ if pd.isna(batch_update.loc[iindex, col]):
+ sql_statement += " [%s] = NULL ," % col
+ elif isinstance(batch_update.loc[iindex, col], bool):
+ sql_statement += " [%s] = %s ," % (col, int(batch_update.loc[iindex, col]))
+ elif isinstance(batch_update.loc[iindex, col], Number):
+ sql_statement += " [%s] = %s ," % (col, batch_update.loc[iindex, col])
+ else:
+ sql_statement += " [%s] = '%s' ," % (col, batch_update.loc[iindex, col])
+ else:
+ if pd.notna(batch_update.loc[iindex, col]):
+ if str(batch_update.loc[iindex, col]).upper() == 'NULL':
+ continue
+ elif isinstance(batch_update.loc[iindex, col], bool):
+ sql_statement += " [%s] = %s ," % (col, int(batch_update.loc[iindex, col]))
+ elif isinstance(batch_update.loc[iindex, col], Number):
+ sql_statement += " [%s] = %s ," % (col, batch_update.loc[iindex, col])
+ else:
+ sql_statement += " [%s] = '%s' ," % (col, batch_update.loc[iindex, col])
+
+ # OUTPUT
+ if len(output) > 0:
+ sql_statement = sql_statement[:-1] + " OUTPUT " + ",".join(output) + ' '
+
+ # WHERE
+ sql_statement = sql_statement[:-1] + ' WHERE '
+ for col in on_list:
+ if pd.isna(batch_update.loc[iindex, col]):
+ sql_statement += " [%s] = NULL AND" % col
+ elif isinstance(batch_update.loc[iindex, col], bool):
+ sql_statement += " [%s] = %s ," % (col, int(batch_update.loc[iindex, col]))
+ elif isinstance(batch_update.loc[iindex, col], Number):
+ sql_statement += " [%s] = %s AND" % (col, batch_update.loc[iindex, col])
+ else:
+ sql_statement += " [%s] = '%s' AND" % (col, batch_update.loc[iindex, col])
+
+ # Addition of semicolon
+ sql_statement = sql_statement[:-3] + ';'
+
+ if print_sql:
+ print(sql_statement)
+
+ # executing statement
+ if len(sql_statement) > 0:
+ if len(output) > 0:
+ cursor.execute(sql_statement)
+ for cursor_number in range(len(sql_statement.split(';')) - 1):
+ results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
+ cursor.nextset()
+ else:
+ cursor.execute(sql_statement)
+ self.con_write.commit()
+
+ if len(output) > 0:
+ return results.reset_index(drop=True)
+
+ except Exception:
+ raise Exception(traceback.format_exc())
+
+ finally:
+ if cursor:
+ cursor.close()
+
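Usage sketch for the batched update; the table and columns are hypothetical:

changes = pd.DataFrame({'OrderId': [1, 2], 'Status': ['Shipped', 'Closed']})
before = con_.update(changes, update_list=['Status'], on_list=['OrderId'],
                     schema='dbo', table='Orders', batch_size=100,
                     output=['Status'])  # plain column names default to Deleted.[...]
# 'before' holds the pre-update Status values; one UPDATE statement is built per row.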
+ def bulk_update(self, data, update_list, on_list, schema, table, bool_cols=None, print_sql=False, output=None,
+ chunk=1000, commit_as_transaction=True):
+ """ This method updates a table in batches in sql server.
+ -----------------------------
+ UPDATE [SCHEMA].[TABLE]
+ SET update_list[0] = data[index, update_list[0]],
+ update_list[1] = data[index, update_list[1]]
+ OUTPUT output[0], output[1]
+ WHERE on_list[0] = data[index, on_list[0]]
+ AND on_list[1] = data[index, on_list[1]]
+ -----------------------------
+ :param data: DataFrame containing the data to update
+ :param update_list: list of columns to update
+ :param on_list: list of columns to apply the on clause
+ :param schema: Schema of the table in which the data will be uploaded
+ :param table: Table in which the data will be uploaded
+ :param bool_cols: list of columns gathering boolean types
+ :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
+ :param output: Outputs the columns indicated in this list as a DataFrame. It should indicate if the column to
+ retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one
+ will be retrieved)
+ :param chunk: Indicate how many rows will be uploaded at once
+ :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+ :return: None
+ """
+ cursor = None
+ if data is None:
+ # no data to update
+ return ValueError("The data provided is invalid!")
+
+ if output is None:
+ output = []
+ sql_output = []
+ else:
+ sql_output = [out if 'inserted' in out.lower() or 'deleted' in out.lower() else 'Deleted.[' + out + ']' for
+ out
+ in output]
+ results = pd.DataFrame(columns=output)
+
+ # re-starting indexes
+ data.reset_index(drop=True, inplace=True)
+
+ # Mapping boolean columns
+ if bool_cols is not None:
+ for col in bool_cols:
+ data[col] = data[col].astype(bool)
+
+ # Mapping date type for SQL
+ data = data[on_list + update_list]
+ data = self.date_mapping_data_types(data)
+
+ # create connection
+ if self.commit_as_transaction != commit_as_transaction:
+ self.commit_as_transaction = commit_as_transaction
+ if not commit_as_transaction:
+ self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+ else:
+ self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
+
+ if self.con_write is None:
+ self.con_write = self.engine_write.connect().connection
+
+ try:
+ # initialise cursor
+ cursor = self.con_write.cursor()
+
+ # Convert category columns to string
+ cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
+ data[cat_cols] = data[cat_cols].astype(str)
+ # Deal with null values and apostrophes (')
+ data = data.replace("'NULL'", "NULL")
+ data = data.replace("'", "~~", regex=True).infer_objects(copy=False)
+ data = data.fillna("null")
+
+ records = [tuple(x) for x in data.values]
+ temp_table = f'#Temp{schema}{table}'
+
+ for batch in self._chunker(records, chunk):
+ batch_records = [tuple(x) for x in batch]
+ # initialisation of the sql statement
+ insert_ = f'DROP TABLE IF EXISTS {temp_table} '
+ insert_ += f"SELECT * INTO {temp_table} FROM ( VALUES "
+ temp_columns = str(tuple(data.columns.values)).replace("(\'", "([").replace(
+ '\', \'', '], [').replace('\')', '])')
+ rows = str(batch_records).strip('[]').replace("~~", "''")
+ rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
+ sql_statement = insert_ + rows
+ sql_statement = self.convert_decimal_str(sql_statement)
+ sql_statement += f') AS TempTable {temp_columns}'
+
+ col_update_set = ''
+ for col in update_list:
+ col_update_set += f' target.{col} = source.{col},'
+ col_update_set = col_update_set[:-1]
+
+ col_difference_check = ''
+ for col in update_list:
+ col_difference_check += f' target.{col} <> source.{col} OR'
+ col_difference_check = col_difference_check[:-2]
+
+ col_join_on = ''
+ for col in on_list:
+ col_join_on += f' source.{col} = target.{col} AND'
+ col_join_on = col_join_on[:-3]
+
+ sql_statement += f'UPDATE target SET {col_update_set} '
+
+ if len(output) > 0:
+ sql_statement += f" OUTPUT {','.join(sql_output)} "
+
+ sql_statement += f'''FROM {schema}.{table} target
+ JOIN {temp_table} as source
+ ON {col_join_on}
+ WHERE {col_difference_check}
+ '''
+
+ sql_statement += f' DROP TABLE IF EXISTS {temp_table} '
+
+ if print_sql:
+ print(sql_statement)
+
+ # executing statement
+ if len(sql_statement) > 0:
+ if len(output) > 0:
+ cursor.execute(sql_statement)
+ cursor.nextset()
+ results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
+ else:
+ cursor.execute(sql_statement)
+
+ self.con_write.commit()
+
+ if len(output) > 0:
+ return results.reset_index(drop=True)
+
+ except Exception:
+ raise Exception(traceback.format_exc())
+
+ finally:
+ if cursor:
+ cursor.close()
+
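Unlike update, bulk_update stages each chunk into a #Temp... table and issues one set-based UPDATE ... FROM join per chunk, so large batches cost one round trip per chunk rather than one per row. A sketch with the same hypothetical table:

changes = pd.DataFrame({'OrderId': list(range(1, 5001)), 'Status': ['Shipped'] * 5000})
con_.bulk_update(changes, update_list=['Status'], on_list=['OrderId'],
                 schema='dbo', table='Orders', chunk=1000)
# Five statements (one per 1000-row chunk); rows whose Status already matches are
# skipped by the <> difference check in the WHERE clause.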
+ def merge(self, data, staging_schema, staging_table, sp_schema, sp_name, truncate=False, chunk=1000,
+ commit_as_transaction=True):
+ """ Merge data from Staging table using a Stored Procedure. It requires a table in SQL which will store the
+ Staging data. The method will work as follows:
+ 1.- Truncate the staging table according to the truncate parameter
+ 2.- Insert the data into the staging table
+ 3.- Execute a stored procedure to merge the staging table with the destination table
+ -----------------------------
+ df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1, 2]})
+ con_.merge(df, staging_schema, staging_table, sp_schema, sp_name, truncate=True)
+ -----------------------------
+ :param data: DataFrame to insert in the staging table
+ :param staging_schema: Staging table schema
+ :param staging_table: Staging table name
+ :param sp_schema: Stored Procedure schema
+ :param sp_name: Stored Procedure name
+ :param truncate: Indicate whether the staging table has to be truncated or not
+ :param chunk: Indicate how many rows will be uploaded at once
+ :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
+ :return: None
+ """
+ if data is None:
+ # no data to upload
+ return ValueError("The data provided is invalid!")
+ cursor = None
+
+ if self.commit_as_transaction != commit_as_transaction:
+ self.commit_as_transaction = commit_as_transaction
+ if not commit_as_transaction:
+ self.engine_write = self.engine_write.execution_options(isolation_level="AUTOCOMMIT")
+ else:
+ self.engine_write = self.engine_write.execution_options(isolation_level="READ COMMITTED")
+
+ if self.con_write is None:
+ self.con_write = self.engine_write.connect().connection
+
+ try:
+ cursor = self.con_write.cursor()
+ # Truncate Staging table if needed
+ if truncate:
+ trunc_insert = """TRUNCATE TABLE [%s].[%s]""" % (staging_schema, staging_table)
+ cursor.execute(trunc_insert)
+ self.con_write.commit()
+ # Convert category columns to string
+ cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
+ data[cat_cols] = data[cat_cols].astype(str)
+ # Deal with null values and apostrophes (')
+ data = data.replace("'NULL'", "NULL")
+ data = data.replace("'", "~~", regex=True).infer_objects(copy=False)
+ data = data.fillna("null")
+ # Insert in Staging Table
+ records = [tuple(x) for x in data.values]
+ insert_ = """INSERT INTO [%s].[%s] """ % (staging_schema, staging_table)
+ insert_ = insert_ + str(tuple(data.columns.values)).replace("\'", "") + """ VALUES """
+ for batch in self._chunker(records, chunk):
+ rows = str(batch).strip('[]').replace("~~", "''")
+ rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
+ string = insert_ + rows
+ string = self.convert_decimal_str(string)
+ cursor.execute(string)
+ self.con_write.commit()
+ # Execute Stored Procedure
+ exec_sp = """EXECUTE [%s].[%s]""" % (sp_schema, sp_name)
+ cursor.execute(exec_sp)
+ self.con_write.commit()
+ except Exception:
+ raise Exception(traceback.format_exc())
+ finally:
+ if cursor:
+ cursor.close()
+
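End-to-end sketch of the staging merge; the staging table and procedure names are invented:

df = pd.DataFrame({'Sku': ['A1', 'B2'], 'Price': [9.99, 4.50]})
con_.merge(df, staging_schema='stg', staging_table='PriceFeed',
           sp_schema='dbo', sp_name='usp_MergePriceFeed', truncate=True)
# 1) TRUNCATE [stg].[PriceFeed]  2) chunked INSERTs  3) EXECUTE [dbo].[usp_MergePriceFeed]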
+ def merge_into(self, data, schema, table, on_list, update_check=False, update_set=None, bool_cols=None,
+ identity=False, print_sql=False, nullable=False):
+ """
+ This method is equivalent to the 'merge into' of T-sql. Schema and table define the Target, while data is the
+ Source. Please refer to the schema below for clarification on how the arguments are used.
+ Aspects to take into consideration:
+ 1.- This method will not work properly if data contains duplicates. It is not relevant if the target contains
+ duplicates because DISTINCT is used to call the table.
+ 2.- When having booleans in the dataset you have to pay attention because pandas gets bools from sql server as
+ [True, False], instead of [0,1]. The method needs data of type boolean to be inserted as [0, 1].
+ 3.- When dealing with datetime columns a similar problem arises. time_format is a dict that contains as keys
+ the name of a date column and as values the format that the column has to have.
+ -------------------------
+ MERGE INTO [SCHEMA].[TABLE] AS TARGET
+ USING (
+ data
+ ) AS SOURCE
+ ON TARGET.on_list[0] = SOURCE.on_list[0]
+ AND TARGET.on_list[1] = SOURCE.on_list[1]
+ ...
+ AND TARGET.on_list[n] = SOURCE.on_list[n]
+ WHEN MATCHED AND (
+ TARGET.update_check[0] <> SOURCE.update_check[0]
+ OR TARGET.update_check[1] <> SOURCE.update_check[1]
+ ...
+ OR TARGET.update_check[n] <> SOURCE.update_check[n]
+ )
+ UPDATE SET TARGET.update_check[0] = SOURCE.update_check[0],
+ ...
+ TARGET.update_check[n] = SOURCE.update_check[n],
+ TARGET.update_set[0] = SOURCE.update_set[0],
+ TARGET.update_set[1] = SOURCE.update_set[1],
+ ....
+ TARGET.update_set[n] = SOURCE.update_set[n]
+ WHEN NOT MATCHED BY TARGET THEN
+ INSERT
+ (
+ all columns from [SCHEMA].[TABLE]
+ )
+ VALUES
+ (all columns from data)
+ -------------------------------
+ :param data: DataFrame containing the data to upload/update
+ :param schema: Schema of the table in which the data will be uploaded
+ :param table: Table in which the data will be uploaded
+ :param on_list: list of columns to apply the on clause
+ :param update_check: list of columns to do the check
+ :param update_set: list of columns to update
+ :param bool_cols: list of columns gathering boolean types
+ :param identity: Indicate whether the identity columns will be inserted or not, only makes sense when the table
+ in its definition has it. It's a boolean.
+ :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
+ :param nullable: Used for the boolean_mapping_data_types to indicate which boolean column values to convert
+ :return: None
+ """
+ if data is None:
+ # no data to upload
+ return ValueError("The data provided is invalid!")
+
+ if data.shape[0] != data.drop_duplicates().shape[0]:
+ return TypeError("There are duplicate values in your dataframe, it will not work properly on "
+ "pd.concat().drop_duplicates()")
+
+ # if update_set has values assigned, update_check has to have values assigned
+ if update_set is not None:
+ if update_check is None:
+ return ValueError("Please assign values to update_check in order to use update_set")
+ else:
+ update_set = update_check
+
+ # Mapping boolean columns
+ if bool_cols is not None:
+ for col in bool_cols:
+ data[col] = data[col].astype(bool)
+
+ # Mapping date and boolean type for SQL
+ data = self.date_mapping_data_types(data)
+ data = self.boolean_mapping_data_types(data, nullable)
+
+ try:
+ # call the table from the server
+ data_table = self.query("""SELECT DISTINCT * FROM [%s].[%s]""" % (schema, table))
+
+ if data_table.shape[0] == 0:
+ print("The destination table is empty so all the data will be inserted")
+ self.insert(data, schema, table)
+
+ else:
+ for data_col in data.columns:
+ if ("int" in str(type(data_table[data_col].iloc[0]))) & (
+ data_table[data_col].isnull().sum() > 0):
+ data_table[data_col] = data_table[data_col].astype(float)
+ else:
+ data_table[data_col] = data_table[data_col].astype(type(data[data_col].iloc[0]))
+
+ coincidence = pd.DataFrame()
+ if data_table.shape[0] > 0:
+ for col in data_table.columns.values.tolist():
+ if isinstance(data_table.loc[0, col], bool):
+ data_table[col] = data_table[col].apply(
+ lambda x: 1 if x is True else 0 if x is False else np.nan)
+ if bool_cols is not None:
+ for col in bool_cols:
+ data_table[col] = data_table[col].astype(bool)
+ # join the input table with the one in the database
+ coincidence = data.merge(data_table[on_list], how='inner', on=on_list)
+ # WHEN MATCHED AND ... UPDATE SET
+ if update_check:
+ coincidence2 = coincidence.merge(data_table[list(set(on_list + update_check))],
+ how='inner',
+ on=list(set(on_list + update_check)))
+ data_update = pd.concat([coincidence, coincidence2], ignore_index=True)
+ data_update.drop_duplicates(keep=False, inplace=True)
+ if data_update.shape[0] > 0:
+ self.update(data_update, list(set(update_set + update_check)), on_list, schema, table,
+ print_sql=print_sql)
+
+ # WHEN NOT MATCHED BY TARGET THEN... INSERT
+ data_insert = pd.concat([data, coincidence], ignore_index=True)
+ data_insert.drop_duplicates(keep=False, inplace=True)
+ if data_insert.shape[0] > 0:
+ self.insert(data_insert, schema, table, identity=identity, print_sql=print_sql)
+
+ except Exception:
+ raise Exception(traceback.format_exc())
+
+ @staticmethod
+ def _parse_df(parse_, data, col_names):
+ """ Auxiliary function to convert list to DataFrame
+ :param parse_: Parameter to indicate whether the data has to be transformed into a DataFrame or not
+ :param data: List gathering the data retrieved from SQL
+ :param col_names: List of columns to create the DataFrame
+ :return: Formatted data
+ """
+ if parse_ is True:
+ col_names = list(zip(*list(col_names)))[0]
+ res = pd.DataFrame(list(zip(*data)), index=col_names).T
+ else:
+ res = [col_names, data]
+ return res
+
+ @staticmethod
+ def _chunker(seq, size):
+ """ Split the data set in chunks to be sent to SQL
+ :param seq: Sequence of records to be split
+ :param size: Size of the chunks to split the data
+ :return: The sequence divided in chunks
+ """
+ return (seq[pos:pos + size] for pos in range(0, len(seq), size))
+
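_chunker is a plain generator over list slices, for example:

records = [(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')]
list(SQLConnectionPool._chunker(records, 2))
# -> [[(1, 'a'), (2, 'b')], [(3, 'c'), (4, 'd')], [(5, 'e')]]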
2101
+ @staticmethod
2102
+ def date_mapping_data_types(data):
2103
+ """
2104
+ Map datetime and boolean variables so they can be inserted in SQL
2105
+ :param data: DataFrame containing the variables to map
2106
+ :return: The mapped DataFrame
2107
+ """
2108
+ first_index = data.index[0]
2109
+ date_col = data.columns[
2110
+ [('date' in str(type(data.loc[first_index, col]))) | ('timestamp' in str(type(data.loc[first_index, col])))
2111
+ for col in data.columns]]
2112
+ if len(date_col) > 0:
2113
+ for col in date_col:
2114
+ data[col] = pd.to_datetime(data[col])
2115
+ if data[col].dtypes == 'O':
2116
+ data[col] = data[col].dt.strftime('%Y-%m-%d')
2117
+ else:
2118
+ data[col] = data[col].dt.strftime('%Y-%m-%d %H:%M:%S')
2119
+ data.loc[data[col] == 'NaT', col] = np.nan
2120
+
2121
+ return data
2122
+
2123
+ @staticmethod
2124
+ def boolean_mapping_data_types(data, nullable=False):
2125
+ """
2126
+ Map datetime and boolean variables so they can be inserted in SQL
2127
+ :param data: DataFrame containing the variables to map
2128
+ :param nullable: Determine if you want to convert null values within boolean columns to boolean format or not
2129
+ :return: The mapped DataFrame
972
2130
  """
973
2131
  first_index = data.index[0]
974
2132
  bool_col = data.columns[
@@ -990,7 +2148,7 @@ class SQLConnection:
  def id_next(con_db, table, schema, id_col, print_sql=False):
  """
  This static method returns the next id to be inserted into a table for sql_server
- :param con_db: class to connect to a sql server dabatase
+ :param con_db: class to connect to a sql server database
  :param table: name of the table
  :param schema: name of the schema
  :param id_col: name of the id column