berryworld 1.0.0.176826__py3-none-any.whl → 1.0.0.177678__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
berryworld/__init__.py CHANGED
@@ -2,7 +2,6 @@ from .xml_parser import XMLparser
2
2
  from .handy_mix import HandyMix
3
3
  from .transportation_solver import TransportationAlgorithm
4
4
  from .allocation_solver import AllocationSolver
5
- from .sql_connection import SQLConnection
6
5
  from .pickle_management import PickleManagement
7
6
  from .postgres_connection import Postgresql
8
7
  from .email_logging import EmailLogging
berryworld/sql_conn.py CHANGED
@@ -9,8 +9,6 @@ from urllib import parse
9
9
  from numbers import Number
10
10
  from .credentials import SQLCredentials
11
11
 
12
- pd.set_option('future.no_silent_downcasting', True)
13
-
14
12
 
15
13
  class SQLConn:
16
14
  """ Connect to Microsoft SQL """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: berryworld
3
- Version: 1.0.0.176826
3
+ Version: 1.0.0.177678
4
4
  Summary: Handy classes to improve ETL processes
5
5
  Home-page: https://www.berryworld.com
6
6
  Author: BerryWorld ltd
@@ -1,4 +1,4 @@
1
- berryworld/__init__.py,sha256=Y6l1wu4yAxo7s1Yi4BliIfj2WquUPtE9BmDIpfCx80Q,1119
1
+ berryworld/__init__.py,sha256=QvrKku17Q0ns8sDSVnN0qp_clb4W3tKyzVNMyq0RatI,1077
2
2
  berryworld/aks_logs.py,sha256=Gb2_cokiZbEX01Yoptd0MxpDociaug-GrXdwliyxFBo,2293
3
3
  berryworld/allocation_solver.py,sha256=asFtaCAze6-eHUGWXA0kAp67UBS-Upj1KKdrVLj_ttQ,8513
4
4
  berryworld/app_logs.py,sha256=MKzKPYd3JuPfOQNAapIgaeZeFHw1z_w2mbn9I6QCADE,4180
@@ -17,8 +17,7 @@ berryworld/pickle_management.py,sha256=O49ojVtTqYCT510rVRTbZWWaur_-5q3HSVG03Azn8
17
17
  berryworld/postgres_connection.py,sha256=whKDnchd5Feqpmxpoh2vlyn36EKHR-dVEULYq0N_4wA,8287
18
18
  berryworld/power_automate.py,sha256=9rDuRy0v-Ttq-SThid4lOB_tD4ibkyEmobiROpa--g4,25414
19
19
  berryworld/sharepoint_con.py,sha256=TuH-Vxk1VxjTi7x80KFssf_J8YPLRXpV27RBaFZi37U,22254
20
- berryworld/sql_conn.py,sha256=Mdvf2z7oKnPVpi8e4mydWaQSb4DkYMh_uLLhGpWIzMM,47452
21
- berryworld/sql_connection.py,sha256=u-2oNFC8cTP0nXkBGp62XjD06kukMFkwFMQ57CwsySQ,44364
20
+ berryworld/sql_conn.py,sha256=BgdjWkhuabwnz0G8k7AWr-WoY540FIFgY9ec6Mn3uVc,47399
22
21
  berryworld/teams_logging.py,sha256=8NwXyWr4fLj7W6GzAm2nRQCGFDxibQpAHDHHD24FrP8,6997
23
22
  berryworld/transportation_solver.py,sha256=AdJPekVNufweaKDZLWYIB9qSxeVti80LaoaD-4NCSjc,5038
24
23
  berryworld/verify_keys.py,sha256=J2J505PcmBsQ9bj0XSRtXjpY-8qwpPU1A5LQdFRicFU,4257
@@ -29,8 +28,8 @@ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
28
  tests/test_allocation_config.py,sha256=e12l6fE9U57eSPS35g6ekJ_hol7-RHg89JV60_m1BlE,4633
30
29
  tests/test_handy_mix_config.py,sha256=Un56mz9KJmdn4K4OwzHAHLSRzDU1Xv2nFrONNuzOG04,2594
31
30
  tests/test_xml_parser.py,sha256=3QTlhFEd6KbK6nRFKZnc35tad6wqukTbe4QrFi8mr_8,859
32
- berryworld-1.0.0.176826.dist-info/LICENSE,sha256=vtkVCJM6E2af2gnsi2XxKPr4WY-uIbvzVLXieFND0UU,1074
33
- berryworld-1.0.0.176826.dist-info/METADATA,sha256=d7EymZKDEDB0wwI0JlhO_7lsTe9FM5h--fkOrcffZ3g,1107
34
- berryworld-1.0.0.176826.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
35
- berryworld-1.0.0.176826.dist-info/top_level.txt,sha256=GIZ5qy-P5oxfEH755vA1IMFeTVdX3-40JxMe6nOe5I8,17
36
- berryworld-1.0.0.176826.dist-info/RECORD,,
31
+ berryworld-1.0.0.177678.dist-info/LICENSE,sha256=vtkVCJM6E2af2gnsi2XxKPr4WY-uIbvzVLXieFND0UU,1074
32
+ berryworld-1.0.0.177678.dist-info/METADATA,sha256=1cpTYSRAAtxZKiXFHEnK1kfJxi37Ojt-Qc5YKKXm2cM,1107
33
+ berryworld-1.0.0.177678.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
34
+ berryworld-1.0.0.177678.dist-info/top_level.txt,sha256=GIZ5qy-P5oxfEH755vA1IMFeTVdX3-40JxMe6nOe5I8,17
35
+ berryworld-1.0.0.177678.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (70.1.0)
2
+ Generator: setuptools (70.3.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,946 +0,0 @@
1
- import sqlalchemy as sa
2
- import pyodbc
3
- import math
4
- import pandas as pd
5
- import numpy as np
6
- from urllib import parse
7
- import traceback
8
- from numbers import Number
9
- import re
10
-
11
-
12
- class SQLConnection:
13
- """ Connect to Microsoft SQL """
14
-
15
- def __init__(self, server_creds, wincred=False, master=False, trusted_certificate=True):
16
- """ Initialize the class
17
- -----------------------------
18
- server_creds = {
19
- "server_name": "",
20
- "db_name": "",
21
- "user_name": "",
22
- "password": ""
23
- }
24
- wincred = True
25
- master = False
26
-
27
- con_ = SQLConnection(server_creds, wincred, master)
28
- -----------------------------
29
- :param server_creds: Dictionary containing the info to connect to the Server
30
- :param wincred: Indicate whether the connection to SQL will be done via Windows Authentication or not
31
- :param master: Indicate whether the connection will be done to master or to a specific database
32
- :param trusted_certificate: Indicate whether the connection will be done using the TrustServerCertificate
33
- parameter
34
- """
35
- self.wincred = wincred
36
- self.master = master
37
- if trusted_certificate:
38
- self.trusted_certificate = '&TrustServerCertificate=yes'
39
- else:
40
- self.trusted_certificate = ''
41
-
42
- drivers = [driver for driver in pyodbc.drivers() if (bool(re.search(r'\d', driver)))]
43
- self.driver = drivers[0]
44
- self.server = server_creds['server_name']
45
- self.user_name = server_creds['user_name']
46
- self.password = server_creds['password']
47
-
48
- if ~self.master:
49
- self.db_name = server_creds['db_name']
50
-
51
- self.con = None
52
- self.engine = None
53
- self.con_string = None
54
-
55
- driver_attempt = ''
56
- for driver in drivers:
57
- try:
58
- self.driver = driver
59
- self.query('''SELECT TOP 1 * FROM information_schema.tables;''')
60
- break
61
- except Exception as e:
62
- print(e)
63
- driver_attempt = str(e)
64
-
65
- if driver_attempt != '':
66
- raise ValueError(
67
- "Cannot connect to db: %s - Error: %s" % (self.db_name, str(driver_attempt)))
68
-
69
- def open_read_connection(self):
70
- """ Open a reading connection with the Server
71
- :return: The opened connection
72
- """
73
-
74
- if self.wincred:
75
- if self.master:
76
- self.con_string = 'mssql+pyodbc://' + self.user_name + ':%s@' + self.server + '/master' + \
77
- '?driver=' + self.driver + '&trusted_connection=yes' + self.trusted_certificate
78
- self.engine = sa.create_engine(self.con_string % parse.quote_plus(self.password))
79
- else:
80
- self.con_string = 'mssql+pyodbc://' + self.user_name + ':%s@' + self.server + '/' + self.db_name + \
81
- '?driver=' + self.driver + '&trusted_connection=yes' + self.trusted_certificate
82
- self.engine = sa.create_engine(self.con_string % parse.quote_plus(self.password))
83
- else:
84
- self.con_string = 'mssql+pyodbc://' + self.user_name + ':%s@' + self.server + '/' + self.db_name + \
85
- '?driver=' + self.driver + self.trusted_certificate
86
- self.engine = sa.create_engine(self.con_string % parse.quote_plus(self.password))
87
- self.con = self.engine.connect().connection
88
-
89
- def open_write_connection(self):
90
- """ Open a writing connection with the Server
91
- :return: The opened connection
92
- """
93
- # driver = 'SQL+Server'
94
- constring = 'mssql+pyodbc://' + self.user_name + ':%s@' + self.server + '/' + self.db_name + \
95
- '?driver=' + self.driver + self.trusted_certificate
96
- self.engine = sa.create_engine(constring % parse.quote_plus(self.password))
97
- self.con = self.engine.connect().connection
98
-
99
- def close_connection(self):
100
- """ Close any opened connections with the Server
101
- :return: None
102
- """
103
- self.con.close()
104
- if self.engine:
105
- self.engine.dispose()
106
-
107
- @staticmethod
108
- def _chunker(seq, size):
109
- """ Split the data set in chunks to be sent to SQL
110
- :param seq: Sequence of records to be split
111
- :param size: Size of any of the chunks to split the data
112
- :return: The DataFrame divided in chunks
113
- """
114
- return (seq[pos:pos + size] for pos in range(0, len(seq), size))
115
-
116
- def query(self, sql_query, coerce_float=False):
117
- """ Read data from SQL according to the sql_query
118
- -----------------------------
119
- query_str = "SELECT * FROM %s" & table
120
- con_.query(query_str)
121
- -----------------------------
122
- :param sql_query: Query to be sent to SQL
123
- :param coerce_float: Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal)
124
- to floating point.
125
- :return: DataFrame gathering the requested data
126
- """
127
- self.open_read_connection()
128
- data = None
129
- try:
130
- with self.engine.begin() as conn:
131
- data = pd.read_sql_query(sa.text(sql_query), conn, coerce_float=coerce_float)
132
- except ValueError:
133
- print(traceback.format_exc())
134
- finally:
135
- self.close_connection()
136
- return data
137
-
138
- @staticmethod
139
- def _parse_df(parse, data, col_names):
140
- """ Auxiliar function to convert list to DataFrame
141
- :param parse: Parameter to indicate whether the data has to be transformed into a DataFrame or not
142
- :param data: List gathering the data retrieved from SQL
143
- :param col_names: List of columns to create the DataFrame
144
- :return: Formatted data
145
- """
146
- if parse is True:
147
- col_names = list(zip(*list(col_names)))[0]
148
- res = pd.DataFrame(list(zip(*data)), index=col_names).T
149
- else:
150
- res = [col_names, data]
151
- return res
152
-
153
- def sp_results(self, sql_query, resp_number=None, parse=True):
154
- """ Execute a stored procedure and retrieves all its output data
155
- -----------------------------
156
- query_str = "EXECUTE %s" & stored_procedure
157
- con_.sp_results(query_str, resp_number=1)
158
- -----------------------------
159
- :param sql_query: Query to be sent to SQL
160
- :param resp_number: Indicate which of the stored procedures responses will be retrieved
161
- :param parse: Indicate whether the output needs to be converted to a DataFrame or not
162
- :return: DataFrame list gathering the requested data
163
- """
164
- self.open_read_connection()
165
- data_list = list()
166
- cursor = None
167
- try:
168
- cursor = self.con.cursor()
169
- cursor.execute(sql_query)
170
- if resp_number is not None:
171
- for cursor_number in range(resp_number - 1):
172
- cursor.nextset()
173
- try:
174
- data_list.append(self._parse_df(parse, cursor.fetchall(), cursor.description))
175
- except ValueError:
176
- raise ValueError('Please indicate a valid resp_number')
177
- else:
178
- aux_cursor = True
179
- count = 0
180
- while aux_cursor is not False and count < 100:
181
- try:
182
- data_list.append(self._parse_df(parse, cursor.fetchall(), cursor.description))
183
- aux_cursor = cursor.nextset()
184
- except Exception:
185
- cursor.nextset()
186
- finally:
187
- count += 1
188
- if count >= 100:
189
- raise RuntimeError("Method sp_results has loop over 100 times for database '%s' on server '%s'"
190
- % (self.db_name, self.server))
191
- self.con.commit()
192
- except ValueError:
193
- print(traceback.format_exc())
194
- finally:
195
- if cursor:
196
- cursor.close()
197
- self.close_connection()
198
- return data_list
199
-
200
- def run_statement(self, sql_statement):
201
- """ Execute SQL statement
202
- -----------------------------
203
- query_str = "DELETE FROM %s WHERE Id > 100" & table
204
- con_.run_statement(query_str)
205
- -----------------------------
206
- :param sql_statement: Statement as string to be run in SQL
207
- :return: Statement result
208
- """
209
- self.open_write_connection()
210
- cursor = self.con.cursor()
211
- # Execute SQL statement
212
- try:
213
- cursor.execute(sql_statement)
214
- self.con.commit()
215
- except Exception:
216
- raise Exception(traceback.format_exc())
217
- finally:
218
- if cursor:
219
- cursor.close()
220
- self.close_connection()
221
-
222
- def insert(self, data, schema, table, truncate=False, delete=False, identity=False, chunk=1000, print_sql=False,
223
- commit_all_together=False, output=None, bools2bits=True, nullable=False):
224
- """ Insert data in a table in SQL truncating the table if needed
225
- -----------------------------
226
- df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1, 2]})
227
- con_.insert(df, table_schema, table_name)
228
- -----------------------------
229
- :param data: DataFrame containing the data to upload
230
- :param schema: Schema of the table in which the data will be uploaded
231
- :param table: Table in which the data will be uploaded
232
- :param truncate: Indicate whether the table has to be truncated before the data is sent or not
233
- :param delete: Delete the rows from a table (Suitable for tables that cannot be truncated because of
234
- external constraints)
235
- :param identity: Indicate whether the identity columns will be inserted or not
236
- :param chunk: Indicate how many rows will be uploaded at once
237
- :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
238
- :param commit_all_together: when it is true, it only commits data if all data has been inserted. When it is
239
- false, it commits data by chunks.
240
- :param output: Outputs the columns indicated in this list
241
- :param bools2bits: Indicate whether the Boolean columns should be converted to BIT to be inserted into SQL
242
- :return: A DataFrame with the output columns requested if output is not None, else None
243
- :param nullable: Used within bools2bits function to indicate which boolean column values to convert
244
- """
245
- if output is None:
246
- output = []
247
- if data is None:
248
- # no data to upload
249
- return ValueError("The data provided is invalid!")
250
- cursor = None
251
- self.open_write_connection()
252
- results = pd.DataFrame(columns=output)
253
-
254
- # Mapping the date datatype columns for SQL
255
- data = self.date_mapping_data_types(data)
256
-
257
- # Mapping the boolean columns to bit
258
- if bools2bits:
259
- data = self.boolean_mapping_data_types(data, nullable)
260
-
261
- try:
262
- cursor = self.con.cursor()
263
- # Truncate table if needed
264
- if truncate:
265
- cursor.execute("TRUNCATE TABLE [%s].[%s]" % (schema, table))
266
- # Delete all records from the table if needed
267
- if delete:
268
- cursor.execute("DELETE FROM [%s].[%s]" % (schema, table))
269
- # Allow to insert to an Identity column
270
- if identity:
271
- cursor.execute("SET IDENTITY_INSERT [%s].[%s] ON" % (schema, table))
272
- # Convert category columns to string
273
- cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
274
- data[cat_cols] = data[cat_cols].astype(str)
275
- # Deal with bull values and apostrophes (')
276
- data = data.replace("'NULL'", "NULL")
277
- data = data.replace("'", "~~", regex=True)
278
- data = data.fillna("null")
279
- # Insert data into the table destination
280
- records = [tuple(x) for x in data.values]
281
- insert_ = """INSERT INTO [%s].[%s] """ % (schema, table)
282
- insert_ += str(tuple(data.columns.values)).replace("(\'", "([").replace('\', \'', '], [').replace('\')',
283
- '])')
284
- if len(output) > 0:
285
- insert_ += " OUTPUT Inserted.[" + "], Inserted.[".join(output) + "] "
286
- insert_ += """ VALUES """
287
-
288
- for batch in self._chunker(records, chunk):
289
- rows = str(batch).strip('[]').replace("~~", "''")
290
- rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
291
- insert_rows = insert_ + rows
292
- insert_rows = self.convert_decimal_str(insert_rows)
293
- if print_sql:
294
- print(insert_rows)
295
- cursor.execute(insert_rows)
296
- if len(output) > 0:
297
- results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
298
- if ~commit_all_together:
299
- self.con.commit()
300
- if commit_all_together:
301
- self.con.commit()
302
-
303
- # Restrict to insert to an Identity column
304
- if identity:
305
- cursor.execute("SET IDENTITY_INSERT [%s].[%s] OFF" % (schema, table))
306
-
307
- if len(output) > 0:
308
- return results.reset_index(drop=True)
309
-
310
- except Exception:
311
- raise Exception(traceback.format_exc())
312
-
313
- finally:
314
- if cursor:
315
- cursor.close()
316
- self.close_connection()
317
-
318
- def insert_at_once(self, data, schema, table, truncate=False, delete=False, identity=False, chunk=1,
319
- print_sql=False, output=None, bools2bits=True, nullable=False):
320
- """ Build all the insert statements and commit them all at once
321
- -----------------------------
322
- df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1, 2]})
323
- con_.insert(df, table_schema, table_name)
324
- -----------------------------
325
- :param data: DataFrame containing the data to upload
326
- :param schema: Schema of the table in which the data will be uploaded
327
- :param table: Table in which the data will be uploaded
328
- :param truncate: Indicate whether the table has to be truncated before the data is sent or not
329
- :param delete: Delete the rows from a table (Suitable for tables that cannot be truncated because of
330
- external constraints)
331
- :param identity: Indicate whether the identity columns will be inserted or not
332
- :param chunk: Indicate how many rows will be uploaded at once
333
- :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
334
- :param output: Outputs the columns indicated in this list
335
- :param bools2bits: Indicate whether the Boolean columns should be converted to BIT to be inserted into SQL
336
- :return: A DataFrame with the output columns requested if output is not None, else None
337
- :param nullable: Used within bools2bits function to indicate which boolean column values to convert
338
- """
339
- if output is None:
340
- output = []
341
- if data is None:
342
- # no data to upload
343
- return ValueError("The data provided is invalid!")
344
- cursor = None
345
- self.open_write_connection()
346
- results = pd.DataFrame(columns=output)
347
-
348
- # Mapping the date datatype columns for SQL
349
- data = self.date_mapping_data_types(data)
350
-
351
- # Mapping the boolean columns to bit
352
- if bools2bits:
353
- data = self.boolean_mapping_data_types(data, nullable)
354
-
355
- try:
356
- cursor = self.con.cursor()
357
- # Truncate table if needed
358
- if truncate:
359
- cursor.execute("TRUNCATE TABLE [%s].[%s]" % (schema, table))
360
- # Delete all records from the table if needed
361
- if delete:
362
- cursor.execute("DELETE FROM [%s].[%s]" % (schema, table))
363
- # Allow to insert to an Identity column
364
- if identity:
365
- cursor.execute("SET IDENTITY_INSERT [%s].[%s] ON" % (schema, table))
366
- # Convert category columns to string
367
- cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
368
- data[cat_cols] = data[cat_cols].astype(str)
369
- # Deal with bull values and apostrophes (')
370
- data = data.replace("'NULL'", "NULL")
371
- data = data.replace("'", "~~", regex=True)
372
- data = data.fillna("null")
373
- # Insert data into the table destination
374
- records = [tuple(x) for x in data.values]
375
- insert_ = """INSERT INTO [%s].[%s] """ % (schema, table)
376
- insert_ += str(tuple(data.columns.values)).replace("(\'", "([").replace('\', \'', '], [').replace('\')',
377
- '])')
378
- if len(output) > 0:
379
- insert_ += " OUTPUT Inserted.[" + "], Inserted.[".join(output) + "] "
380
- insert_ += """ VALUES """
381
-
382
- insert_statements = list()
383
- for batch in self._chunker(records, chunk):
384
- rows = str(batch).strip('[]').replace("~~", "''")
385
- rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
386
- insert_rows = insert_ + rows
387
- insert_rows = self.convert_decimal_str(insert_rows)
388
- insert_statements.append(insert_rows)
389
-
390
- if print_sql:
391
- print(';'.join(insert_statements))
392
- cursor.execute(';'.join(insert_statements))
393
- if len(output) > 0:
394
- results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
395
- self.con.commit()
396
-
397
- # Restrict to insert to an Identity column
398
- if identity:
399
- cursor.execute("SET IDENTITY_INSERT [%s].[%s] OFF" % (schema, table))
400
-
401
- if len(output) > 0:
402
- return results.reset_index(drop=True)
403
-
404
- except Exception:
405
- raise Exception(traceback.format_exc())
406
-
407
- finally:
408
- if cursor:
409
- cursor.close()
410
- self.close_connection()
411
-
412
- def update(self, data, update_list, on_list, schema, table, bool_cols=None, print_sql=False, batch_size=100,
413
- output=None, nullable=True):
414
- """ This method updates a table in batches in sql server.
415
- -----------------------------
416
- UPDATE [SCHEMA].[TABLE]
417
- SET update_list[0] = data[index, update_list{0}],
418
- update_list[1] = data[index, update_list[1]]
419
- OUTPUT output[0], output[1]
420
- WHERE on_list[0] = data[index, on_list[0]]
421
- AND on_list[1] = data[index, on_list[1]]
422
- -----------------------------
423
- :param data: DataFrame containing the data to update
424
- :param update_list: list of columns to update
425
- :param on_list: list of columns to apply the on clause
426
- :param schema: Schema of the table in which the data will be uploaded
427
- :param table: Table in which the data will be uploaded
428
- :param bool_cols: list of columns gathering boolean types
429
- :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
430
- :param bool_cols: columns to include as booleans
431
- :param batch_size: Number of records to update in each iteration
432
- :param output: Outputs the columns indicated in this list as a DataFrame. It should indicate if the column to
433
- retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one will be
434
- retrieved)
435
- :param nullable: Indicate whether to update the table column with null or exclude the reference from the update
436
- :return: None
437
- """
438
- cursor = None
439
- if data is None:
440
- # no data to update
441
- return ValueError("The data provided is invalid!")
442
-
443
- if output is None:
444
- output = []
445
- else:
446
- output = [out if 'inserted' in out.lower() or 'deleted' in out.lower() else 'Deleted.[' + out + ']' for out
447
- in output]
448
- results = pd.DataFrame(columns=output)
449
-
450
- # re-starting indexes
451
- data.reset_index(drop=True, inplace=True)
452
-
453
- # Mapping boolean columns
454
- if bool_cols is not None:
455
- for col in bool_cols:
456
- data[col] = data[col].astype(bool)
457
-
458
- # Mapping date type for SQL
459
- data = self.date_mapping_data_types(data)
460
-
461
- # create connection
462
- self.open_write_connection()
463
-
464
- try:
465
- # initialise cursor
466
- cursor = self.con.cursor()
467
-
468
- # extraction of the useful columns
469
- data_update = data[list(set(update_list + on_list))]
470
-
471
- # initialisation of the sql statement
472
- sql_start = ''' UPDATE [%s].[%s] SET ''' % (schema, table)
473
- iter_batch = math.ceil(data_update.shape[0] / batch_size)
474
- for batch in range(iter_batch):
475
- batch_update = data_update.iloc[batch * batch_size: (batch + 1) * batch_size]
476
-
477
- sql_statement = ''
478
- for iindex in batch_update.index:
479
- # UPDATE [SCHEMA].[TABLE]
480
- sql_statement += sql_start
481
-
482
- # VALUES
483
- for col in update_list:
484
- if nullable:
485
- if pd.isna(batch_update.loc[iindex, col]):
486
- sql_statement += " [%s] = NULL ," % col
487
- elif isinstance(batch_update.loc[iindex, col], bool):
488
- sql_statement += " [%s] = %s ," % (col, int(batch_update.loc[iindex, col]))
489
- elif isinstance(batch_update.loc[iindex, col], Number):
490
- sql_statement += " [%s] = %s ," % (col, batch_update.loc[iindex, col])
491
- else:
492
- sql_statement += " [%s] = '%s' ," % (col, batch_update.loc[iindex, col])
493
- else:
494
- if pd.notna(batch_update.loc[iindex, col]):
495
- if str(batch_update.loc[iindex, col]).upper() == 'NULL':
496
- sql_statement += " [%s] = NULL ," % col
497
- elif isinstance(batch_update.loc[iindex, col], bool):
498
- sql_statement += " [%s] = %s ," % (col, int(batch_update.loc[iindex, col]))
499
- elif isinstance(batch_update.loc[iindex, col], Number):
500
- sql_statement += " [%s] = %s ," % (col, batch_update.loc[iindex, col])
501
- else:
502
- sql_statement += " [%s] = '%s' ," % (col, batch_update.loc[iindex, col])
503
-
504
- # OUTPUT
505
- if len(output) > 0:
506
- sql_statement = sql_statement[:-1] + " OUTPUT " + ",".join(output) + ' '
507
-
508
- # WHERE
509
- sql_statement = sql_statement[:-1] + ' WHERE '
510
- for col in on_list:
511
- if pd.isna(batch_update.loc[iindex, col]):
512
- sql_statement += " [%s] = NULL AND" % col
513
- elif isinstance(batch_update.loc[iindex, col], bool):
514
- sql_statement += " [%s] = %s ," % (col, int(batch_update.loc[iindex, col]))
515
- elif isinstance(batch_update.loc[iindex, col], Number):
516
- sql_statement += " [%s] = %s AND" % (col, batch_update.loc[iindex, col])
517
- else:
518
- sql_statement += " [%s] = '%s' AND" % (col, batch_update.loc[iindex, col])
519
-
520
- # Addition of semicolon
521
- sql_statement = sql_statement[:-3] + ';'
522
-
523
- if print_sql:
524
- print(sql_statement)
525
-
526
- # executing statement
527
- if len(sql_statement) > 0:
528
- if len(output) > 0:
529
- cursor.execute(sql_statement)
530
- for cursor_number in range(len(sql_statement.split(';'))):
531
- results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
532
- cursor.nextset()
533
- else:
534
- cursor.execute(sql_statement)
535
- self.con.commit()
536
-
537
- if len(output) > 0:
538
- return results.reset_index(drop=True)
539
-
540
- except Exception:
541
- raise Exception(traceback.format_exc())
542
-
543
- finally:
544
- if cursor:
545
- cursor.close()
546
- self.close_connection()
547
-
548
- def bulk_update(self, data, update_list, on_list, schema, table, bool_cols=None, print_sql=False, output=None,
549
- chunk=1000):
550
- """ This method updates a table in batches in sql server.
551
- -----------------------------
552
- UPDATE [SCHEMA].[TABLE]
553
- SET update_list[0] = data[index, update_list{0}],
554
- update_list[1] = data[index, update_list[1]]
555
- OUTPUT output[0], output[1]
556
- WHERE on_list[0] = data[index, on_list[0]]
557
- AND on_list[1] = data[index, on_list[1]]
558
- -----------------------------
559
- :param data: DataFrame containing the data to update
560
- :param update_list: list of columns to update
561
- :param on_list: list of columns to apply the on clause
562
- :param schema: Schema of the table in which the data will be uploaded
563
- :param table: Table in which the data will be uploaded
564
- :param bool_cols: list of columns gathering boolean types
565
- :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
566
- :param bool_cols: columns to include as booleans
567
- :param output: Outputs the columns indicated in this list as a DataFrame. It should indicate if the column to
568
- retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one will be
569
- retrieved)
570
- :param chunk: Indicate how many rows will be uploaded at once
571
- :return: None
572
- """
573
- cursor = None
574
- if data is None:
575
- # no data to update
576
- return ValueError("The data provided is invalid!")
577
-
578
- if output is None:
579
- output = []
580
- sql_output = []
581
- else:
582
- sql_output = [out if 'inserted' in out.lower() or 'deleted' in out.lower() else 'Deleted.[' + out + ']' for
583
- out
584
- in output]
585
- results = pd.DataFrame(columns=output)
586
-
587
- # re-starting indexes
588
- data.reset_index(drop=True, inplace=True)
589
-
590
- # Mapping boolean columns
591
- if bool_cols is not None:
592
- for col in bool_cols:
593
- data[col] = data[col].astype(bool)
594
-
595
- # Mapping date type for SQL
596
- data = data[on_list + update_list]
597
- data = self.date_mapping_data_types(data)
598
-
599
- # create connection
600
- self.open_write_connection()
601
-
602
- try:
603
- # initialise cursor
604
- cursor = self.con.cursor()
605
-
606
- # Convert category columns to string
607
- cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
608
- data[cat_cols] = data[cat_cols].astype(str)
609
- # Deal with bull values and apostrophes (')
610
- data = data.replace("'NULL'", "NULL")
611
- data = data.replace("'", "~~", regex=True)
612
- data = data.fillna("null")
613
-
614
- records = [tuple(x) for x in data.values]
615
- temp_table = f'#Temp{schema}{table}'
616
-
617
- for batch in self._chunker(records, chunk):
618
- batch_records = [tuple(x) for x in batch]
619
- # initialisation of the sql statement
620
- insert_ = f'DROP TABLE IF EXISTS {temp_table} '
621
- insert_ += f"SELECT * INTO {temp_table} FROM ( VALUES "
622
- temp_columns = str(tuple(data.columns.values)).replace("(\'", "([").replace(
623
- '\', \'', '], [').replace('\')', '])')
624
- rows = str(batch_records).strip('[]').replace("~~", "''")
625
- rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
626
- sql_statement = insert_ + rows
627
- sql_statement = self.convert_decimal_str(sql_statement)
628
- sql_statement += f') AS TempTable {temp_columns}'
629
-
630
- col_update_set = ''
631
- for col in update_list:
632
- col_update_set += f' target.{col} = source.{col},'
633
- col_update_set = col_update_set[:-1]
634
-
635
- col_difference_check = ''
636
- for col in update_list:
637
- col_difference_check += f' target.{col} <> source.{col} OR'
638
- col_difference_check = col_difference_check[:-2]
639
-
640
- col_join_on = ''
641
- for col in on_list:
642
- col_join_on += f' source.{col} = target.{col} AND'
643
- col_join_on = col_join_on[:-3]
644
-
645
- sql_statement += f'UPDATE target SET {col_update_set} '
646
-
647
- if len(output) > 0:
648
- sql_statement += f" OUTPUT {','.join(sql_output)} "
649
-
650
- sql_statement += f'''FROM {schema}.{table} target
651
- JOIN {temp_table} as source
652
- ON {col_join_on}
653
- WHERE {col_difference_check}
654
- '''
655
-
656
- sql_statement += f' DROP TABLE IF EXISTS {temp_table} '
657
-
658
- if print_sql:
659
- print(sql_statement)
660
-
661
- # executing statement
662
- if len(sql_statement) > 0:
663
- if len(output) > 0:
664
- cursor.execute(sql_statement)
665
- cursor.nextset()
666
- results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
667
- else:
668
- cursor.execute(sql_statement)
669
-
670
- self.con.commit()
671
-
672
- if len(output) > 0:
673
- return results.reset_index(drop=True)
674
-
675
- except Exception:
676
- raise Exception(traceback.format_exc())
677
-
678
- finally:
679
- if cursor:
680
- cursor.close()
681
- self.close_connection()
682
-
683
def merge(self, data, staging_schema, staging_table, sp_schema, sp_name, truncate=False, chunk=1000):
    """ Merge data from a Staging table using a Stored Procedure. It requires a table in SQL which will store
    the Staging data. The method works as follows:
    1.- Truncate the staging table according to the truncate parameter
    2.- Insert the data into the staging table, in batches of `chunk` rows
    3.- Execute a stored procedure to merge the staging table with the destination table
    -----------------------------
    df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1, 2]})
    con_.merge(df, staging_schema, staging_table, sp_schema, sp_name, truncate=True)
    -----------------------------
    :param data: DataFrame to insert in the staging table
    :param staging_schema: Staging table schema
    :param staging_table: Staging table name
    :param sp_schema: Stored Procedure schema
    :param sp_name: Stored Procedure name
    :param truncate: Indicate whether the staging table has to be truncated or not
    :param chunk: Indicate how many rows will be uploaded at once
    :return: None
    :raises ValueError: if data is None
    :raises Exception: any failure while talking to the database, re-raised with the formatted traceback
    """
    if data is None:
        # Raise the error (it used to be returned, which callers could silently ignore)
        raise ValueError("The data provided is invalid!")

    cursor = None
    self.open_write_connection()
    try:
        cursor = self.con.cursor()

        # Truncate Staging table if needed
        if truncate:
            cursor.execute("""TRUNCATE TABLE [%s].[%s]""" % (staging_schema, staging_table))
            self.con.commit()

        # Convert category columns to string. Work on a copy so the caller's DataFrame keeps its dtypes.
        cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
        if cat_cols:
            data = data.copy()
            data[cat_cols] = data[cat_cols].astype(str)

        # Deal with null values and apostrophes ('): apostrophes are swapped for a "~~" placeholder so
        # that the tuple repr below always quotes strings with single quotes; the placeholder is turned
        # into an escaped apostrophe ('') once the rows have been rendered.
        data = data.replace("'NULL'", "NULL")
        data = data.replace("'", "~~", regex=True)
        data = data.fillna("null")

        # Insert in Staging Table
        # NOTE(review): the statement is built by string concatenation; values are only protected by the
        # apostrophe escaping above.
        records = [tuple(row) for row in data.values]
        insert_ = """INSERT INTO [%s].[%s] """ % (staging_schema, staging_table)
        insert_ = insert_ + str(tuple(data.columns.values)).replace("\'", "") + """ VALUES """
        for batch in self._chunker(records, chunk):
            rows = str(batch).strip('[]').replace("~~", "''")
            # Unquote the null markers so SQL receives real NULLs instead of the strings 'NULL'/'null'
            rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
            insert_rows = self.convert_decimal_str(insert_ + rows)
            cursor.execute(insert_rows)
            self.con.commit()

        # Execute Stored Procedure
        cursor.execute("""EXECUTE [%s].[%s]""" % (sp_schema, sp_name))
        self.con.commit()
    except Exception:
        raise Exception(traceback.format_exc())
    finally:
        # Always release the cursor and the connection, even when a statement failed
        if cursor:
            cursor.close()
        self.close_connection()
742
-
743
def merge_into(self, data, schema, table, on_list, update_check=False, update_set=None, bool_cols=None,
               identity=False, print_sql=False, nullable=False):
    """
    This method is equivalent to the 'merge into' of T-sql. Schema and table define the Target, while data is the
    Source. The target table is read in full, compared with data in pandas, and the differences are written
    back through self.update / self.insert.
    Aspects to take into consideration:
        1.- This method will not work properly if data contains duplicates (a TypeError is raised). It is not
            relevant if the target contains duplicates because DISTINCT is used to call the table.
        2.- When having booleans in the dataset you have to pay attention because pandas get bool from sql
            server as [True, False], instead of [0, 1]. Name those columns in bool_cols so both sides are
            compared as booleans.
        3.- Date columns of data are converted to strings by date_mapping_data_types before comparing.
    -------------------------
    MERGE INTO [SCHEMA].[TABLE] AS TARGET
    USING (
            data
            ) AS SOURCE
            ON TARGET.on_list[0] = SOURCE.on_list[0]
            AND TARGET.on_list[1] = SOURCE.on_list[1]
            ...
            AND TARGET.on_list[n] = SOURCE.on_list[n]
            WHEN MATCHED AND (
            TARGET.update_check[0] <> SOURCE.update_check[0]
            OR TARGET.update_check[1] <> SOURCE.update_check[1]
            ...
            OR TARGET.update_check[n] <> SOURCE.update_check[n]
            )
            UPDATE SET TARGET.update_check[0] = SOURCE.update_check[0],
            ...
            TARGET.update_check[n] = SOURCE.update_check[n],
            TARGET.update_set[0] = SOURCE.update_set[0],
            TARGET.update_set[1] = SOURCE.update_set[1],
            ....
            TARGET.update_set[n] = SOURCE.update_set[n]
            WHEN NOT MATCHED BY TARGET THEN
            INSERT
            (
            all columns from [SCHEMA].[TABLE]
            )
            VALUES
            (all columns from data)
    -------------------------------
    :param data: DataFrame containing the data to upload/update
    :param schema: Schema of the table in which the data will be uploaded
    :param table: Table in which the data will be uploaded
    :param on_list: list of columns to apply the on clause
    :param update_check: list of columns to do the check
    :param update_set: list of columns to update (requires update_check)
    :param bool_cols: list of columns gathering boolean types
    :param identity: Indicate whether the identity columns will be inserted or not, only makes sense when the
    table has one in its definition. It is a boolean.
    :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
    :param nullable: Forwarded to boolean_mapping_data_types to indicate which boolean column values to convert
    :return: None
    :raises ValueError: if data is None, or update_set is given without update_check
    :raises TypeError: if data contains duplicated rows
    :raises Exception: any failure while reading/writing the table, re-raised with the formatted traceback
    """
    # The validation errors below used to be *returned*, so callers never noticed them.
    if data is None:
        # no data to upload
        raise ValueError("The data provided is invalid!")

    if data.shape[0] == 0:
        # Nothing to update or insert. The type-mapping helpers also read the first row of data,
        # so an empty frame must not reach them.
        return None

    if data.shape[0] != data.drop_duplicates().shape[0]:
        raise TypeError("There are duplicates values in your dataframe, it will not work properly on "
                        "pd.concat().drop_duplicates()")

    # if update_set has values assigned, update check has to have values assigned.
    # update_check defaults to False (not None), so any falsy value counts as "not assigned".
    if update_set is not None:
        if not update_check:
            raise ValueError("Please, to use update_set assigned values to update_check")
    else:
        update_set = update_check

    # Mapping boolean columns
    if bool_cols is not None:
        for col in bool_cols:
            data[col] = data[col].astype(bool)

    # Mapping date and boolean type for SQL
    data = self.date_mapping_data_types(data)
    data = self.boolean_mapping_data_types(data, nullable)

    try:
        # call the table from the server
        data_table = self.query("""SELECT DISTINCT * FROM [%s].[%s]""" % (schema, table))

        if data_table.shape[0] == 0:
            print("The destination table is empty so all the data will be inserted")
            # Forward identity/print_sql so this insert behaves like the "not matched" insert below
            self.insert(data, schema, table, identity=identity, print_sql=print_sql)
            return None

        # Cast the target columns to the python type found in the source so that the pandas merges
        # below compare like with like. Integer columns holding nulls cannot stay integer: use float.
        for data_col in data.columns:
            target_col = data_table[data_col]
            if "int" in str(type(target_col.iloc[0])) and target_col.isnull().sum() > 0:
                data_table[data_col] = target_col.astype(float)
            else:
                data_table[data_col] = target_col.astype(type(data[data_col].iloc[0]))

        # Booleans read from the server become 1/0 (np.nan for anything else)
        for col in data_table.columns.values.tolist():
            if isinstance(data_table.loc[0, col], bool):
                data_table[col] = data_table[col].apply(
                    lambda x: 1 if x is True else 0 if x is False else np.nan)
        if bool_cols is not None:
            for col in bool_cols:
                data_table[col] = data_table[col].astype(bool)

        # join the input table with the one in the database: rows of data whose key already exists
        coincidence = data.merge(data_table[on_list], how='inner', on=on_list)

        # WHEN MATCHED AND ... UPDATE SET
        if update_check:
            check_cols = list(set(on_list + update_check))
            # rows whose key AND checked columns are identical in the target, i.e. nothing to update
            unchanged = coincidence.merge(data_table[check_cols], how='inner', on=check_cols)
            # keep=False drops every row present in both frames, leaving only the changed rows
            data_update = pd.concat([coincidence, unchanged], ignore_index=True)
            data_update.drop_duplicates(keep=False, inplace=True)
            if data_update.shape[0] > 0:
                self.update(data_update, list(set(update_set + update_check)), on_list, schema, table,
                            print_sql=print_sql)

        # WHEN NOT MATCHED BY TARGET THEN... INSERT
        # same trick: rows of data that also appear in coincidence cancel out, the rest are new
        data_insert = pd.concat([data, coincidence], ignore_index=True)
        data_insert.drop_duplicates(keep=False, inplace=True)
        if data_insert.shape[0] > 0:
            self.insert(data_insert, schema, table, identity=identity, print_sql=print_sql)

    except Exception:
        raise Exception(traceback.format_exc())
873
-
874
- @staticmethod
875
- def date_mapping_data_types(data):
876
- """
877
- Map datetime and boolean variables so they can be inserted in SQL
878
- :param data: DataFrame containing the variables to map
879
- :return: The mapped DataFrame
880
- """
881
- first_index = data.index[0]
882
- date_col = data.columns[
883
- [('date' in str(type(data.loc[first_index, col]))) | ('timestamp' in str(type(data.loc[first_index, col])))
884
- for col in data.columns]]
885
- if len(date_col) > 0:
886
- for col in date_col:
887
- data[col] = pd.to_datetime(data[col])
888
- if data[col].dtypes == 'O':
889
- data[col] = data[col].dt.strftime('%Y-%m-%d')
890
- else:
891
- data[col] = data[col].dt.strftime('%Y-%m-%d %H:%M:%S')
892
- data.loc[data[col] == 'NaT', col] = np.nan
893
-
894
- return data
895
-
896
- @staticmethod
897
- def boolean_mapping_data_types(data, nullable=False):
898
- """
899
- Map datetime and boolean variables so they can be inserted in SQL
900
- :param data: DataFrame containing the variables to map
901
- :return: The mapped DataFrame
902
- :param nullable: Determine if you want to convert null values within boolean columns to boolean format or not
903
- """
904
- first_index = data.index[0]
905
- bool_col = data.columns[
906
- [('bool' in str(type(data.loc[first_index, col]))) | ('object' in str(type(data.loc[first_index, col]))) for
907
- col in data.columns]]
908
- if len(bool_col) > 0:
909
- for col in bool_col:
910
- if nullable:
911
- bool_not_null = data[data[col].notna()]
912
- if bool_not_null.shape[0] > 0:
913
- for iindex in bool_not_null.index:
914
- data.at[iindex, col] = int(data.loc[iindex, col])
915
- else:
916
- data[col] = data[col].apply(lambda x: 1 if x is True else 0)
917
-
918
- return data
919
-
920
- @staticmethod
921
- def id_next(con_db, table, schema, id_col, print_sql=False):
922
- """
923
- This static method returns the next id to be inserted into a table for sql_server
924
- :param con_db: class to connect to a sql server dabatase
925
- :param table: name of the table
926
- :param schema: name of the schema
927
- :param id_col: name of the id column
928
- :param print_sql: bool to indicate if you want sql statement to be print on Python Console
929
- :return: Max ID + 1 for id_col
930
- """
931
- sql_statement = ("SELECT CASE WHEN MAX(%s) IS NULL THEN 1 ELSE MAX(%s) + 1 END AS [Id] FROM [%s].[%s]" % (
932
- id_col, id_col, schema, table))
933
- if print_sql:
934
- print(sql_statement)
935
- df = con_db.query(sql_statement)
936
- next_id = df.loc[0, 'Id']
937
-
938
- return next_id
939
-
940
- @staticmethod
941
- def convert_decimal_str(string):
942
- """ Method to parse the Decimal type in python
943
- :param string: String variable to parse
944
- """
945
- string = re.sub("'\)(?!(,[ ]+\())(?=([^$]))", "", string)
946
- return re.sub("Decimal\('", "", string)