berryworld 1.0.0.189823__py3-none-any.whl → 1.0.0.192676__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- berryworld/__init__.py +2 -1
- berryworld/sql_connenction.py +1016 -0
- {berryworld-1.0.0.189823.dist-info → berryworld-1.0.0.192676.dist-info}/METADATA +1 -1
- {berryworld-1.0.0.189823.dist-info → berryworld-1.0.0.192676.dist-info}/RECORD +7 -6
- {berryworld-1.0.0.189823.dist-info → berryworld-1.0.0.192676.dist-info}/WHEEL +1 -1
- {berryworld-1.0.0.189823.dist-info → berryworld-1.0.0.192676.dist-info}/licenses/LICENSE +0 -0
- {berryworld-1.0.0.189823.dist-info → berryworld-1.0.0.192676.dist-info}/top_level.txt +0 -0
berryworld/__init__.py
CHANGED
|
@@ -23,4 +23,5 @@ from .vivantio import Vivantio
|
|
|
23
23
|
from .teams_logging import TeamsLogging
|
|
24
24
|
from .vivantio_logging import VivantioLogging
|
|
25
25
|
from .snowflake_conn import SnowflakeConn
|
|
26
|
-
from .logging import PythonLogs
|
|
26
|
+
from .logging import PythonLogs
|
|
27
|
+
from .sql_connenction import SQLConnection
|
|
@@ -0,0 +1,1016 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import math
|
|
4
|
+
import pyodbc
|
|
5
|
+
import traceback
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import sqlalchemy as sa
|
|
9
|
+
from urllib import parse
|
|
10
|
+
from numbers import Number
|
|
11
|
+
|
|
12
|
+
from lief import exception
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SQLConnection:
|
|
16
|
+
""" Connect to Microsoft SQL """
|
|
17
|
+
|
|
18
|
+
def __init__(self, db_reference, server, master=False, trusted_certificate=True, azure=False, encrypt=True,
             multi_db=False):
    """ Initialize the class and probe for a working ODBC driver.

    Credentials are read from environment variables (see ``credentials``):
        SQL-DBREFERENCE-DBNAME   = 'db name'
        SQL_DBREFERENCE_USERNAME = 'user'
        SQL_DBREFERENCE_PASSWORD = 'password'
    -----------------------------
    db_reference = 'FruitFlow'
    server = 'prod'
    con_ = SQLConnection(db_reference, server)
    -----------------------------
    :param db_reference: Database reference to connect to
    :param server: Server to connect to
    :param master: Indicate whether the connection will be done to master or to a specific database
    :param trusted_certificate: Indicate whether the connection will be done using the TrustServerCertificate
    :param azure: Indicate whether the connection will be done to an Azure SQL database or to an on-premise SQL
    :param encrypt: Indicate whether the connection will use SSL/TLS encryption
    :param multi_db: Indicate whether the connection will be done to a specific database or to multiple databases
    :raises ValueError: If server is None, or if no installed ODBC driver can open a connection
    """
    self.db_reference = db_reference
    self.server = server
    if self.server is None:
        raise ValueError("Please provide a value for server parameter")

    self.multi_db = multi_db
    self.master = master
    # Connection-string fragments appended verbatim to the SQLAlchemy URL.
    self.trusted_certificate = '&TrustServerCertificate=yes' if trusted_certificate else ''
    self.encrypt = '&Encrypt=yes' if encrypt else ''

    # Only consider versioned drivers (e.g. "ODBC Driver 17 for SQL Server");
    # driver names without a digit are legacy aliases we do not want.
    drivers = [driver for driver in pyodbc.drivers() if bool(re.search(r'\d', driver))]
    self.azure = azure
    self.creds = {}
    try:
        self.server_name, self.db_name, self.user_name, self.password = self.credentials()
    except Exception as e:
        # Keep the historical best-effort behaviour: report and carry on so the
        # driver probe below produces the final, clearer error.
        print(f'Cannot find a reference to {self.db_reference} - Error: {str(e)}')

    self.con = None
    self.engine = None
    self.con_string = None

    # Probe each candidate driver with a trivial query until one works.
    driver_attempt = ''
    for driver in drivers:
        try:
            self.driver = driver
            self.query('''SELECT TOP 1 * FROM information_schema.tables;''')
            # BUG FIX: previously a failure of an *earlier* driver left
            # driver_attempt set even when a later driver succeeded, which
            # raised a spurious ValueError below. Clear it on success.
            driver_attempt = ''
            break
        except Exception as e:
            print(e)
            driver_attempt = str(e)

    if driver_attempt != '':
        raise ValueError(
            f"Cannot connect to db: {self.db_name} - Error: {str(driver_attempt)}")
|
+
def credentials(self):
    """ Resolve the connection credentials from environment variables.

    Lookup keys:
        Server:   SQL-<SERVER> (azure) or SQL-ONPREM-<SERVER> (on-premise)
        Database: SQL-<REF>-DBNAME-<SERVER>, falling back to SQL-<REF>-DBNAME
        User:     SQL_<REF>_USERNAME
        Password: SQL_<REF>_PASSWORD

    NOTE(review): the server/database keys use hyphens while the user and
    password keys use underscores; this mirrors the deployed configuration
    and is deliberately left unchanged.

    :return: Tuple (server_name, db_name, user_name, password); runs of
        backslashes in the server name are collapsed to a single one.
    :raises ValueError: If the server environment variable is not defined
    """
    if self.azure:
        server_key = f"SQL-{self.server.upper()}"
    else:
        server_key = f"SQL-ONPREM-{self.server.upper()}"
    server_name = os.environ.get(server_key)
    if server_name is None:
        # Fail with a clear message instead of the opaque TypeError that
        # re.sub() would raise on a None value further down.
        raise ValueError(f"Environment variable '{server_key}' is not defined")

    reference = self.db_reference.upper()
    # Prefer a server-specific database name, fall back to the generic one.
    db_name = os.environ.get(f"SQL-{reference}-DBNAME-{self.server.upper()}")
    if db_name is None:
        db_name = os.environ.get(f"SQL-{reference}-DBNAME")

    user_name = os.environ.get(f"SQL_{reference}_USERNAME")
    password = os.environ.get(f"SQL_{reference}_PASSWORD")

    # Collapse repeated backslashes, e.g. "host\\\\instance" -> "host\\instance".
    return re.sub(r'(\\)\1*', r'\1', server_name), db_name, user_name, password
|
|
104
|
+
def open_read_connection(self, commit_as_transaction=True):
    """ Open a reading connection with the Server.

    Builds a ``mssql+pyodbc`` SQLAlchemy URL (password substituted later via
    a %s placeholder so it can be URL-quoted) and stores the raw DBAPI
    connection on ``self.con``.

    :param commit_as_transaction: When False, the engine runs in AUTOCOMMIT mode
    :return: None (the connection is stored on ``self.con``)
    """
    target_db = self.db_name
    if self.multi_db:
        # Multi-database references carry a "primary;" prefix — strip it.
        target_db = str(self.db_name).lower().replace('primary;', '')

    if self.master:
        self.con_string = (f'mssql+pyodbc://{self.user_name}:%s@{self.server}/master'
                           f'?driver={self.driver}&trusted_connection=yes'
                           f'{self.trusted_certificate}{self.encrypt}')
    else:
        self.con_string = (f'mssql+pyodbc://{self.user_name}:%s@{self.server}/{target_db}'
                           f'?driver={self.driver}{self.trusted_certificate}{self.encrypt}')

    self.engine = sa.create_engine(self.con_string % parse.quote_plus(self.password))
    if not commit_as_transaction:
        self.engine = self.engine.execution_options(isolation_level="AUTOCOMMIT")
    self.con = self.engine.connect().connection
|
|
126
|
+
def open_write_connection(self, commit_as_transaction=True):
    """ Open a writing connection with the Server.

    :param commit_as_transaction: When False, the engine runs in AUTOCOMMIT mode
    :return: None (the connection is stored on ``self.con``)
    """
    # The %s placeholder is filled with the URL-quoted password below.
    write_url = (f'mssql+pyodbc://{self.user_name}:%s@{self.server}/{self.db_name}'
                 f'?driver={self.driver}{self.trusted_certificate}{self.encrypt}')
    self.engine = sa.create_engine(write_url % parse.quote_plus(self.password))
    if not commit_as_transaction:
        self.engine = self.engine.execution_options(isolation_level="AUTOCOMMIT")

    self.con = self.engine.connect().connection
+
|
|
139
|
+
def close_connection(self):
    """ Close any opened connections with the Server.

    BUG FIX: the original called ``self.con.close()`` unconditionally, which
    raised AttributeError when no connection had ever been opened
    (``self.con`` is initialised to None). Both handles are now guarded.

    :return: None
    """
    if self.con:
        self.con.close()
    if self.engine:
        self.engine.dispose()
|
|
147
|
+
@staticmethod
def _chunker(seq, size):
    """ Lazily split a sequence into consecutive slices of ``size`` elements.

    The final slice may be shorter when ``len(seq)`` is not a multiple of
    ``size``. An empty sequence yields nothing.

    :param seq: Sequence of records to be split
    :param size: Maximum number of elements per chunk
    :return: Generator over slices of ``seq``
    """
    start = 0
    total = len(seq)
    while start < total:
        yield seq[start:start + size]
        start += size
+
|
|
156
|
+
def query(self, sql_query, coerce_float=False):
    """ Read data from SQL according to ``sql_query``.

    -----------------------------
    query_str = "SELECT * FROM %s" & table
    con_.query(query_str)
    -----------------------------
    :param sql_query: Query to be sent to SQL
    :param coerce_float: Attempt to convert values of non-string, non-numeric
        objects (like decimal.Decimal) to floating point.
    :return: DataFrame with the requested data, or None when a ValueError
        occurred while reading (the traceback is printed).
    """
    self.open_read_connection()
    frame = None
    try:
        with self.engine.begin() as connection:
            frame = pd.read_sql_query(sa.text(sql_query), connection, coerce_float=coerce_float)
    except ValueError:
        # Historical behaviour: log the problem and fall through to return None.
        print(traceback.format_exc())
    finally:
        # Always release the engine/connection opened above.
        self.close_connection()
    return frame
|
|
178
|
+
@staticmethod
def _parse_df(parse_, data, col_names):
    """ Convert a cursor result set into a DataFrame, or return it raw.

    :param parse_: When True, build a DataFrame; otherwise return the raw pair
    :param data: List of row tuples retrieved from SQL (cursor.fetchall())
    :param col_names: cursor.description-style list; first element of each
        entry is the column name
    :return: DataFrame when ``parse_`` is True, else ``[col_names, data]``
    """
    if parse_ is not True:
        return [col_names, data]
    # cursor.description entries are tuples; the first field is the name.
    headers = list(zip(*list(col_names)))[0]
    # Transpose rows into columns, then transpose back to get named columns.
    return pd.DataFrame(list(zip(*data)), index=headers).T
|
|
193
|
+
def sp_results(self, sql_query, resp_number=None, parse_=True, commit_as_transaction=True, no_count=True):
    """ Execute a stored procedure and retrieve all its output data.

    -----------------------------
    query_str = "EXECUTE %s" & stored_procedure
    con_.sp_results(query_str, resp_number=1)
    -----------------------------
    :param sql_query: Query to be sent to SQL
    :param resp_number: Indicate which of the stored procedures responses will be retrieved
        (1-based); when None, every result set is collected.
    :param parse_: Indicate whether the output needs to be converted to a DataFrame or not
    :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
    :param no_count: Indicate whether SET NOCOUNT option is ON (True) or OFF (False)
    :return: List of result sets (DataFrames when ``parse_`` is True)
    """
    self.open_read_connection(commit_as_transaction)
    data_list = list()
    cursor = None
    try:
        cursor = self.con.cursor()
        # SET NOCOUNT ON suppresses the "N rows affected" messages, which
        # would otherwise appear as extra (empty) result sets.
        if no_count:
            cursor.execute("SET NOCOUNT ON;" + sql_query)
        else:
            cursor.execute(sql_query)
        if resp_number is not None:
            # Skip forward to the requested result set (resp_number is 1-based).
            for cursor_number in range(resp_number - 1):
                cursor.nextset()
            try:
                data_list.append(self._parse_df(parse_, cursor.fetchall(), cursor.description))
            except ValueError:
                raise ValueError('Please indicate a valid resp_number')
        else:
            # Walk every result set. cursor.nextset() returns False when
            # there are no more sets; the count guard caps runaway loops.
            aux_cursor = True
            count = 0
            while aux_cursor is not False and count < 100:
                try:
                    data_list.append(self._parse_df(parse_, cursor.fetchall(), cursor.description))
                    aux_cursor = cursor.nextset()
                except Exception as e:
                    # A set without rows (or a provider error) is reported and
                    # skipped; the loop continues with the next set.
                    print(e)
                    cursor.nextset()
                finally:
                    count += 1
            if count >= 100:
                raise RuntimeError("Method sp_results has loop over 100 times for database '%s' on server '%s'"
                                   % (self.db_name, self.server))
        # Commit in case the procedure performed writes.
        self.con.commit()
    except ValueError:
        # NOTE(review): only ValueError is swallowed here; any other failure
        # propagates to the caller after the finally block runs.
        print(traceback.format_exc())
    finally:
        if cursor:
            cursor.close()
        self.close_connection()
    return data_list
|
|
246
|
+
def run_statement(self, sql_statement, commit_as_transaction=True):
    """ Execute a SQL statement against the write connection.

    -----------------------------
    query_str = "DELETE FROM %s WHERE Id > 100" & table
    con_.run_statement(query_str)
    -----------------------------
    :param sql_statement: Statement as string to be run in SQL
    :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
    :return: None
    :raises Exception: Re-raised with the full traceback text on any failure
    """
    self.open_write_connection(commit_as_transaction)
    cur = self.con.cursor()
    try:
        # Run the statement and make the change permanent.
        cur.execute(sql_statement)
        self.con.commit()
    except Exception:
        raise Exception(traceback.format_exc())
    finally:
        # Release the cursor and the underlying connection in all cases.
        if cur:
            cur.close()
        self.close_connection()
|
|
269
|
+
def insert(self, data, schema, table, truncate=False, delete=False, identity=False, chunk=1000, print_sql=False,
           commit_all_together=False, output=None, bools2bits=True, nullable=False, commit_as_transaction=True,
           infer_datetime_format=None):
    """ Insert data in a table in SQL, truncating the table if needed.

    -----------------------------
    df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1, 2]})
    con_.insert(df, table_schema, table_name)
    -----------------------------
    :param data: DataFrame containing the data to upload
    :param schema: Schema of the table in which the data will be uploaded
    :param table: Table in which the data will be uploaded
    :param truncate: Indicate whether the table has to be truncated before the data is sent or not
    :param delete: Delete the rows from a table (Suitable for tables that cannot be truncated because of
    external constraints)
    :param identity: Indicate whether the identity columns will be inserted or not
    :param chunk: Indicate how many rows will be uploaded at once
    :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
    :param commit_all_together: when it is true, it only commits data if all data has been inserted. When it is
    false, it commits data by chunks.
    :param output: Outputs the columns indicated in this list
    :param bools2bits: Indicate whether the Boolean columns should be converted to BIT to be inserted into SQL
    :param nullable: Used within bools2bits function to indicate which boolean column values to convert
    :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
    :param infer_datetime_format: Indicate whether the datetime columns should be converted to string and if so,
    then the format to be used
    :return: A DataFrame with the output columns requested if output is not None, else None
    """
    if output is None:
        output = []
    if data is None:
        # No data to upload. NOTE(review): the error is *returned*, not
        # raised — kept as-is because callers may rely on this behaviour.
        return ValueError("The data provided is invalid!")
    cursor = None
    self.open_write_connection(commit_as_transaction)
    results = pd.DataFrame(columns=output)

    # Mapping the date datatype columns for SQL
    data = self.date_mapping_data_types(data)

    # Infer datetime format if provided
    if infer_datetime_format is not None:
        data = self.infer_datetime(data, infer_datetime_format)

    # Mapping the boolean columns to bit
    if bools2bits:
        data = self.boolean_mapping_data_types(data, nullable)

    try:
        cursor = self.con.cursor()
        # Truncate table if needed
        if truncate:
            cursor.execute("TRUNCATE TABLE [%s].[%s]" % (schema, table))
        # Delete all records from the table if needed
        if delete:
            cursor.execute("DELETE FROM [%s].[%s]" % (schema, table))
        # Allow inserting into an Identity column
        if identity:
            cursor.execute("SET IDENTITY_INSERT [%s].[%s] ON" % (schema, table))
        # Convert category columns to string
        cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
        data[cat_cols] = data[cat_cols].astype(str)
        # Deal with null values and apostrophes (') — apostrophes are escaped
        # as ~~ here and restored as doubled quotes when the rows are rendered.
        data = data.replace("'NULL'", "NULL")
        data = data.replace("'", "~~", regex=True)
        data = data.fillna("null")
        # Build the INSERT prefix: "INSERT INTO [schema].[table] ([c1], [c2]...)"
        # NOTE(review): the tuple-to-column rewrite assumes at least two
        # columns; a single-column frame produces "('c',)" which this chain
        # does not rewrite correctly — TODO confirm with callers.
        records = [tuple(x) for x in data.values]
        insert_ = """INSERT INTO [%s].[%s] """ % (schema, table)
        insert_ += str(tuple(data.columns.values)).replace("(\'", "([").replace('\', \'', '], [').replace('\')',
                                                                                                          '])')
        if len(output) > 0:
            insert_ += " OUTPUT Inserted.[" + "], Inserted.[".join(output) + "] "
        insert_ += """ VALUES """

        for batch in self._chunker(records, chunk):
            rows = str(batch).strip('[]').replace("~~", "''")
            rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
            string = insert_ + rows
            string = self.convert_decimal_str(string)
            if print_sql:
                print(string)
            cursor.execute(string)
            if len(output) > 0:
                results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
            # BUG FIX: the original used "if ~commit_all_together:", a bitwise
            # NOT on a bool (~True == -2, ~False == -1 — both truthy), so it
            # committed after every chunk regardless of the flag.
            if not commit_all_together:
                self.con.commit()
        if commit_all_together:
            self.con.commit()

        # Restore the identity-insert restriction
        if identity:
            cursor.execute("SET IDENTITY_INSERT [%s].[%s] OFF" % (schema, table))

        if len(output) > 0:
            return results.reset_index(drop=True)

    except Exception:
        raise Exception(traceback.format_exc())

    finally:
        if cursor:
            cursor.close()
        self.close_connection()
|
|
373
|
+
def insert_at_once(self, data, schema, table, truncate=False, delete=False, identity=False, chunk=1,
                   print_sql=False, output=None, bools2bits=True, nullable=False, commit_as_transaction=True):
    """ Build all the insert statements and commit them all at once.

    Unlike ``insert``, every chunk becomes its own INSERT statement; the
    statements are joined with ';' and executed in a single call, then a
    single commit is issued.
    -----------------------------
    df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1, 2]})
    con_.insert(df, table_schema, table_name)
    -----------------------------
    :param data: DataFrame containing the data to upload
    :param schema: Schema of the table in which the data will be uploaded
    :param table: Table in which the data will be uploaded
    :param truncate: Indicate whether the table has to be truncated before the data is sent or not
    :param delete: Delete the rows from a table (Suitable for tables that cannot be truncated because of
    external constraints)
    :param identity: Indicate whether the identity columns will be inserted or not
    :param chunk: Indicate how many rows will be uploaded at once (default 1: one statement per row)
    :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
    :param output: Outputs the columns indicated in this list
    :param bools2bits: Indicate whether the Boolean columns should be converted to BIT to be inserted into SQL
    :param nullable: Used within bools2bits function to indicate which boolean column values to convert
    :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
    :return: A DataFrame with the output columns requested if output is not None, else None
    """
    if output is None:
        output = []
    if data is None:
        # No data to upload; the error is returned (not raised) by design.
        return ValueError("The data provided is invalid!")
    cursor = None
    self.open_write_connection(commit_as_transaction)
    results = pd.DataFrame(columns=output)

    # Mapping the date datatype columns for SQL
    data = self.date_mapping_data_types(data)

    # Mapping the boolean columns to bit
    if bools2bits:
        data = self.boolean_mapping_data_types(data, nullable)

    try:
        cursor = self.con.cursor()
        # Truncate table if needed
        if truncate:
            cursor.execute("TRUNCATE TABLE [%s].[%s]" % (schema, table))
        # Delete all records from the table if needed
        if delete:
            cursor.execute("DELETE FROM [%s].[%s]" % (schema, table))
        # Allow to insert to an Identity column
        if identity:
            cursor.execute("SET IDENTITY_INSERT [%s].[%s] ON" % (schema, table))
        # Convert category columns to string
        cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
        data[cat_cols] = data[cat_cols].astype(str)
        # Deal with null values and apostrophes (') — apostrophes become ~~
        # and are restored as doubled quotes when rows are rendered below.
        data = data.replace("'NULL'", "NULL")
        data = data.replace("'", "~~", regex=True)
        data = data.fillna("null")
        # Build the shared INSERT prefix used by every statement.
        records = [tuple(x) for x in data.values]
        insert_ = """INSERT INTO [%s].[%s] """ % (schema, table)
        insert_ += str(tuple(data.columns.values)).replace("(\'", "([").replace('\', \'', '], [').replace('\')',
                                                                                                          '])')
        if len(output) > 0:
            insert_ += " OUTPUT Inserted.[" + "], Inserted.[".join(output) + "] "
        insert_ += """ VALUES """

        # Render one statement per chunk; all are executed in a single call.
        insert_statements = list()
        for batch in self._chunker(records, chunk):
            rows = str(batch).strip('[]').replace("~~", "''")
            rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
            string = insert_ + rows
            string = self.convert_decimal_str(string)
            insert_statements.append(string)

        if print_sql:
            print(';'.join(insert_statements))
        cursor.execute(';'.join(insert_statements))
        if len(output) > 0:
            # NOTE(review): only the first statement's OUTPUT result set is
            # fetched here; subsequent sets would need cursor.nextset() —
            # TODO confirm whether multi-statement OUTPUT is ever used.
            results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
        self.con.commit()

        # Restrict to insert to an Identity column
        if identity:
            cursor.execute("SET IDENTITY_INSERT [%s].[%s] OFF" % (schema, table))

        if len(output) > 0:
            return results.reset_index(drop=True)

    except Exception:
        raise Exception(traceback.format_exc())

    finally:
        if cursor:
            cursor.close()
        self.close_connection()
|
|
468
|
+
def update(self, data, update_list, on_list, schema, table, bool_cols=None, print_sql=False, batch_size=100,
           output=None, nullable=True, commit_as_transaction=True):
    """ Update a table in batches in SQL Server.

    -----------------------------
    UPDATE [SCHEMA].[TABLE]
    SET update_list[0] = data[index, update_list[0]],
        update_list[1] = data[index, update_list[1]]
    OUTPUT output[0], output[1]
    WHERE on_list[0] = data[index, on_list[0]]
    AND on_list[1] = data[index, on_list[1]]
    -----------------------------
    :param data: DataFrame containing the data to update
    :param update_list: list of columns to update
    :param on_list: list of columns to apply the on clause
    :param schema: Schema of the table in which the data will be uploaded
    :param table: Table in which the data will be uploaded
    :param bool_cols: list of columns gathering boolean types
    :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
    :param batch_size: Number of records to update in each iteration
    :param output: Outputs the columns indicated in this list as a DataFrame. It should indicate if the column to
    retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one will be
    retrieved)
    :param nullable: Indicate whether to update the table column with null or exclude the reference from the update
    :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
    :return: A DataFrame with the output columns requested if output is not None, else None
    """
    cursor = None
    if data is None:
        # No data to update; the error is returned (not raised) by design.
        return ValueError("The data provided is invalid!")

    if output is None:
        output = []
    else:
        # Default to the Deleted (pre-update) value unless the caller already
        # qualified the column with Inserted./Deleted.
        output = [out if 'inserted' in out.lower() or 'deleted' in out.lower() else 'Deleted.[' + out + ']' for out
                  in output]
    results = pd.DataFrame(columns=output)

    # Re-start indexes so batch slicing by position is consistent.
    data.reset_index(drop=True, inplace=True)

    # Mapping boolean columns
    if bool_cols is not None:
        for col in bool_cols:
            data[col] = data[col].astype(bool)

    # Mapping date type for SQL
    data = self.date_mapping_data_types(data)

    # Create connection
    self.open_write_connection(commit_as_transaction)

    try:
        cursor = self.con.cursor()

        # Extraction of the useful columns only.
        data_update = data[list(set(update_list + on_list))]

        # One UPDATE statement is built per row; statements are accumulated
        # per batch and executed together.
        sql_start = ''' UPDATE [%s].[%s] SET ''' % (schema, table)
        iter_batch = math.ceil(data_update.shape[0] / batch_size)
        for batch in range(iter_batch):
            batch_update = data_update.iloc[batch * batch_size: (batch + 1) * batch_size]

            sql_statement = ''
            for iindex in batch_update.index:
                # UPDATE [SCHEMA].[TABLE]
                sql_statement += sql_start

                # SET clauses — each ends with ',' which is trimmed later.
                # NOTE(review): string values are quoted but apostrophes are
                # not escaped here — TODO confirm inputs never contain "'".
                for col in update_list:
                    if nullable:
                        if pd.isna(batch_update.loc[iindex, col]):
                            sql_statement += " [%s] = NULL ," % col
                        elif isinstance(batch_update.loc[iindex, col], bool):
                            sql_statement += " [%s] = %s ," % (col, int(batch_update.loc[iindex, col]))
                        elif isinstance(batch_update.loc[iindex, col], Number):
                            sql_statement += " [%s] = %s ," % (col, batch_update.loc[iindex, col])
                        else:
                            sql_statement += " [%s] = '%s' ," % (col, batch_update.loc[iindex, col])
                    else:
                        if pd.notna(batch_update.loc[iindex, col]):
                            if str(batch_update.loc[iindex, col]).upper() == 'NULL':
                                continue
                            elif isinstance(batch_update.loc[iindex, col], bool):
                                sql_statement += " [%s] = %s ," % (col, int(batch_update.loc[iindex, col]))
                            elif isinstance(batch_update.loc[iindex, col], Number):
                                sql_statement += " [%s] = %s ," % (col, batch_update.loc[iindex, col])
                            else:
                                sql_statement += " [%s] = '%s' ," % (col, batch_update.loc[iindex, col])

                # OUTPUT — replaces the trailing SET comma.
                if len(output) > 0:
                    sql_statement = sql_statement[:-1] + " OUTPUT " + ",".join(output) + ' '

                # WHERE — every clause ends with "AND"; the final one is
                # trimmed by the [:-3] below.
                sql_statement = sql_statement[:-1] + ' WHERE '
                for col in on_list:
                    if pd.isna(batch_update.loc[iindex, col]):
                        # BUG FIX: "= NULL" never matches under ANSI_NULLS;
                        # T-SQL requires "IS NULL" in a WHERE clause.
                        sql_statement += " [%s] IS NULL AND" % col
                    elif isinstance(batch_update.loc[iindex, col], bool):
                        # BUG FIX: the original appended " ," here instead of
                        # " AND", producing malformed SQL and breaking the
                        # trailing-separator trim below.
                        sql_statement += " [%s] = %s AND" % (col, int(batch_update.loc[iindex, col]))
                    elif isinstance(batch_update.loc[iindex, col], Number):
                        sql_statement += " [%s] = %s AND" % (col, batch_update.loc[iindex, col])
                    else:
                        sql_statement += " [%s] = '%s' AND" % (col, batch_update.loc[iindex, col])

                # Drop the trailing "AND" and terminate the statement.
                sql_statement = sql_statement[:-3] + ';'

            if print_sql:
                print(sql_statement)

            # Execute the accumulated statements for this batch.
            if len(sql_statement) > 0:
                if len(output) > 0:
                    cursor.execute(sql_statement)
                    # One OUTPUT result set per statement in the batch.
                    for cursor_number in range(len(sql_statement.split(';')) - 1):
                        results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
                        cursor.nextset()
                else:
                    cursor.execute(sql_statement)
                self.con.commit()

        if len(output) > 0:
            return results.reset_index(drop=True)

    except Exception:
        raise Exception(traceback.format_exc())

    finally:
        if cursor:
            cursor.close()
        self.close_connection()
|
|
605
|
+
def bulk_update(self, data, update_list, on_list, schema, table, bool_cols=None, print_sql=False, output=None,
                chunk=1000, commit_as_transaction=True):
    """ Update a SQL Server table in batches via a session-local temp table.

    For each chunk of rows the method builds one statement that:
    1.- drops/creates a temp table #Temp{schema}{table} from a VALUES list,
    2.- runs UPDATE ... FROM target JOIN temp-table ON on_list columns,
        restricted to rows where any update_list column differs,
    3.- optionally OUTPUTs the requested columns (Deleted.* by default),
    4.- drops the temp table again.
    -----------------------------
    UPDATE [SCHEMA].[TABLE]
    SET update_list[0] = data[index, update_list[0]],
        update_list[1] = data[index, update_list[1]]
    OUTPUT output[0], output[1]
    WHERE on_list[0] = data[index, on_list[0]]
    AND on_list[1] = data[index, on_list[1]]
    -----------------------------
    :param data: DataFrame containing the data to update
    :param update_list: list of columns to update
    :param on_list: list of columns to apply the on clause
    :param schema: Schema of the table in which the data will be uploaded
    :param table: Table in which the data will be uploaded
    :param bool_cols: list of columns gathering boolean types
    :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
    :param output: Outputs the columns indicated in this list as a DataFrame. It should indicate if the column to
        retrieve is the inserted one or the deleted one (If nothing is indicated, then the Deleted one will be
        retrieved)
    :param chunk: Indicate how many rows will be uploaded at once
    :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
    :return: DataFrame of OUTPUT rows when `output` is given, otherwise None
    """
    cursor = None
    if data is None:
        # no data to update
        # NOTE(review): the error is *returned*, not raised — callers must check the result
        return ValueError("The data provided is invalid!")

    if output is None:
        output = []
        sql_output = []
    else:
        # Columns without an explicit Inserted./Deleted. prefix default to the Deleted (pre-update) value
        sql_output = [out if 'inserted' in out.lower() or 'deleted' in out.lower() else 'Deleted.[' + out + ']' for
                      out
                      in output]
        results = pd.DataFrame(columns=output)

    # re-starting indexes
    data.reset_index(drop=True, inplace=True)

    # Mapping boolean columns
    if bool_cols is not None:
        for col in bool_cols:
            data[col] = data[col].astype(bool)

    # Mapping date type for SQL (keep only the columns the statement needs)
    data = data[on_list + update_list]
    data = self.date_mapping_data_types(data)

    # create connection
    self.open_write_connection(commit_as_transaction)

    try:
        # initialise cursor
        cursor = self.con.cursor()

        # Convert category columns to string
        cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
        data[cat_cols] = data[cat_cols].astype(str)
        # Deal with null values and apostrophes (') — ~~ is a placeholder restored to '' later
        data = data.replace("'NULL'", "NULL")
        data = data.replace("'", "~~", regex=True)
        data = data.fillna("null")

        records = [tuple(x) for x in data.values]
        # Session-local temp table; # prefix keeps it private to this connection
        temp_table = f'#Temp{schema}{table}'

        for batch in self._chunker(records, chunk):
            batch_records = [tuple(x) for x in batch]
            # initialisation of the sql statement
            insert_ = f'DROP TABLE IF EXISTS {temp_table} '
            insert_ += f"SELECT * INTO {temp_table} FROM ( VALUES "
            # Render ('a', 'b') column tuple as ([a], [b]) bracketed identifiers
            temp_columns = str(tuple(data.columns.values)).replace("(\'", "([").replace(
                '\', \'', '], [').replace('\')', '])')
            rows = str(batch_records).strip('[]').replace("~~", "''")
            rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
            sql_statement = insert_ + rows
            # Strip python Decimal('x') wrappers left by str(tuple(...))
            sql_statement = self.convert_decimal_str(sql_statement)
            sql_statement += f') AS TempTable {temp_columns}'

            # SET clause: target.col = source.col, ... (trailing comma trimmed)
            col_update_set = ''
            for col in update_list:
                col_update_set += f' target.{col} = source.{col},'
            col_update_set = col_update_set[:-1]

            # Change-detection clause: only update rows where a value differs (trailing OR trimmed)
            col_difference_check = ''
            for col in update_list:
                col_difference_check += f' target.{col} <> source.{col} OR'
            col_difference_check = col_difference_check[:-2]

            # JOIN condition built from on_list (trailing AND trimmed)
            col_join_on = ''
            for col in on_list:
                col_join_on += f' source.{col} = target.{col} AND'
            col_join_on = col_join_on[:-3]

            sql_statement += f'UPDATE target SET {col_update_set} '

            if len(output) > 0:
                sql_statement += f" OUTPUT {','.join(sql_output)} "

            sql_statement += f'''FROM {schema}.{table} target
            JOIN {temp_table} as source
            ON {col_join_on}
            WHERE {col_difference_check}
            '''

            sql_statement += f' DROP TABLE IF EXISTS {temp_table} '

            if print_sql:
                print(sql_statement)

            # executing statement
            if len(sql_statement) > 0:
                if len(output) > 0:
                    cursor.execute(sql_statement)
                    # Skip the result set of the temp-table SELECT INTO before reading OUTPUT rows
                    cursor.nextset()
                    results = pd.concat([results, pd.DataFrame.from_records(cursor.fetchall(), columns=output)])
                else:
                    cursor.execute(sql_statement)

            self.con.commit()

        if len(output) > 0:
            return results.reset_index(drop=True)

    except Exception:
        # Re-raise with the full traceback text so callers see the original failure point
        raise Exception(traceback.format_exc())

    finally:
        if cursor:
            cursor.close()
        self.close_connection()
|
|
740
|
+
|
|
741
|
+
def merge(self, data, staging_schema, staging_table, sp_schema, sp_name, truncate=False, chunk=1000,
          commit_as_transaction=True):
    """ Merge data from Staging table using a Stored Procedure. It requires a table in SQL which will store the
    Staging data. The method will work as follows:
    1.- Truncate the staging table according to the truncate parameter
    2.- Insert the data into the staging table
    3.- Execute a stored procedure to merge the staging table with the destination table
    -----------------------------
    df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1, 2]})
    con_.merge(df, staging_schema, staging_table, sp_schema, sp_name, truncate=True)
    -----------------------------
    :param data: DataFrame to insert in the staging table
    :param staging_schema: Staging table schema
    :param staging_table: Staging table name
    :param sp_schema: Stored Procedure schema
    :param sp_name: Stored Procedure name
    :param truncate: Indicate whether the staging table has to be truncated or not
    :param chunk: Indicate how many rows will be uploaded at once
    :param commit_as_transaction: Indicate whether the connection will be done using the autocommit option or not
    :return: None
    """
    if data is None:
        # no data to upload
        # NOTE(review): the error is *returned*, not raised — callers must check the result
        return ValueError("The data provided is invalid!")
    cursor = None
    self.open_write_connection(commit_as_transaction)
    try:
        cursor = self.con.cursor()
        # Truncate Staging table if needed
        if truncate:
            trunc_insert = """TRUNCATE TABLE [%s].[%s]""" % (staging_schema, staging_table)
            cursor.execute(trunc_insert)
            self.con.commit()
        # Convert category columns to string
        cat_cols = data.columns[(data.dtypes == 'category').values].to_list()
        data[cat_cols] = data[cat_cols].astype(str)
        # Deal with null values and apostrophes (') — ~~ is a placeholder restored to '' later
        data = data.replace("'NULL'", "NULL")
        data = data.replace("'", "~~", regex=True)
        data = data.fillna("null")
        # Insert in Staging Table; column tuple rendered without quotes: (col1, col2)
        records = [tuple(x) for x in data.values]
        insert_ = """INSERT INTO [%s].[%s] """ % (staging_schema, staging_table)
        insert_ = insert_ + str(tuple(data.columns.values)).replace("\'", "") + """ VALUES """
        for batch in self._chunker(records, chunk):
            rows = str(batch).strip('[]').replace("~~", "''")
            rows = rows.replace("'NULL'", "NULL").replace("'null'", 'null')
            string = insert_ + rows
            # Strip python Decimal('x') wrappers left by str(tuple(...))
            string = self.convert_decimal_str(string)
            cursor.execute(string)
            # Commit per batch so a late failure keeps earlier batches
            self.con.commit()
        # Execute Stored Procedure that merges staging into the destination table
        exec_sp = """EXECUTE [%s].[%s]""" % (sp_schema, sp_name)
        cursor.execute(exec_sp)
        self.con.commit()
    except Exception:
        # Re-raise with the full traceback text so callers see the original failure point
        raise Exception(traceback.format_exc())
    finally:
        if cursor:
            cursor.close()
        self.close_connection()
|
|
802
|
+
|
|
803
|
+
def merge_into(self, data, schema, table, on_list, update_check=False, update_set=None, bool_cols=None,
               identity=False, print_sql=False, nullable=False):
    """
    This method is equivalent to the 'merge into' of T-sql. Schema and table defines the Target, while data is the
    Source. Please refer to below schema for more arguments use clarifications.
    Aspects to take into consideration:
    1.- This method will not work properly if data contains duplicates. It is not relevant if the target contains
        duplicates because DISTINCT is used to call the table.
    2.- When having booleans in the dataset you have to pay attention because pandas get bool from sql server as
        [True, False], instead of [0,1]. The method need data from type boolean to be inserted as [0, 1].
    3.- When dealing with datetime columns a similar problem arises. time_format is a dict that contains as keys
        the name of a date column and as values the format that the columns has to have.
    -------------------------
    MERGE INTO [SCHEMA].[TABLE] AS TARGET
    USING ( data ) AS SOURCE
    ON TARGET.on_list[i] = SOURCE.on_list[i] ...
    WHEN MATCHED AND ( TARGET.update_check[i] <> SOURCE.update_check[i] OR ... )
        UPDATE SET TARGET.update_set[i] = SOURCE.update_set[i], ...
    WHEN NOT MATCHED BY TARGET THEN
        INSERT ( all columns from [SCHEMA].[TABLE] ) VALUES ( all columns from data )
    -------------------------------
    :param data: DataFrame containing the data to upload/update
    :param schema: Schema of the table in which the data will be uploaded
    :param table: Table in which the data will be uploaded
    :param on_list: list of columns to apply the on clause
    :param update_check: list of columns to do the check
    :param update_set: list of columns to update
    :param bool_cols: list of columns gathering boolean types
    :param identity: Indicate whether the identity columns will be inserted or not, only make sense when the table
        in its definition has it. Its a boolean.
    :param print_sql: boolean to indicate that you want the sql_statement to be printed on the console
    :param nullable: Used for the boolean_mapping_data_types to indicate which boolean column values to convert
    :return: None
    """
    if data is None:
        # no data to upload
        # NOTE(review): the error is *returned*, not raised — callers must check the result
        return ValueError("The data provided is invalid!")

    if data.shape[0] != data.drop_duplicates().shape[0]:
        return TypeError("There are duplicates values in your dataframe, it will not work properly on "
                         "pd.concat().drop_duplicates()")

    # if update_set has values assigned, update check has to have values assigned
    # NOTE(review): update_check defaults to False, so `update_check is None` is only True when a caller
    # passes None explicitly — confirm the intended default/sentinel with the maintainers
    if update_set is not None:
        if update_check is None:
            return ValueError("Please, to use update_set assigned values to update_check")
    else:
        update_set = update_check

    # Mapping boolean columns
    if bool_cols is not None:
        for col in bool_cols:
            data[col] = data[col].astype(bool)

    # Mapping date and boolean type for SQL
    data = self.date_mapping_data_types(data)
    data = self.boolean_mapping_data_types(data, nullable)

    try:
        # call the table from the server (DISTINCT so target-side duplicates don't matter)
        data_table = self.query("""SELECT DISTINCT * FROM [%s].[%s]""" % (schema, table))

        if data_table.shape[0] == 0:
            print("The destination table is empty so all the data will be inserted")
            self.insert(data, schema, table)

        else:
            # Align target dtypes with the source frame so merge() keys compare correctly;
            # int columns containing NULLs must become float (pandas has no NaN-capable plain int)
            for data_col in data.columns:
                if ("int" in str(type(data_table[data_col].iloc[0]))) & (
                        data_table[data_col].isnull().sum() > 0):
                    data_table[data_col] = data_table[data_col].astype(float)
                else:
                    data_table[data_col] = data_table[data_col].astype(type(data[data_col].iloc[0]))

            coincidence = pd.DataFrame()
            if data_table.shape[0] > 0:
                # Normalise booleans coming back from SQL Server as True/False into 1/0
                for col in data_table.columns.values.tolist():
                    if isinstance(data_table.loc[0, col], bool):
                        data_table[col] = data_table[col].apply(
                            lambda x: 1 if x is True else 0 if x is False else np.NaN)
                if bool_cols is not None:
                    for col in bool_cols:
                        data_table[col] = data_table[col].astype(bool)
                # join the input table with the one in the database
                coincidence = data.merge(data_table[on_list], how='inner', on=on_list)
                # WHEN MATCHED AND ... UPDATE SET
                if update_check:
                    # Rows matching on keys AND check columns are unchanged; the symmetric
                    # difference (concat + drop_duplicates keep=False) isolates changed rows
                    coincidence2 = coincidence.merge(data_table[list(set(on_list + update_check))],
                                                     how='inner',
                                                     on=list(set(on_list + update_check)))
                    data_update = pd.concat([coincidence, coincidence2], ignore_index=True)
                    data_update.drop_duplicates(keep=False, inplace=True)
                    if data_update.shape[0] > 0:
                        self.update(data_update, list(set(update_set + update_check)), on_list, schema, table,
                                    print_sql=print_sql)

            # WHEN NOT MATCHED BY TARGET THEN... INSERT
            # (rows in data but not in coincidence — again via symmetric difference)
            data_insert = pd.concat([data, coincidence], ignore_index=True)
            data_insert.drop_duplicates(keep=False, inplace=True)
            if data_insert.shape[0] > 0:
                self.insert(data_insert, schema, table, identity=identity, print_sql=print_sql)

    except Exception:
        # Re-raise with the full traceback text so callers see the original failure point
        raise Exception(traceback.format_exc())
|
|
933
|
+
|
|
934
|
+
@staticmethod
|
|
935
|
+
def date_mapping_data_types(data):
|
|
936
|
+
"""
|
|
937
|
+
Map datetime and boolean variables so they can be inserted in SQL
|
|
938
|
+
:param data: DataFrame containing the variables to map
|
|
939
|
+
:return: The mapped DataFrame
|
|
940
|
+
"""
|
|
941
|
+
first_index = data.index[0]
|
|
942
|
+
date_col = data.columns[
|
|
943
|
+
[('date' in str(type(data.loc[first_index, col]))) | ('timestamp' in str(type(data.loc[first_index, col])))
|
|
944
|
+
for col in data.columns]]
|
|
945
|
+
if len(date_col) > 0:
|
|
946
|
+
for col in date_col:
|
|
947
|
+
data[col] = pd.to_datetime(data[col])
|
|
948
|
+
if data[col].dtypes == 'O':
|
|
949
|
+
data[col] = data[col].dt.strftime('%Y-%m-%d')
|
|
950
|
+
else:
|
|
951
|
+
data[col] = data[col].dt.strftime('%Y-%m-%d %H:%M:%S')
|
|
952
|
+
data.loc[data[col] == 'NaT', col] = np.nan
|
|
953
|
+
|
|
954
|
+
return data
|
|
955
|
+
|
|
956
|
+
@staticmethod
|
|
957
|
+
def boolean_mapping_data_types(data, nullable=False):
|
|
958
|
+
"""
|
|
959
|
+
Map datetime and boolean variables so they can be inserted in SQL
|
|
960
|
+
:param data: DataFrame containing the variables to map
|
|
961
|
+
:return: The mapped DataFrame
|
|
962
|
+
:param nullable: Determine if you want to convert null values within boolean columns to boolean format or not
|
|
963
|
+
"""
|
|
964
|
+
first_index = data.index[0]
|
|
965
|
+
bool_col = data.columns[
|
|
966
|
+
[('bool' in str(type(data.loc[first_index, col]))) | ('object' in str(type(data.loc[first_index, col]))) for
|
|
967
|
+
col in data.columns]]
|
|
968
|
+
if len(bool_col) > 0:
|
|
969
|
+
for col in bool_col:
|
|
970
|
+
if nullable:
|
|
971
|
+
bool_not_null = data[data[col].notna()]
|
|
972
|
+
if bool_not_null.shape[0] > 0:
|
|
973
|
+
for iindex in bool_not_null.index:
|
|
974
|
+
data.at[iindex, col] = int(data.loc[iindex, col])
|
|
975
|
+
else:
|
|
976
|
+
data[col] = data[col].apply(lambda x: 1 if x is True else 0)
|
|
977
|
+
|
|
978
|
+
return data
|
|
979
|
+
|
|
980
|
+
@staticmethod
|
|
981
|
+
def id_next(con_db, table, schema, id_col, print_sql=False):
|
|
982
|
+
"""
|
|
983
|
+
This static method returns the next id to be inserted into a table for sql_server
|
|
984
|
+
:param con_db: class to connect to a sql server dabatase
|
|
985
|
+
:param table: name of the table
|
|
986
|
+
:param schema: name of the schema
|
|
987
|
+
:param id_col: name of the id column
|
|
988
|
+
:param print_sql: bool to indicate if you want sql statement to be print on Python Console
|
|
989
|
+
:return: Max ID + 1 for id_col
|
|
990
|
+
"""
|
|
991
|
+
sql_statement = ("SELECT CASE WHEN MAX(%s) IS NULL THEN 1 ELSE MAX(%s) + 1 END AS [Id] FROM [%s].[%s]" % (
|
|
992
|
+
id_col, id_col, schema, table))
|
|
993
|
+
if print_sql:
|
|
994
|
+
print(sql_statement)
|
|
995
|
+
df = con_db.query(sql_statement)
|
|
996
|
+
id_ = df.loc[0, 'Id']
|
|
997
|
+
return id_
|
|
998
|
+
|
|
999
|
+
@staticmethod
|
|
1000
|
+
def convert_decimal_str(string):
|
|
1001
|
+
""" Method to parse the Decimal type in python
|
|
1002
|
+
:param string: String variable to parse
|
|
1003
|
+
"""
|
|
1004
|
+
string = re.sub("'\)(?!(,[ ]+\())(?=([^$]))", "", string)
|
|
1005
|
+
return re.sub("Decimal\('", "", string)
|
|
1006
|
+
|
|
1007
|
+
@staticmethod
|
|
1008
|
+
def infer_datetime(data, infer_datetime_format):
|
|
1009
|
+
""" Method to infer datetime columns and format them as string
|
|
1010
|
+
:param data: DataFrame to parse
|
|
1011
|
+
:param infer_datetime_format: format to be used for the datetime columns
|
|
1012
|
+
"""
|
|
1013
|
+
for col in data.select_dtypes(include=['datetime64']).columns:
|
|
1014
|
+
data[col] = pd.to_datetime(data[col]).dt.strftime(infer_datetime_format)
|
|
1015
|
+
|
|
1016
|
+
return data
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
berryworld/__init__.py,sha256
|
|
1
|
+
berryworld/__init__.py,sha256=-441T8nq6t3cSYeHweND8HAttfEcBfqij3PdbhhvoUw,1163
|
|
2
2
|
berryworld/aks_logs.py,sha256=Gb2_cokiZbEX01Yoptd0MxpDociaug-GrXdwliyxFBo,2293
|
|
3
3
|
berryworld/allocation_solver.py,sha256=asFtaCAze6-eHUGWXA0kAp67UBS-Upj1KKdrVLj_ttQ,8513
|
|
4
4
|
berryworld/app_logs.py,sha256=MKzKPYd3JuPfOQNAapIgaeZeFHw1z_w2mbn9I6QCADE,4180
|
|
@@ -19,18 +19,19 @@ berryworld/power_automate.py,sha256=V86QEGG9H36DrDvod9Q6yp8OUu307hfYcXJhw06pYrA,
|
|
|
19
19
|
berryworld/sharepoint_con.py,sha256=TuH-Vxk1VxjTi7x80KFssf_J8YPLRXpV27RBaFZi37U,22254
|
|
20
20
|
berryworld/snowflake_conn.py,sha256=go5ZJjnhz5SkG83B0G0XZSwKgU6tg7AFTBso59oRG5M,2434
|
|
21
21
|
berryworld/sql_conn.py,sha256=tYKgD8ja7NQuvLB1WBjdsJbPcm3eX1Y76QPTEgx8R8Q,47564
|
|
22
|
+
berryworld/sql_connenction.py,sha256=CsenHXXe1fSKOgXI5vAdWLmFMkiAnfXAaOVjFD3UMtk,48361
|
|
22
23
|
berryworld/teams_logging.py,sha256=8NwXyWr4fLj7W6GzAm2nRQCGFDxibQpAHDHHD24FrP8,6997
|
|
23
24
|
berryworld/transportation_solver.py,sha256=tNc1JJk71azIBccdWVHbqcvXWhalOdKffv6HmBD6tG0,5014
|
|
24
25
|
berryworld/verify_keys.py,sha256=X4Nuz3o0XbRDYofbJGvxIDeN5gfWj19PN7lhO6T3hR8,4356
|
|
25
26
|
berryworld/vivantio.py,sha256=QfZo0UKqkzVRg_LyiwivNd3aEup4TH57x4KxLZkCJwc,10627
|
|
26
27
|
berryworld/vivantio_logging.py,sha256=ciy7gA4u3FrgUIpEBnMgocbNPp6jcu9TPoy-kLcrTZU,5736
|
|
27
28
|
berryworld/xml_parser.py,sha256=HWD71NaTN3DaIOGT6Wzxs4CEsroFhGQwe9iPLIL80Co,957
|
|
28
|
-
berryworld-1.0.0.
|
|
29
|
+
berryworld-1.0.0.192676.dist-info/licenses/LICENSE,sha256=vtkVCJM6E2af2gnsi2XxKPr4WY-uIbvzVLXieFND0UU,1074
|
|
29
30
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
31
|
tests/test_allocation_config.py,sha256=e12l6fE9U57eSPS35g6ekJ_hol7-RHg89JV60_m1BlE,4633
|
|
31
32
|
tests/test_handy_mix_config.py,sha256=Un56mz9KJmdn4K4OwzHAHLSRzDU1Xv2nFrONNuzOG04,2594
|
|
32
33
|
tests/test_xml_parser.py,sha256=3QTlhFEd6KbK6nRFKZnc35tad6wqukTbe4QrFi8mr_8,859
|
|
33
|
-
berryworld-1.0.0.
|
|
34
|
-
berryworld-1.0.0.
|
|
35
|
-
berryworld-1.0.0.
|
|
36
|
-
berryworld-1.0.0.
|
|
34
|
+
berryworld-1.0.0.192676.dist-info/METADATA,sha256=qBhrjmW4Enunr7bxkhNmYgY6bKAYYVpC0V-Vomoj6kc,1362
|
|
35
|
+
berryworld-1.0.0.192676.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
36
|
+
berryworld-1.0.0.192676.dist-info/top_level.txt,sha256=GIZ5qy-P5oxfEH755vA1IMFeTVdX3-40JxMe6nOe5I8,17
|
|
37
|
+
berryworld-1.0.0.192676.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|